From: Scott Gasch Date: Thu, 5 Aug 2021 21:56:34 +0000 (-0700) Subject: Adds profanity filter, fixes bugs. X-Git-Url: https://wannabe.guru.org/gitweb/?a=commitdiff_plain;h=b843703134a166013518c707fa5a77373f1bf0bf;p=python_utils.git Adds profanity filter, fixes bugs. --- diff --git a/dateparse/dateparse_utils.py b/dateparse/dateparse_utils.py index 4decb81..cd8bc35 100755 --- a/dateparse/dateparse_utils.py +++ b/dateparse/dateparse_utils.py @@ -385,7 +385,7 @@ class DateParser(dateparse_utilsListener): tz = pytz.timezone(txt) if tz is not None: return tz - except: + except Exception: pass # Try dateutil @@ -393,7 +393,7 @@ class DateParser(dateparse_utilsListener): tz = dateutil.tz.gettz(txt) if tz is not None: return tz - except: + except Exception: pass # Try constructing an offset in seconds @@ -406,7 +406,7 @@ class DateParser(dateparse_utilsListener): offset = sign * (hour * 60 * 60) + sign * (minute * 60) tzoffset = dateutil.tz.tzoffset(txt, offset) return tzoffset - except: + except Exception: pass return None @@ -574,7 +574,7 @@ class DateParser(dateparse_utilsListener): unit = self._figure_out_date_unit( ctx.deltaUnit().getText().lower() ) - except: + except Exception: raise ParseException(f'Invalid Delta +/-: {ctx.getText()}') else: self.context['delta_int'] = n @@ -585,7 +585,7 @@ class DateParser(dateparse_utilsListener): ) -> None: try: unit = self._figure_out_date_unit(ctx.getText().lower()) - except: + except Exception: raise ParseException(f'Bad delta unit: {ctx.getText()}') else: self.context['delta_unit'] = unit @@ -595,7 +595,7 @@ class DateParser(dateparse_utilsListener): ) -> None: try: txt = ctx.getText().lower() - except: + except Exception: raise ParseException(f'Bad next/last: {ctx.getText()}') if ( 'month' in self.context or @@ -630,7 +630,7 @@ class DateParser(dateparse_utilsListener): ctx.deltaTimeUnit().getText().lower() ) self.context['time_delta_unit'] = unit - except: + except Exception: raise ParseException(f'Bad delta unit: {ctx.getText()}') if 'time_delta_before_after' not in self.context: raise ParseException( @@ -654,7 +654,7 @@ class DateParser(dateparse_utilsListener): ] = TimeUnit.MINUTES else: raise ParseException(f'Bad time fraction {ctx.getText()}') - except: + except Exception: raise ParseException(f'Bad time fraction {ctx.getText()}') def exitDeltaBeforeAfter( @@ -662,7 +662,7 @@ class DateParser(dateparse_utilsListener): ) -> None: try: txt = ctx.getText().lower() - except: + except Exception: raise ParseException(f'Bad delta before|after: {ctx.getText()}') else: self.context['delta_before_after'] = txt @@ -672,7 +672,7 @@ class DateParser(dateparse_utilsListener): ) -> None: try: txt = ctx.getText().lower() - except: + except Exception: raise ParseException(f'Bad delta before|after: {ctx.getText()}') else: self.context['time_delta_before_after'] = txt @@ -732,7 +732,7 @@ class DateParser(dateparse_utilsListener): self.context['month'] = month self.context['day'] = 1 self.main_type = DateParser.PARSE_TYPE_BASE_AND_OFFSET_EXPR - except: + except Exception: raise ParseException( f'Invalid nthWeekday expression: {ctx.getText()}' ) @@ -746,7 +746,7 @@ class DateParser(dateparse_utilsListener): def exitNth(self, ctx: dateparse_utilsParser.NthContext) -> None: try: i = self._get_int(ctx.getText()) - except: + except Exception: raise ParseException(f'Bad nth expression: {ctx.getText()}') else: self.context['nth'] = i @@ -764,7 +764,7 @@ class DateParser(dateparse_utilsListener): raise ParseException( f'Bad first|last expression: {ctx.getText()}' ) - except: + except Exception: raise ParseException(f'Bad first|last expression: {ctx.getText()}') else: self.context['nth'] = txt @@ -773,7 +773,7 @@ class DateParser(dateparse_utilsListener): try: dow = ctx.getText().lower()[:3] dow = self.day_name_to_number.get(dow, None) - except: + except Exception: raise ParseException('Bad day of week') else: self.context['dow'] = dow @@ -797,7 +797,7 @@ class DateParser(dateparse_utilsListener): raise ParseException( f'Bad dayOfMonth expression: {ctx.getText()}' ) - except: + except Exception: raise ParseException(f'Bad dayOfMonth expression: {ctx.getText()}') self.context['day'] = day @@ -814,7 +814,7 @@ class DateParser(dateparse_utilsListener): raise ParseException( f'Bad monthName expression: {ctx.getText()}' ) - except: + except Exception: raise ParseException(f'Bad monthName expression: {ctx.getText()}') else: self.context['month'] = month @@ -828,7 +828,7 @@ class DateParser(dateparse_utilsListener): raise ParseException( f'Bad monthNumber expression: {ctx.getText()}' ) - except: + except Exception: raise ParseException( f'Bad monthNumber expression: {ctx.getText()}' ) @@ -840,7 +840,7 @@ class DateParser(dateparse_utilsListener): year = self._get_int(ctx.getText()) if year < 1: raise ParseException(f'Bad year expression: {ctx.getText()}') - except: + except Exception: raise ParseException(f'Bad year expression: {ctx.getText()}') else: self.context['year'] = year @@ -851,7 +851,7 @@ class DateParser(dateparse_utilsListener): try: special = ctx.specialDate().getText().lower() self.context['special'] = special - except: + except Exception: raise ParseException( f'Bad specialDate expression: {ctx.specialDate().getText()}' ) @@ -864,7 +864,7 @@ class DateParser(dateparse_utilsListener): self.context['special_next_last'] = 'next' elif mod.LAST() is not None: self.context['special_next_last'] = 'last' - except: + except Exception: raise ParseException( f'Bad specialDateNextLast expression: {ctx.getText()}' ) @@ -877,7 +877,7 @@ class DateParser(dateparse_utilsListener): count = self._get_int(ctx.unsignedInt().getText()) unit = ctx.deltaUnit().getText().lower() ago_from_now = ctx.AGO_FROM_NOW().getText() - except: + except Exception: raise ParseException( f'Bad NFoosFromTodayAgoExpr: {ctx.getText()}' ) @@ -911,7 +911,7 @@ class DateParser(dateparse_utilsListener): f'Bad This/Next/Last modifier: {mod}' ) unit = ctx.deltaUnit().getText().lower() - except: + except Exception: raise ParseException( f'Bad DeltaRelativeToTodayExpr: {ctx.getText()}' ) @@ -929,7 +929,7 @@ class DateParser(dateparse_utilsListener): ) -> None: try: txt = ctx.specialTime().getText().lower() - except: + except Exception: raise ParseException( f'Bad special time expression: {ctx.getText()}' ) @@ -950,7 +950,7 @@ class DateParser(dateparse_utilsListener): try: tz = ctx.tzExpr().getText() self.context['tz'] = self._parse_tz(tz) - except: + except Exception: pass def exitTwelveHourTimeExpr( @@ -961,14 +961,14 @@ class DateParser(dateparse_utilsListener): while not hour[-1].isdigit(): hour = hour[:-1] hour = self._get_int(hour) - except: + except Exception: raise ParseException(f'Bad hour: {ctx.hour().getText()}') if hour <= 0 or hour > 12: raise ParseException(f'Bad hour (out of range): {hour}') try: minute = self._get_int(ctx.minute().getText()) - except: + except Exception: minute = 0 if minute < 0 or minute > 59: raise ParseException(f'Bad minute (out of range): {minute}') @@ -976,7 +976,7 @@ class DateParser(dateparse_utilsListener): try: seconds = self._get_int(ctx.second().getText()) - except: + except Exception: seconds = 0 if seconds < 0 or seconds > 59: raise ParseException(f'Bad second (out of range): {seconds}') @@ -984,7 +984,7 @@ class DateParser(dateparse_utilsListener): try: micros = self._get_int(ctx.micros().getText()) - except: + except Exception: micros = 0 if micros < 0 or micros > 1000000: raise ParseException(f'Bad micros (out of range): {micros}') @@ -992,7 +992,7 @@ class DateParser(dateparse_utilsListener): try: ampm = ctx.ampm().getText() - except: + except Exception: raise ParseException(f'Bad ampm: {ctx.ampm().getText()}') if hour == 12: hour = 0 @@ -1003,7 +1003,7 @@ class DateParser(dateparse_utilsListener): try: tz = ctx.tzExpr().getText() self.context['tz'] = self._parse_tz(tz) - except: + except Exception: pass def exitTwentyFourHourTimeExpr( @@ -1014,7 +1014,7 @@ class DateParser(dateparse_utilsListener): while not hour[-1].isdigit(): hour = hour[:-1] hour = self._get_int(hour) - except: + except Exception: raise ParseException(f'Bad hour: {ctx.hour().getText()}') if hour < 0 or hour > 23: raise ParseException(f'Bad hour (out of range): {hour}') @@ -1022,7 +1022,7 @@ class DateParser(dateparse_utilsListener): try: minute = self._get_int(ctx.minute().getText()) - except: + except Exception: minute = 0 if minute < 0 or minute > 59: raise ParseException(f'Bad minute (out of range): {ctx.getText()}') @@ -1030,7 +1030,7 @@ class DateParser(dateparse_utilsListener): try: seconds = self._get_int(ctx.second().getText()) - except: + except Exception: seconds = 0 if seconds < 0 or seconds > 59: raise ParseException(f'Bad second (out of range): {ctx.getText()}') @@ -1038,7 +1038,7 @@ class DateParser(dateparse_utilsListener): try: micros = self._get_int(ctx.micros().getText()) - except: + except Exception: micros = 0 if micros < 0 or micros >= 1000000: raise ParseException(f'Bad micros (out of range): {ctx.getText()}') @@ -1047,7 +1047,7 @@ class DateParser(dateparse_utilsListener): try: tz = ctx.tzExpr().getText() self.context['tz'] = self._parse_tz(tz) - except: + except Exception: pass diff --git a/dict_utils.py b/dict_utils.py index 292b933..74e8fda 100644 --- a/dict_utils.py +++ b/dict_utils.py @@ -3,6 +3,7 @@ from itertools import islice from typing import Any, Callable, Dict, Iterator, Tuple + def init_or_inc( d: Dict[Any, Any], key: Any, diff --git a/lockfile.py b/lockfile.py index 770beaa..34279ba 100644 --- a/lockfile.py +++ b/lockfile.py @@ -123,9 +123,10 @@ class LockFile(object): cmd = self.override_command else: cmd = ' '.join(sys.argv) + print(cmd) contents = LockFileContents( pid = os.getpid(), - cmd, + commandline = cmd, expiration_timestamp = self.expiration_timestamp, ) return json.dumps(contents.__dict__) diff --git a/logging_utils.py b/logging_utils.py index 0c7d193..a0131b1 100644 --- a/logging_utils.py +++ b/logging_utils.py @@ -269,7 +269,7 @@ class OutputMultiplexer(object): open(filename, 'wb', buffering=0) for filename in filenames ] else: - if self.destination_bitv & OutputMultiplexer.FILENAMES: + if destination_bitv & OutputMultiplexer.FILENAMES: raise ValueError( "Filenames argument is required if bitv & FILENAMES" ) @@ -278,7 +278,7 @@ class OutputMultiplexer(object): if handles is not None: self.h = [handle for handle in handles] else: - if self.destination_bitv & OutputMultiplexer.FILEHANDLES: + if destination_bitv & OutputMultiplexer.Destination.FILEHANDLES: raise ValueError( "Handle argument is required if bitv & FILEHANDLES" ) diff --git a/presence.py b/presence.py old mode 100644 new mode 100755 index 682855d..c697124 --- a/presence.py +++ b/presence.py @@ -8,6 +8,7 @@ import re from typing import Dict, List import argparse_utils +import bootstrap import config logger = logging.getLogger(__name__) @@ -111,6 +112,7 @@ class PresenceDetection(object): if "cabin_" in line: continue if location == Location.CABIN: + logger.debug('Cabin count: {cabin_count}') cabin_count += 1 try: (mac, count, ip_name, mfg, ts) = line.split(",") @@ -128,6 +130,7 @@ class PresenceDetection(object): name = match.group(2) self.names_by_mac[mac] = name if cabin_count > 0: + logger.debug('Weird MAC at the cabin') self.weird_mac_at_cabin = True def is_anyone_in_location_now(self, location: Location) -> bool: @@ -152,15 +155,18 @@ class PresenceDetection(object): tiebreaks: Dict[Location, datetime.datetime] = {} credit = 10000 for mac in self.devices_by_person[name]: + logger.debug(f'Looking for {name}... check for mac {mac}') if mac not in self.names_by_mac: continue for location in self.location_ts_by_mac: if mac in self.location_ts_by_mac[location]: ts = (self.location_ts_by_mac[location])[mac] + logger.debug(f'I saw {mac} at {location} at {ts}') tiebreaks[location] = ts location = dict_utils.key_with_min_value(tiebreaks) v = votes.get(location, 0) votes[location] = v + credit + logger.debug('{name}: {location} gets {credit} votes.') credit = int( credit * 0.667 ) # Note: list most important devices first @@ -170,3 +176,17 @@ class PresenceDetection(object): item = dict_utils.item_with_max_value(votes) return item[0] return Location.UNKNOWN + + +@bootstrap.initialize +def main() -> None: + p = PresenceDetection() + for person in Person: + print(f'{person} => {p.where_is_person_now(person)}') + print() + for location in Location: + print(f'{location} => {p.is_anyone_in_location_now(location)}') + + +if __name__ == '__main__': + main() diff --git a/profanity_filter.py b/profanity_filter.py new file mode 100755 index 0000000..e1b4743 --- /dev/null +++ b/profanity_filter.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python3 + +import logging +import random +import string +import sys + +import nltk +from nltk.stem import PorterStemmer + +import string_utils + + +logger = logging.getLogger(__name__) + + +class ProfanityFilter(object): + def __init__(self): + self.bad_words = set([ + 'acrotomophilia', + 'anal', + 'analingus', + 'anally', + 'anilingus', + 'anus', + 'arsehol', + 'arsehole', + 'ass', + 'asses', + 'asshol', + 'asshole', + 'assmunch', + 'auto erot', + 'auto erotic', + 'autoerotic', + 'babeland', + 'babi batter', + 'baby batter', + 'ball gag', + 'ball gravi', + 'ball gravy', + 'ball kick', + 'ball kicking', + 'ball lick', + 'ball licking', + 'ball sack', + 'ball suck', + 'ball sucking', + 'ball zack', + 'bangbro', + 'bangbros', + 'bare legal', + 'bareback', + 'barely legal', + 'barenak', + 'barenaked', + 'bastardo', + 'bastinado', + 'bbc', + 'bbw', + 'bdsm', + 'beaver cleaver', + 'beaver lip', + 'beaver lips', + 'bestial', + 'bestiality', + 'bi curiou', + 'bi curious', + 'big black', + 'big breasts', + 'big knocker', + 'big knockers', + 'big tit', + 'big tits', + 'bimbo', + 'birdlock', + 'bitch', + 'bitches', + 'black cock', + 'blond action', + 'blond on blond', + 'blonde action', + 'blow j', + 'blow job', + 'blow my', + 'blow me', + 'blow ourselv', + 'blow ourselves', + 'blow your load', + 'blue waffl', + 'blue waffle', + 'blumpkin', + 'bollock', + 'bollocks', + 'bondag', + 'bondage', + 'boner', + 'boob', + 'boobs', + 'booti call', + 'booty call', + 'breast', + 'breasts', + 'brown shower', + 'brown showers', + 'brunett action', + 'brunette action', + 'bukkak', + 'bukkake', + 'bulldyk', + 'bulldyke', + 'bullet vibe', + 'bullshit', + 'bung hole', + 'bunghol', + 'bunghole', + 'busti', + 'busty', + 'butt', + 'buttcheek', + 'buttcheeks', + 'butthol', + 'butthole', + 'camel toe', + 'camgirl', + 'camslut', + 'camwhore', + 'carpet muncher', + 'carpetmuncher', + 'chocol rosebud', + 'chocolate rosebuds', + 'circlejerk', + 'chink', + 'cleveland steamer', + 'clit', + 'clitori', + 'clitoris', + 'clover clamp', + 'clover clamps', + 'clusterfuck', + 'cock', + 'cocks', + 'coprolagnia', + 'coprophilia', + 'cornhol', + 'cornhole', + 'cream pie', + 'creampi', + 'creampie', + 'cum', + 'cumming', + 'cunnilingu', + 'cunnilingus', + 'cunt', + 'damn', + 'darki', + 'darkie', + 'date rape', + 'daterap', + 'daterape', + 'deep throat', + 'deepthroat', + 'dick', + 'dildo', + 'dirti pillow', + 'dirti sanchez', + 'dirty pillow', + 'dirty sanchez', + 'dog style', + 'doggi style', + 'doggie style', + 'doggiestyl', + 'doggiestyle', + 'doggystyle', + 'dolcett', + 'domination', + 'dominatrix', + 'domm', + 'dommes', + 'donkey punch', + 'doubl dick', + 'doubl dong', + 'doubl penetr', + 'double dick', + 'double dong', + 'double penetration', + 'dp action', + 'dtf', + 'eat my ass', + 'ecchi', + 'ejacul', + 'erection', + 'erotic', + 'erotism', + 'escort', + 'ethical slut', + 'eunuch', + 'faggot', + 'fecal', + 'felch', + 'fellatio', + 'feltch', + 'female squirting', + 'femdom', + 'figging', + 'fingered', + 'fingering', + 'fingers', + 'fisted', + 'fisting', + 'fists', + 'foot fetish', + 'footjob', + 'frotting', + 'fuck button', + 'fuck', + 'fucked', + 'fucker', + 'fuckhead', + 'fuckin', + 'fucking', + 'fudge packer', + 'fudgepack', + 'fudgepacker', + 'futanari', + 'g spot', + 'g-spot', + 'gang bang', + 'gay sex', + 'gee spot', + 'genital', + 'giant cock', + 'girl gone wild', + 'girl on top', + 'girl on', + 'goatcx', + 'goatse', + 'goddamn', + 'gokkun', + 'golden shower', + 'goo girl', + 'goodpoop', + 'goregasm', + 'grope', + 'group sex', + 'gspot', + 'guro', + 'hand job', + 'handjob', + 'hard core', + 'hardcore', + 'hentai', + 'homoerotic', + 'honkey', + 'hooker', + 'horni', + 'horny', + 'hot chick', + 'how to kill', + 'how to murder', + 'huge fat', + 'humped', + 'humping', + 'humps', + 'incest', + 'intercourse', + 'jack off', + 'jail bait', + 'jailbait', + 'jerk off', + 'jigaboo', + 'jiggaboo', + 'jiggerboo', + 'jizz', + 'jugg', + 'kike', + 'kinbaku', + 'kinkster', + 'kinky', + 'knobbing', + 'leather restraint', + 'lemon party', + 'lolita', + 'lovemaking', + 'make me come', + 'male squirting', + 'masturb', + 'menage a trois', + 'milf', + 'missionary position', + 'motherfuck', + 'mound of venus', + 'mr hand', + 'muff diver', + 'muffdiv', + 'muffdiving', + 'nambla', + 'nawashi', + 'negro', + 'neonazi', + 'nig nog', + 'nigga', + 'nigger', + 'nimphomania', + 'nipple', + 'nip', + 'not safe for', + 'nsfl', + 'nsfw', + 'nude', + 'nudes', + 'nudity', + 'nut sack', + 'nutsack', + 'nympho', + 'nymphomania', + 'octopussy', + 'omorashi', + 'one night stand', + 'orgasm', + 'orgy', + 'paedophil', + 'paedophile', + 'panties', + 'panty', + 'pedobear', + 'pedophil', + 'pedophile', + 'pee', + 'pegging', + 'peni', + 'penis', + 'phone sex', + 'pigfucker', + 'piss pig', + 'piss', + 'pissing', + 'pisspig', + 'playboy', + 'pleasure chest', + 'pole smoker', + 'ponyplay', + 'poof', + 'poop chute', + 'poopchute', + 'porn', + 'pornhub', + 'porno', + 'pornographi', + 'pornography', + 'prince albert', + 'pthc', + 'pube', + 'pussi', + 'pussies', + 'pussy', + 'queaf', + 'queer', + 'raghead', + 'raging boner', + 'rape', + 'raping', + 'rapist', + 'rectum', + 'reverse cowgirl', + 'rimjob', + 'rimming', + 'rosy palm', + 'rusty trombone', + 's & m', + 's&m', + 's+m', + 'sadism', + 'scat', + 'schlong', + 'scissoring', + 'semen', + 'sex', + 'sexi', + 'sexo', + 'sexy', + 'shaved beaver', + 'shaved pussy', + 'shemale', + 'shibari', + 'shit', + 'shota', + 'shrimping', + 'slanteye', + 'slut', + 'smut', + 'snatch', + 'snm', + 'snowballing', + 'sodomi', + 'sodomize', + 'sodomy', + 'spic', + 'spooge', + 'spread legs', + 'squirting', + 'strap on', + 'strapon', + 'strappado', + 'strip club', + 'style doggy', + 'suck', + 'suicide girls', + 'sultry women', + 'swastika', + 'swinger', + 'taint', + 'tainted love', + 'taste my', + 'tea bagging', + 'threesome', + 'throating', + 'tied up', + 'tight white', + 'tit', + 'tits', + 'titti', + 'titties', + 'titty', + 'tongue in', + 'topless', + 'tosser', + 'towelhead', + 'tranny', + 'tribadism', + 'tub girl', + 'tubgirl', + 'tushy', + 'twat', + 'twink', + 'twinki', + 'twinkie', + 'undress', + 'upskirt', + 'urethra play', + 'urophilia', + 'vag', + 'vagina', + 'venus mound', + 'vibrator', + 'violet blue', + 'violet wand', + 'vorarephilia', + 'voyeur', + 'vulva', + 'wank', + 'wet dream', + 'wetback', + 'white power', + 'whore', + 'women rapping', + 'wrapping men', + 'wrinkled starfish', + 'xx', + 'xxx', + 'yaoi', + 'yellow shower', + 'yiffy', + 'zoophilia', + ]) + self.stemmer = PorterStemmer() + + def _normalize(self, text: str) -> str: + result = text.lower() + result = result.replace("_", " ") + for x in string.punctuation: + result = result.replace(x, "") + chunks = [ + self.stemmer.stem(word) for word in nltk.word_tokenize(result) + ] + return ' '.join(chunks) + + def contains_bad_word(self, text: str) -> bool: + words = nltk.word_tokenize(text) + for word in words: + if self.is_bad_word(word): + logger.debug(f'"{word}" is profanity') + return True + + if len(words) > 1: + for bigram in string_utils.ngrams_presplit(words, 2): + if self.is_bad_word(bigram): + logger.debug('"{bigram}" is profanity') + return True + + if len(words) > 2: + for trigram in string_utils.ngrams_presplit(words, 3): + if self.is_bad_word(trigram): + logger.debug('"{trigram}" is profanity') + return True + return False + + def is_bad_word(self, word: str) -> bool: + return ( + word in self.bad_words or + self._normalize(word) in self.bad_words + ) + + def obscure_bad_words(self, text: str) -> str: + + def obscure(word: str): + out = '' + last = '' + for letter in word: + if letter.isspace(): + out += letter + else: + while True: + char = random.choice(['#', '%', '!', '@', '&', '*']) + if last != char: + last = char + out += char + break + return out + + words = nltk.word_tokenize(text) + words.append('') + words.append('') + words.append('') + out = '' + + cursor = 0 + while cursor < len(words) - 3: + word = words[cursor] + bigram = word + ' ' + words[cursor + 1] + trigram = bigram + ' ' + words[cursor + 2] + if self.is_bad_word(trigram): + out += obscure(trigram) + ' ' + cursor += 3 + elif self.is_bad_word(bigram): + out += obscure(bigram) + ' ' + cursor += 2 + elif self.is_bad_word(word): + out += obscure(word) + ' ' + cursor += 1 + else: + out += word + ' ' + cursor += 1 + return out.strip() + + +def main() -> None: + pf = ProfanityFilter() + phrase = ' '.join(sys.argv[1:]) + print(pf.contains_bad_word(phrase)) + print(pf.obscure_bad_words(phrase)) + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/string_utils.py b/string_utils.py index 6fc257d..45cf5aa 100644 --- a/string_utils.py +++ b/string_utils.py @@ -9,7 +9,7 @@ import logging import random import re import string -from typing import Any, Callable, List, Optional +from typing import Any, Callable, Iterable, List, Optional import unicodedata from uuid import uuid4 @@ -963,3 +963,21 @@ def thify(n: int) -> str: return "rd" else: return "th" + + +def ngrams(txt: str, n: int): + words = txt.split() + return ngrams_presplit(words, n) + + +def ngrams_presplit(words: Iterable[str], n: int): + for ngram in zip(*[words[i:] for i in range(n)]): + yield(' '.join(ngram)) + + +def bigrams(txt: str): + return ngrams(txt, 2) + + +def trigrams(txt: str): + return ngrams(txt, 3) diff --git a/tests/profanity_filter_test.py b/tests/profanity_filter_test.py new file mode 100755 index 0000000..5648ad3 --- /dev/null +++ b/tests/profanity_filter_test.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import unittest + +import profanity_filter as pf +import unittest_utils + + +class TestProfanityFilter(unittest.TestCase): + + def test_basic_functionality(self): + p = pf.ProfanityFilter() + self.assertTrue(p.is_bad_word('shit')) + self.assertTrue(p.contains_bad_word('this is another fucking test')) + self.assertTrue(p.contains_bad_word('this is another fuckin test')) + self.assertFalse(p.contains_bad_word('Mary had a little lamb whose fleese was white as snow.')) + + +if __name__ == '__main__': + unittest.main()