Adds profanity filter, fixes bugs.
author Scott Gasch <[email protected]>
Thu, 5 Aug 2021 21:56:34 +0000 (14:56 -0700)
committer Scott Gasch <[email protected]>
Thu, 5 Aug 2021 21:56:34 +0000 (14:56 -0700)
dateparse/dateparse_utils.py
dict_utils.py
lockfile.py
logging_utils.py
presence.py [changed mode: 0644->0755]
profanity_filter.py [new file with mode: 0755]
string_utils.py
tests/profanity_filter_test.py [new file with mode: 0755]

index 4decb81570bb0bbf02f066ec31993b20ea0ce6af..cd8bc3543513d147b030c5303424964aaa49b5b9 100755 (executable)
@@ -385,7 +385,7 @@ class DateParser(dateparse_utilsListener):
             tz = pytz.timezone(txt)
             if tz is not None:
                 return tz
-        except:
+        except Exception:
             pass
 
         # Try dateutil
@@ -393,7 +393,7 @@ class DateParser(dateparse_utilsListener):
             tz = dateutil.tz.gettz(txt)
             if tz is not None:
                 return tz
-        except:
+        except Exception:
             pass
 
         # Try constructing an offset in seconds
@@ -406,7 +406,7 @@ class DateParser(dateparse_utilsListener):
                 offset = sign * (hour * 60 * 60) + sign * (minute * 60)
                 tzoffset = dateutil.tz.tzoffset(txt, offset)
                 return tzoffset
-        except:
+        except Exception:
             pass
         return None
 
@@ -574,7 +574,7 @@ class DateParser(dateparse_utilsListener):
             unit = self._figure_out_date_unit(
                 ctx.deltaUnit().getText().lower()
             )
-        except:
+        except Exception:
             raise ParseException(f'Invalid Delta +/-: {ctx.getText()}')
         else:
             self.context['delta_int'] = n
@@ -585,7 +585,7 @@ class DateParser(dateparse_utilsListener):
     ) -> None:
         try:
             unit = self._figure_out_date_unit(ctx.getText().lower())
-        except:
+        except Exception:
             raise ParseException(f'Bad delta unit: {ctx.getText()}')
         else:
             self.context['delta_unit'] = unit
@@ -595,7 +595,7 @@ class DateParser(dateparse_utilsListener):
     ) -> None:
         try:
             txt = ctx.getText().lower()
-        except:
+        except Exception:
             raise ParseException(f'Bad next/last: {ctx.getText()}')
         if (
                 'month' in self.context or
@@ -630,7 +630,7 @@ class DateParser(dateparse_utilsListener):
                 ctx.deltaTimeUnit().getText().lower()
             )
             self.context['time_delta_unit'] = unit
-        except:
+        except Exception:
             raise ParseException(f'Bad delta unit: {ctx.getText()}')
         if 'time_delta_before_after' not in self.context:
             raise ParseException(
@@ -654,7 +654,7 @@ class DateParser(dateparse_utilsListener):
                 ] = TimeUnit.MINUTES
             else:
                 raise ParseException(f'Bad time fraction {ctx.getText()}')
-        except:
+        except Exception:
             raise ParseException(f'Bad time fraction {ctx.getText()}')
 
     def exitDeltaBeforeAfter(
@@ -662,7 +662,7 @@ class DateParser(dateparse_utilsListener):
     ) -> None:
         try:
             txt = ctx.getText().lower()
-        except:
+        except Exception:
             raise ParseException(f'Bad delta before|after: {ctx.getText()}')
         else:
             self.context['delta_before_after'] = txt
@@ -672,7 +672,7 @@ class DateParser(dateparse_utilsListener):
     ) -> None:
         try:
             txt = ctx.getText().lower()
-        except:
+        except Exception:
             raise ParseException(f'Bad delta before|after: {ctx.getText()}')
         else:
             self.context['time_delta_before_after'] = txt
@@ -732,7 +732,7 @@ class DateParser(dateparse_utilsListener):
                 self.context['month'] = month
                 self.context['day'] = 1
             self.main_type = DateParser.PARSE_TYPE_BASE_AND_OFFSET_EXPR
-        except:
+        except Exception:
             raise ParseException(
                 f'Invalid nthWeekday expression: {ctx.getText()}'
             )
@@ -746,7 +746,7 @@ class DateParser(dateparse_utilsListener):
     def exitNth(self, ctx: dateparse_utilsParser.NthContext) -> None:
         try:
             i = self._get_int(ctx.getText())
-        except:
+        except Exception:
             raise ParseException(f'Bad nth expression: {ctx.getText()}')
         else:
             self.context['nth'] = i
@@ -764,7 +764,7 @@ class DateParser(dateparse_utilsListener):
                 raise ParseException(
                     f'Bad first|last expression: {ctx.getText()}'
                 )
-        except:
+        except Exception:
             raise ParseException(f'Bad first|last expression: {ctx.getText()}')
         else:
             self.context['nth'] = txt
@@ -773,7 +773,7 @@ class DateParser(dateparse_utilsListener):
         try:
             dow = ctx.getText().lower()[:3]
             dow = self.day_name_to_number.get(dow, None)
-        except:
+        except Exception:
             raise ParseException('Bad day of week')
         else:
             self.context['dow'] = dow
@@ -797,7 +797,7 @@ class DateParser(dateparse_utilsListener):
                 raise ParseException(
                     f'Bad dayOfMonth expression: {ctx.getText()}'
                 )
-        except:
+        except Exception:
             raise ParseException(f'Bad dayOfMonth expression: {ctx.getText()}')
         self.context['day'] = day
 
@@ -814,7 +814,7 @@ class DateParser(dateparse_utilsListener):
                 raise ParseException(
                     f'Bad monthName expression: {ctx.getText()}'
                 )
-        except:
+        except Exception:
             raise ParseException(f'Bad monthName expression: {ctx.getText()}')
         else:
             self.context['month'] = month
@@ -828,7 +828,7 @@ class DateParser(dateparse_utilsListener):
                 raise ParseException(
                     f'Bad monthNumber expression: {ctx.getText()}'
                 )
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad monthNumber expression: {ctx.getText()}'
             )
@@ -840,7 +840,7 @@ class DateParser(dateparse_utilsListener):
             year = self._get_int(ctx.getText())
             if year < 1:
                 raise ParseException(f'Bad year expression: {ctx.getText()}')
-        except:
+        except Exception:
             raise ParseException(f'Bad year expression: {ctx.getText()}')
         else:
             self.context['year'] = year
@@ -851,7 +851,7 @@ class DateParser(dateparse_utilsListener):
         try:
             special = ctx.specialDate().getText().lower()
             self.context['special'] = special
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad specialDate expression: {ctx.specialDate().getText()}'
             )
@@ -864,7 +864,7 @@ class DateParser(dateparse_utilsListener):
                     self.context['special_next_last'] = 'next'
                 elif mod.LAST() is not None:
                     self.context['special_next_last'] = 'last'
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad specialDateNextLast expression: {ctx.getText()}'
             )
@@ -877,7 +877,7 @@ class DateParser(dateparse_utilsListener):
             count = self._get_int(ctx.unsignedInt().getText())
             unit = ctx.deltaUnit().getText().lower()
             ago_from_now = ctx.AGO_FROM_NOW().getText()
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad NFoosFromTodayAgoExpr: {ctx.getText()}'
             )
@@ -911,7 +911,7 @@ class DateParser(dateparse_utilsListener):
                     f'Bad This/Next/Last modifier: {mod}'
                 )
             unit = ctx.deltaUnit().getText().lower()
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad DeltaRelativeToTodayExpr: {ctx.getText()}'
             )
@@ -929,7 +929,7 @@ class DateParser(dateparse_utilsListener):
     ) -> None:
         try:
             txt = ctx.specialTime().getText().lower()
-        except:
+        except Exception:
             raise ParseException(
                 f'Bad special time expression: {ctx.getText()}'
             )
@@ -950,7 +950,7 @@ class DateParser(dateparse_utilsListener):
         try:
             tz = ctx.tzExpr().getText()
             self.context['tz'] = self._parse_tz(tz)
-        except:
+        except Exception:
             pass
 
     def exitTwelveHourTimeExpr(
@@ -961,14 +961,14 @@ class DateParser(dateparse_utilsListener):
             while not hour[-1].isdigit():
                 hour = hour[:-1]
             hour = self._get_int(hour)
-        except:
+        except Exception:
             raise ParseException(f'Bad hour: {ctx.hour().getText()}')
         if hour <= 0 or hour > 12:
             raise ParseException(f'Bad hour (out of range): {hour}')
 
         try:
             minute = self._get_int(ctx.minute().getText())
-        except:
+        except Exception:
             minute = 0
         if minute < 0 or minute > 59:
             raise ParseException(f'Bad minute (out of range): {minute}')
@@ -976,7 +976,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             seconds = self._get_int(ctx.second().getText())
-        except:
+        except Exception:
             seconds = 0
         if seconds < 0 or seconds > 59:
             raise ParseException(f'Bad second (out of range): {seconds}')
@@ -984,7 +984,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             micros = self._get_int(ctx.micros().getText())
-        except:
+        except Exception:
             micros = 0
         if micros < 0 or micros > 1000000:
             raise ParseException(f'Bad micros (out of range): {micros}')
@@ -992,7 +992,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             ampm = ctx.ampm().getText()
-        except:
+        except Exception:
             raise ParseException(f'Bad ampm: {ctx.ampm().getText()}')
         if hour == 12:
             hour = 0
@@ -1003,7 +1003,7 @@ class DateParser(dateparse_utilsListener):
         try:
             tz = ctx.tzExpr().getText()
             self.context['tz'] = self._parse_tz(tz)
-        except:
+        except Exception:
             pass
 
     def exitTwentyFourHourTimeExpr(
@@ -1014,7 +1014,7 @@ class DateParser(dateparse_utilsListener):
             while not hour[-1].isdigit():
                 hour = hour[:-1]
             hour = self._get_int(hour)
-        except:
+        except Exception:
             raise ParseException(f'Bad hour: {ctx.hour().getText()}')
         if hour < 0 or hour > 23:
             raise ParseException(f'Bad hour (out of range): {hour}')
@@ -1022,7 +1022,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             minute = self._get_int(ctx.minute().getText())
-        except:
+        except Exception:
             minute = 0
         if minute < 0 or minute > 59:
             raise ParseException(f'Bad minute (out of range): {ctx.getText()}')
@@ -1030,7 +1030,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             seconds = self._get_int(ctx.second().getText())
-        except:
+        except Exception:
             seconds = 0
         if seconds < 0 or seconds > 59:
             raise ParseException(f'Bad second (out of range): {ctx.getText()}')
@@ -1038,7 +1038,7 @@ class DateParser(dateparse_utilsListener):
 
         try:
             micros = self._get_int(ctx.micros().getText())
-        except:
+        except Exception:
             micros = 0
         if micros < 0 or micros >= 1000000:
             raise ParseException(f'Bad micros (out of range): {ctx.getText()}')
@@ -1047,7 +1047,7 @@ class DateParser(dateparse_utilsListener):
         try:
             tz = ctx.tzExpr().getText()
             self.context['tz'] = self._parse_tz(tz)
-        except:
+        except Exception:
             pass
 
 
index 292b933886d7b6b5cc80ec98f358ad1f29ae9abf..74e8fdab22749917f21ae88c150b838120bb0820 100644 (file)
@@ -3,6 +3,7 @@
 from itertools import islice
 from typing import Any, Callable, Dict, Iterator, Tuple
 
+
 def init_or_inc(
     d: Dict[Any, Any],
     key: Any,
index 770beaa9f97e3525f55b938e09a972e3b67e0e5a..34279ba8392c0e538ede3bfbc09b7af882657c16 100644 (file)
@@ -123,9 +123,9 @@ class LockFile(object):
             cmd = self.override_command
         else:
             cmd = ' '.join(sys.argv)
         contents = LockFileContents(
             pid = os.getpid(),
-            cmd,
+            commandline = cmd,
             expiration_timestamp = self.expiration_timestamp,
         )
         return json.dumps(contents.__dict__)
index 0c7d19362d7ed59cf9009053465763e47e6e4709..a0131b15373482fc00edd12297209622d0a70128 100644 (file)
@@ -269,7 +269,7 @@ class OutputMultiplexer(object):
                 open(filename, 'wb', buffering=0) for filename in filenames
             ]
         else:
-            if self.destination_bitv & OutputMultiplexer.FILENAMES:
+            if destination_bitv & OutputMultiplexer.FILENAMES:
                 raise ValueError(
                     "Filenames argument is required if bitv & FILENAMES"
                 )
@@ -278,7 +278,7 @@ class OutputMultiplexer(object):
         if handles is not None:
             self.h = [handle for handle in handles]
         else:
-            if self.destination_bitv & OutputMultiplexer.FILEHANDLES:
+            if destination_bitv & OutputMultiplexer.Destination.FILEHANDLES:
                 raise ValueError(
                     "Handle argument is required if bitv & FILEHANDLES"
                 )
old mode 100644 (file)
new mode 100755 (executable)
index 682855d..c697124
@@ -8,6 +8,7 @@ import re
 from typing import Dict, List
 
 import argparse_utils
+import bootstrap
 import config
 
 logger = logging.getLogger(__name__)
@@ -111,6 +112,7 @@ class PresenceDetection(object):
             if "cabin_" in line:
                 continue
             if location == Location.CABIN:
+                logger.debug('Cabin count: {cabin_count}')
                 cabin_count += 1
             try:
                 (mac, count, ip_name, mfg, ts) = line.split(",")
@@ -128,6 +130,7 @@ class PresenceDetection(object):
                 name = match.group(2)
                 self.names_by_mac[mac] = name
         if cabin_count > 0:
+            logger.debug('Weird MAC at the cabin')
             self.weird_mac_at_cabin = True
 
     def is_anyone_in_location_now(self, location: Location) -> bool:
@@ -152,15 +155,18 @@ class PresenceDetection(object):
         tiebreaks: Dict[Location, datetime.datetime] = {}
         credit = 10000
         for mac in self.devices_by_person[name]:
+            logger.debug(f'Looking for {name}... check for mac {mac}')
             if mac not in self.names_by_mac:
                 continue
             for location in self.location_ts_by_mac:
                 if mac in self.location_ts_by_mac[location]:
                     ts = (self.location_ts_by_mac[location])[mac]
+                    logger.debug(f'I saw {mac} at {location} at {ts}')
                     tiebreaks[location] = ts
             location = dict_utils.key_with_min_value(tiebreaks)
             v = votes.get(location, 0)
             votes[location] = v + credit
+            logger.debug('{name}: {location} gets {credit} votes.')
             credit = int(
                 credit * 0.667
             )  # Note: list most important devices first
@@ -170,3 +176,17 @@ class PresenceDetection(object):
             item = dict_utils.item_with_max_value(votes)
             return item[0]
         return Location.UNKNOWN
+
+
+def main() -> None:
+    p = PresenceDetection()
+    for person in Person:
+        print(f'{person} => {p.where_is_person_now(person)}')
+    print()
+    for location in Location:
+        print(f'{location} => {p.is_anyone_in_location_now(location)}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profanity_filter.py b/profanity_filter.py
new file mode 100755 (executable)
index 0000000..e1b4743
--- /dev/null
@@ -0,0 +1,556 @@
+#!/usr/bin/env python3
+
+import logging
+import random
+import string
+import sys
+
+import nltk
+from nltk.stem import PorterStemmer
+
+import string_utils
+
+
+logger = logging.getLogger(__name__)
+
+
+class ProfanityFilter(object):
+    def __init__(self):
+        self.bad_words = set([
+            'acrotomophilia',
+            'anal',
+            'analingus',
+            'anally',
+            'anilingus',
+            'anus',
+            'arsehol',
+            'arsehole',
+            'ass',
+            'asses',
+            'asshol',
+            'asshole',
+            'assmunch',
+            'auto erot',
+            'auto erotic',
+            'autoerotic',
+            'babeland',
+            'babi batter',
+            'baby batter',
+            'ball gag',
+            'ball gravi',
+            'ball gravy',
+            'ball kick',
+            'ball kicking',
+            'ball lick',
+            'ball licking',
+            'ball sack',
+            'ball suck',
+            'ball sucking',
+            'ball zack',
+            'bangbro',
+            'bangbros',
+            'bare legal',
+            'bareback',
+            'barely legal',
+            'barenak',
+            'barenaked',
+            'bastardo',
+            'bastinado',
+            'bbc',
+            'bbw',
+            'bdsm',
+            'beaver cleaver',
+            'beaver lip',
+            'beaver lips',
+            'bestial',
+            'bestiality',
+            'bi curiou',
+            'bi curious',
+            'big black',
+            'big breasts',
+            'big knocker',
+            'big knockers',
+            'big tit',
+            'big tits',
+            'bimbo',
+            'birdlock',
+            'bitch',
+            'bitches',
+            'black cock',
+            'blond action',
+            'blond on blond',
+            'blonde action',
+            'blow j',
+            'blow job',
+            'blow my',
+            'blow me',
+            'blow ourselv',
+            'blow ourselves',
+            'blow your load',
+            'blue waffl',
+            'blue waffle',
+            'blumpkin',
+            'bollock',
+            'bollocks',
+            'bondag',
+            'bondage',
+            'boner',
+            'boob',
+            'boobs',
+            'booti call',
+            'booty call',
+            'breast',
+            'breasts',
+            'brown shower',
+            'brown showers',
+            'brunett action',
+            'brunette action',
+            'bukkak',
+            'bukkake',
+            'bulldyk',
+            'bulldyke',
+            'bullet vibe',
+            'bullshit',
+            'bung hole',
+            'bunghol',
+            'bunghole',
+            'busti',
+            'busty',
+            'butt',
+            'buttcheek',
+            'buttcheeks',
+            'butthol',
+            'butthole',
+            'camel toe',
+            'camgirl',
+            'camslut',
+            'camwhore',
+            'carpet muncher',
+            'carpetmuncher',
+            'chocol rosebud',
+            'chocolate rosebuds',
+            'circlejerk',
+            'chink',
+            'cleveland steamer',
+            'clit',
+            'clitori',
+            'clitoris',
+            'clover clamp',
+            'clover clamps',
+            'clusterfuck',
+            'cock',
+            'cocks',
+            'coprolagnia',
+            'coprophilia',
+            'cornhol',
+            'cornhole',
+            'cream pie',
+            'creampi',
+            'creampie',
+            'cum',
+            'cumming',
+            'cunnilingu',
+            'cunnilingus',
+            'cunt',
+            'damn',
+            'darki',
+            'darkie',
+            'date rape',
+            'daterap',
+            'daterape',
+            'deep throat',
+            'deepthroat',
+            'dick',
+            'dildo',
+            'dirti pillow',
+            'dirti sanchez',
+            'dirty pillow',
+            'dirty sanchez',
+            'dog style',
+            'doggi style',
+            'doggie style',
+            'doggiestyl',
+            'doggiestyle',
+            'doggystyle',
+            'dolcett',
+            'domination',
+            'dominatrix',
+            'domm',
+            'dommes',
+            'donkey punch',
+            'doubl dick',
+            'doubl dong',
+            'doubl penetr',
+            'double dick',
+            'double dong',
+            'double penetration',
+            'dp action',
+            'dtf',
+            'eat my ass',
+            'ecchi',
+            'ejacul',
+            'erection',
+            'erotic',
+            'erotism',
+            'escort',
+            'ethical slut',
+            'eunuch',
+            'faggot',
+            'fecal',
+            'felch',
+            'fellatio',
+            'feltch',
+            'female squirting',
+            'femdom',
+            'figging',
+            'fingered',
+            'fingering',
+            'fingers',
+            'fisted',
+            'fisting',
+            'fists',
+            'foot fetish',
+            'footjob',
+            'frotting',
+            'fuck button',
+            'fuck',
+            'fucked',
+            'fucker',
+            'fuckhead',
+            'fuckin',
+            'fucking',
+            'fudge packer',
+            'fudgepack',
+            'fudgepacker',
+            'futanari',
+            'g spot',
+            'g-spot',
+            'gang bang',
+            'gay sex',
+            'gee spot',
+            'genital',
+            'giant cock',
+            'girl gone wild',
+            'girl on top',
+            'girl on',
+            'goatcx',
+            'goatse',
+            'goddamn',
+            'gokkun',
+            'golden shower',
+            'goo girl',
+            'goodpoop',
+            'goregasm',
+            'grope',
+            'group sex',
+            'gspot',
+            'guro',
+            'hand job',
+            'handjob',
+            'hard core',
+            'hardcore',
+            'hentai',
+            'homoerotic',
+            'honkey',
+            'hooker',
+            'horni',
+            'horny',
+            'hot chick',
+            'how to kill',
+            'how to murder',
+            'huge fat',
+            'humped',
+            'humping',
+            'humps',
+            'incest',
+            'intercourse',
+            'jack off',
+            'jail bait',
+            'jailbait',
+            'jerk off',
+            'jigaboo',
+            'jiggaboo',
+            'jiggerboo',
+            'jizz',
+            'jugg',
+            'kike',
+            'kinbaku',
+            'kinkster',
+            'kinky',
+            'knobbing',
+            'leather restraint',
+            'lemon party',
+            'lolita',
+            'lovemaking',
+            'make me come',
+            'male squirting',
+            'masturb',
+            'menage a trois',
+            'milf',
+            'missionary position',
+            'motherfuck',
+            'mound of venus',
+            'mr hand',
+            'muff diver',
+            'muffdiv',
+            'muffdiving',
+            'nambla',
+            'nawashi',
+            'negro',
+            'neonazi',
+            'nig nog',
+            'nigga',
+            'nigger',
+            'nimphomania',
+            'nipple',
+            'nip',
+            'not safe for',
+            'nsfl',
+            'nsfw',
+            'nude',
+            'nudes',
+            'nudity',
+            'nut sack',
+            'nutsack',
+            'nympho',
+            'nymphomania',
+            'octopussy',
+            'omorashi',
+            'one night stand',
+            'orgasm',
+            'orgy',
+            'paedophil',
+            'paedophile',
+            'panties',
+            'panty',
+            'pedobear',
+            'pedophil',
+            'pedophile',
+            'pee',
+            'pegging',
+            'peni',
+            'penis',
+            'phone sex',
+            'pigfucker',
+            'piss pig',
+            'piss',
+            'pissing',
+            'pisspig',
+            'playboy',
+            'pleasure chest',
+            'pole smoker',
+            'ponyplay',
+            'poof',
+            'poop chute',
+            'poopchute',
+            'porn',
+            'pornhub',
+            'porno',
+            'pornographi',
+            'pornography',
+            'prince albert',
+            'pthc',
+            'pube',
+            'pussi',
+            'pussies',
+            'pussy',
+            'queaf',
+            'queer',
+            'raghead',
+            'raging boner',
+            'rape',
+            'raping',
+            'rapist',
+            'rectum',
+            'reverse cowgirl',
+            'rimjob',
+            'rimming',
+            'rosy palm',
+            'rusty trombone',
+            's & m',
+            's&m',
+            's+m',
+            'sadism',
+            'scat',
+            'schlong',
+            'scissoring',
+            'semen',
+            'sex',
+            'sexi',
+            'sexo',
+            'sexy',
+            'shaved beaver',
+            'shaved pussy',
+            'shemale',
+            'shibari',
+            'shit',
+            'shota',
+            'shrimping',
+            'slanteye',
+            'slut',
+            'smut',
+            'snatch',
+            'snm',
+            'snowballing',
+            'sodomi',
+            'sodomize',
+            'sodomy',
+            'spic',
+            'spooge',
+            'spread legs',
+            'squirting',
+            'strap on',
+            'strapon',
+            'strappado',
+            'strip club',
+            'style doggy',
+            'suck',
+            'suicide girls',
+            'sultry women',
+            'swastika',
+            'swinger',
+            'taint',
+            'tainted love',
+            'taste my',
+            'tea bagging',
+            'threesome',
+            'throating',
+            'tied up',
+            'tight white',
+            'tit',
+            'tits',
+            'titti',
+            'titties',
+            'titty',
+            'tongue in',
+            'topless',
+            'tosser',
+            'towelhead',
+            'tranny',
+            'tribadism',
+            'tub girl',
+            'tubgirl',
+            'tushy',
+            'twat',
+            'twink',
+            'twinki',
+            'twinkie',
+            'undress',
+            'upskirt',
+            'urethra play',
+            'urophilia',
+            'vag',
+            'vagina',
+            'venus mound',
+            'vibrator',
+            'violet blue',
+            'violet wand',
+            'vorarephilia',
+            'voyeur',
+            'vulva',
+            'wank',
+            'wet dream',
+            'wetback',
+            'white power',
+            'whore',
+            'women rapping',
+            'wrapping men',
+            'wrinkled starfish',
+            'xx',
+            'xxx',
+            'yaoi',
+            'yellow shower',
+            'yiffy',
+            'zoophilia',
+        ])
+        self.stemmer = PorterStemmer()
+
+    def _normalize(self, text: str) -> str:
+        result = text.lower()
+        result = result.replace("_", " ")
+        for x in string.punctuation:
+            result = result.replace(x, "")
+        chunks = [
+            self.stemmer.stem(word) for word in nltk.word_tokenize(result)
+        ]
+        return ' '.join(chunks)
+
+    def contains_bad_word(self, text: str) -> bool:
+        """Return True if any token, bigram or trigram of text is profane."""
+        words = nltk.word_tokenize(text)
+        for word in words:
+            if self.is_bad_word(word):
+                logger.debug(f'"{word}" is profanity')
+                return True
+        # Multi-word entries in bad_words only match 2- and 3-token windows.
+        if len(words) > 1:
+            for bigram in string_utils.ngrams_presplit(words, 2):
+                if self.is_bad_word(bigram):
+                    logger.debug(f'"{bigram}" is profanity')
+                    return True
+        if len(words) > 2:
+            for trigram in string_utils.ngrams_presplit(words, 3):
+                if self.is_bad_word(trigram):
+                    logger.debug(f'"{trigram}" is profanity')
+                    return True
+        return False
+
+    def is_bad_word(self, word: str) -> bool:
+        return (
+            word in self.bad_words or
+            self._normalize(word) in self.bad_words
+        )
+
+    def obscure_bad_words(self, text: str) -> str:
+
+        def obscure(word: str):
+            out = ''
+            last = ''
+            for letter in word:
+                if letter.isspace():
+                    out += letter
+                else:
+                    while True:
+                        char = random.choice(['#', '%', '!', '@', '&', '*'])
+                        if last != char:
+                            last = char
+                            out += char
+                            break
+            return out
+
+        words = nltk.word_tokenize(text)
+        words.append('')
+        words.append('')
+        words.append('')
+        out = ''
+
+        cursor = 0
+        while cursor < len(words) - 3:
+            word = words[cursor]
+            bigram = word + ' ' + words[cursor + 1]
+            trigram = bigram + ' ' + words[cursor + 2]
+            if self.is_bad_word(trigram):
+                out += obscure(trigram) + ' '
+                cursor += 3
+            elif self.is_bad_word(bigram):
+                out += obscure(bigram) + ' '
+                cursor += 2
+            elif self.is_bad_word(word):
+                out += obscure(word) + ' '
+                cursor += 1
+            else:
+                out += word + ' '
+                cursor += 1
+        return out.strip()
+
+
+def main() -> None:
+    pf = ProfanityFilter()
+    phrase = ' '.join(sys.argv[1:])
+    print(pf.contains_bad_word(phrase))
+    print(pf.obscure_bad_words(phrase))
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
index 6fc257de52c48f34e207e79e8b2227e914ad2b8c..45cf5aab7ac7f5202346745de733c792c984214d 100644 (file)
@@ -9,7 +9,7 @@ import logging
 import random
 import re
 import string
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, Iterable, List, Optional
 import unicodedata
 from uuid import uuid4
 
@@ -963,3 +963,21 @@ def thify(n: int) -> str:
         return "rd"
     else:
         return "th"
+
+
+def ngrams(txt: str, n: int):
+    words = txt.split()
+    return ngrams_presplit(words, n)
+
+
+def ngrams_presplit(words: Iterable[str], n: int):
+    seq = list(words)  # slicing needs a sequence; accept any iterable
+    yield from map(' '.join, zip(*[seq[i:] for i in range(n)]))
+
+
+def bigrams(txt: str):
+    return ngrams(txt, 2)
+
+
+def trigrams(txt: str):
+    return ngrams(txt, 3)
diff --git a/tests/profanity_filter_test.py b/tests/profanity_filter_test.py
new file mode 100755 (executable)
index 0000000..5648ad3
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import unittest
+
+import profanity_filter as pf
+import unittest_utils
+
+
+class TestProfanityFilter(unittest.TestCase):
+
+    def test_basic_functionality(self):
+        p = pf.ProfanityFilter()
+        self.assertTrue(p.is_bad_word('shit'))
+        self.assertTrue(p.contains_bad_word('this is another fucking test'))
+        self.assertTrue(p.contains_bad_word('this is another fuckin test'))
+        self.assertFalse(p.contains_bad_word('Mary had a little lamb whose fleese was white as snow.'))
+
+
+if __name__ == '__main__':
+    unittest.main()