Easier and more self-documenting patterns for loading/saving Persistent
[python_utils.git] / profanity_filter.py
index 37756bac99abdaaa298b92ab7ff4f984ec844d51..1a855857478089f010a16115166c3ea488922259 100755 (executable)
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 
-"""A helper to identify and optionally obscure some bad words."""
+# © Copyright 2021-2022, Scott Gasch
+
+"""A helper to identify and optionally obscure some bad words.  Not
+perfect but decent.  Uses a fuzzy block list rather than ML."""
 
 import logging
 import random
@@ -26,14 +29,13 @@ class ProfanityFilter(object):
             [
                 'acrotomophilia',
                 'anal',
-                'analingus',
+                'analingu',
                 'anally',
-                'anilingus',
+                'anilingu',
                 'anus',
                 'arsehol',
                 'arsehole',
                 'ass',
-                'asses',
                 'asshol',
                 'asshole',
                 'assmunch',
@@ -55,7 +57,6 @@ class ProfanityFilter(object):
                 'ball sucking',
                 'ball zack',
                 'bangbro',
-                'bangbros',
                 'bare legal',
                 'bareback',
                 'barely legal',
@@ -68,21 +69,16 @@ class ProfanityFilter(object):
                 'bdsm',
                 'beaver cleaver',
                 'beaver lip',
-                'beaver lips',
                 'bestial',
                 'bestiality',
                 'bi curiou',
-                'bi curious',
                 'big black',
-                'big breasts',
+                'big breast',
                 'big knocker',
-                'big knockers',
                 'big tit',
-                'big tits',
                 'bimbo',
                 'birdlock',
                 'bitch',
-                'bitches',
                 'black cock',
                 'blond action',
                 'blond on blond',
@@ -99,18 +95,14 @@ class ProfanityFilter(object):
                 'blue waffle',
                 'blumpkin',
                 'bollock',
-                'bollocks',
                 'bondag',
                 'bondage',
                 'boner',
                 'boob',
-                'boobs',
                 'booti call',
                 'booty call',
                 'breast',
-                'breasts',
                 'brown shower',
-                'brown showers',
                 'brunett action',
                 'brunette action',
                 'bukkak',
@@ -126,7 +118,6 @@ class ProfanityFilter(object):
                 'busty',
                 'butt',
                 'buttcheek',
-                'buttcheeks',
                 'butthol',
                 'butthole',
                 'camel toe',
@@ -136,18 +127,16 @@ class ProfanityFilter(object):
                 'carpet muncher',
                 'carpetmuncher',
                 'chocol rosebud',
-                'chocolate rosebuds',
                 'circlejerk',
                 'chink',
                 'cleveland steamer',
                 'clit',
+                'clitor',
                 'clitori',
-                'clitoris',
                 'clover clamp',
-                'clover clamps',
                 'clusterfuck',
+                'cluster fuck',
                 'cock',
-                'cocks',
                 'coprolagnia',
                 'coprophilia',
                 'cornhol',
@@ -158,7 +147,6 @@ class ProfanityFilter(object):
                 'cum',
                 'cumming',
                 'cunnilingu',
-                'cunnilingus',
                 'cunt',
                 'damn',
                 'darki',
@@ -211,12 +199,8 @@ class ProfanityFilter(object):
                 'female squirting',
                 'femdom',
                 'figging',
-                'fingered',
-                'fingering',
-                'fingers',
-                'fisted',
-                'fisting',
-                'fists',
+                'finger',
+                'fist',
                 'foot fetish',
                 'footjob',
                 'frotting',
@@ -244,6 +228,11 @@ class ProfanityFilter(object):
                 'give head',
                 'giving head',
                 'gave head',
+                'gave you head',
+                'gave him head',
+                'gave them head',
+                'gave us head',
+                'glori hole',
                 'goatcx',
                 'goatse',
                 'goddamn',
@@ -272,7 +261,7 @@ class ProfanityFilter(object):
                 'huge fat',
                 'humped',
                 'humping',
-                'humps',
+                'hump',
                 'incest',
                 'intercourse',
                 'jack off',
@@ -297,10 +286,11 @@ class ProfanityFilter(object):
                 'male squirting',
                 'masturb',
                 'menage a trois',
+                'menag a troi',
                 'milf',
                 'missionary position',
                 'motherfuck',
-                'mound of venus',
+                'mound of venu',
                 'mr hand',
                 'muff diver',
                 'muffdiv',
@@ -319,7 +309,6 @@ class ProfanityFilter(object):
                 'nsfl',
                 'nsfw',
                 'nude',
-                'nudes',
                 'nudity',
                 'nut sack',
                 'nutsack',
@@ -333,7 +322,7 @@ class ProfanityFilter(object):
                 'paedophil',
                 'paedophile',
                 'panties',
-                'panty',
+                'panti',
                 'pedobear',
                 'pedophil',
                 'pedophile',
@@ -391,8 +380,8 @@ class ProfanityFilter(object):
                 'sexi',
                 'sexo',
                 'sexy',
-                'shaved beaver',
-                'shaved pussy',
+                'shave beaver',
+                'shave pussi',
                 'shemale',
                 'shibari',
                 'shit',
@@ -409,7 +398,7 @@ class ProfanityFilter(object):
                 'sodomy',
                 'spic',
                 'spooge',
-                'spread legs',
+                'spread leg',
                 'squirting',
                 'strap on',
                 'strapon',
@@ -417,7 +406,7 @@ class ProfanityFilter(object):
                 'strip club',
                 'style doggy',
                 'suck',
-                'suicide girls',
+                'suicid girl',
                 'sultry women',
                 'swastika',
                 'swinger',
@@ -430,7 +419,6 @@ class ProfanityFilter(object):
                 'tied up',
                 'tight white',
                 'tit',
-                'tits',
                 'titti',
                 'titties',
                 'titty',
@@ -453,7 +441,7 @@ class ProfanityFilter(object):
                 'urophilia',
                 'vag',
                 'vagina',
-                'venus mound',
+                'venu mound',
                 'vibrator',
                 'violet blue',
                 'violet wand',
@@ -490,6 +478,9 @@ class ProfanityFilter(object):
         >>> _normalize('fucking a whore')
         'fuck a whore'
 
+        >>> _normalize('pu55y')
+        'pussy'
+
         """
         result = text.lower()
         result = result.replace("_", " ")
@@ -505,6 +496,7 @@ class ProfanityFilter(object):
 
     @staticmethod
     def tokenize(text: str):
+        """Tokenize text into word-like chunks"""
         for x in nltk.word_tokenize(text):
             for y in re.split(r'\W+', x):
                 yield y
@@ -545,12 +537,12 @@ class ProfanityFilter(object):
         return False
 
     def is_bad_word(self, word: str) -> bool:
+        """Return True if word, as given or normalized, is in self.bad_words."""
         return word in self.bad_words or self._normalize(word) in self.bad_words
 
     def obscure_bad_words(self, text: str) -> str:
         """Obscure bad words that are detected by inserting random punctuation
         characters.
-
         """
 
         def obscure(word: str):