Since this thing is on the innerwebs I suppose it should have a copyright...
[python_utils.git] / profanity_filter.py
index 4723a2db0679e5f866f14bbb723c66391fa06ae6..a1f0c0b9adaa8971dfd243694cd096a2e84a077d 100755 (executable)
@@ -1,5 +1,9 @@
 #!/usr/bin/env python3
 
+# © Copyright 2021-2022, Scott Gasch
+
+"""A helper to identify and optionally obscure some bad words."""
+
 import logging
 import random
 import re
@@ -12,25 +16,25 @@ from nltk.stem import PorterStemmer
 import decorator_utils
 import string_utils
 
-
 logger = logging.getLogger(__name__)
 
 
 @decorator_utils.singleton
 class ProfanityFilter(object):
+    """A helper to identify and optionally obscure some bad words."""
+
     def __init__(self):
         self.bad_words = set(
             [
                 'acrotomophilia',
                 'anal',
-                'analingus',
+                'analingu',
                 'anally',
-                'anilingus',
+                'anilingu',
                 'anus',
                 'arsehol',
                 'arsehole',
                 'ass',
-                'asses',
                 'asshol',
                 'asshole',
                 'assmunch',
@@ -52,7 +56,6 @@ class ProfanityFilter(object):
                 'ball sucking',
                 'ball zack',
                 'bangbro',
-                'bangbros',
                 'bare legal',
                 'bareback',
                 'barely legal',
@@ -65,21 +68,16 @@ class ProfanityFilter(object):
                 'bdsm',
                 'beaver cleaver',
                 'beaver lip',
-                'beaver lips',
                 'bestial',
                 'bestiality',
                 'bi curiou',
-                'bi curious',
                 'big black',
-                'big breasts',
+                'big breast',
                 'big knocker',
-                'big knockers',
                 'big tit',
-                'big tits',
                 'bimbo',
                 'birdlock',
                 'bitch',
-                'bitches',
                 'black cock',
                 'blond action',
                 'blond on blond',
@@ -96,18 +94,14 @@ class ProfanityFilter(object):
                 'blue waffle',
                 'blumpkin',
                 'bollock',
-                'bollocks',
                 'bondag',
                 'bondage',
                 'boner',
                 'boob',
-                'boobs',
                 'booti call',
                 'booty call',
                 'breast',
-                'breasts',
                 'brown shower',
-                'brown showers',
                 'brunett action',
                 'brunette action',
                 'bukkak',
@@ -123,7 +117,6 @@ class ProfanityFilter(object):
                 'busty',
                 'butt',
                 'buttcheek',
-                'buttcheeks',
                 'butthol',
                 'butthole',
                 'camel toe',
@@ -133,18 +126,16 @@ class ProfanityFilter(object):
                 'carpet muncher',
                 'carpetmuncher',
                 'chocol rosebud',
-                'chocolate rosebuds',
                 'circlejerk',
                 'chink',
                 'cleveland steamer',
                 'clit',
+                'clitor',
                 'clitori',
-                'clitoris',
                 'clover clamp',
-                'clover clamps',
                 'clusterfuck',
+                'cluster fuck',
                 'cock',
-                'cocks',
                 'coprolagnia',
                 'coprophilia',
                 'cornhol',
@@ -155,7 +146,6 @@ class ProfanityFilter(object):
                 'cum',
                 'cumming',
                 'cunnilingu',
-                'cunnilingus',
                 'cunt',
                 'damn',
                 'darki',
@@ -208,12 +198,8 @@ class ProfanityFilter(object):
                 'female squirting',
                 'femdom',
                 'figging',
-                'fingered',
-                'fingering',
-                'fingers',
-                'fisted',
-                'fisting',
-                'fists',
+                'finger',
+                'fist',
                 'foot fetish',
                 'footjob',
                 'frotting',
@@ -238,6 +224,14 @@ class ProfanityFilter(object):
                 'girl gone wild',
                 'girl on top',
                 'girl on',
+                'give head',
+                'giving head',
+                'gave head',
+                'gave you head',
+                'gave him head',
+                'gave them head',
+                'gave us head',
+                'glori hole',
                 'goatcx',
                 'goatse',
                 'goddamn',
@@ -266,7 +260,7 @@ class ProfanityFilter(object):
                 'huge fat',
                 'humped',
                 'humping',
-                'humps',
+                'hump',
                 'incest',
                 'intercourse',
                 'jack off',
@@ -291,10 +285,11 @@ class ProfanityFilter(object):
                 'male squirting',
                 'masturb',
                 'menage a trois',
+                'menag a troi',
                 'milf',
                 'missionary position',
                 'motherfuck',
-                'mound of venus',
+                'mound of venu',
                 'mr hand',
                 'muff diver',
                 'muffdiv',
@@ -313,7 +308,6 @@ class ProfanityFilter(object):
                 'nsfl',
                 'nsfw',
                 'nude',
-                'nudes',
                 'nudity',
                 'nut sack',
                 'nutsack',
@@ -327,7 +321,7 @@ class ProfanityFilter(object):
                 'paedophil',
                 'paedophile',
                 'panties',
-                'panty',
+                'panti',
                 'pedobear',
                 'pedophil',
                 'pedophile',
@@ -385,8 +379,8 @@ class ProfanityFilter(object):
                 'sexi',
                 'sexo',
                 'sexy',
-                'shaved beaver',
-                'shaved pussy',
+                'shave beaver',
+                'shave pussi',
                 'shemale',
                 'shibari',
                 'shit',
@@ -403,7 +397,7 @@ class ProfanityFilter(object):
                 'sodomy',
                 'spic',
                 'spooge',
-                'spread legs',
+                'spread leg',
                 'squirting',
                 'strap on',
                 'strapon',
@@ -411,7 +405,7 @@ class ProfanityFilter(object):
                 'strip club',
                 'style doggy',
                 'suck',
-                'suicide girls',
+                'suicid girl',
                 'sultry women',
                 'swastika',
                 'swinger',
@@ -424,7 +418,6 @@ class ProfanityFilter(object):
                 'tied up',
                 'tight white',
                 'tit',
-                'tits',
                 'titti',
                 'titties',
                 'titty',
@@ -447,7 +440,7 @@ class ProfanityFilter(object):
                 'urophilia',
                 'vag',
                 'vagina',
-                'venus mound',
+                'venu mound',
                 'vibrator',
                 'violet blue',
                 'violet wand',
@@ -497,9 +490,10 @@ class ProfanityFilter(object):
         chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
         return ' '.join(chunks)
 
-    def tokenize(self, text: str):
+    @staticmethod
+    def tokenize(text: str):
         for x in nltk.word_tokenize(text):
-            for y in re.split('\W+', x):
+            for y in re.split(r'\W+', x):
                 yield y
 
     def contains_bad_word(self, text: str) -> bool:
@@ -516,24 +510,24 @@ class ProfanityFilter(object):
         False
 
         """
-        words = [word for word in self.tokenize(text)]
+        words = list(self.tokenize(text))
         for word in words:
             if self.is_bad_word(word):
-                logger.debug(f'"{word}" is profanity')
+                logger.debug('"%s" is profanity', word)
                 return True
 
         if len(words) > 1:
             for bigram in string_utils.ngrams_presplit(words, 2):
                 bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug(f'"{bigram}" is profanity')
+                    logger.debug('"%s" is profanity', bigram)
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
                 trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug(f'"{trigram}" is profanity')
+                    logger.debug('"%s" is profanity', trigram)
                     return True
         return False
 
@@ -561,7 +555,7 @@ class ProfanityFilter(object):
                             break
             return out
 
-        words = self.tokenize(text)
+        words = list(self.tokenize(text))
         words.append('')
         words.append('')
         words.append('')