X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=1a855857478089f010a16115166c3ea488922259;hb=02302bbd9363facb59c4df2c1f4013087702cfa6;hp=95540fa7b36f0bd8fcf813196e2f9f2390569fce;hpb=36fea7f15ed17150691b5b3ead75450e575229ef;p=python_utils.git

diff --git a/profanity_filter.py b/profanity_filter.py
index 95540fa..1a85585 100755
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
 
+# © Copyright 2021-2022, Scott Gasch
+
+"""A helper to identify and optionally obscure some bad words.  Not
+perfect but decent.  Uses a fuzzy block list rather than ML."""
+
 import logging
 import random
 import re
@@ -12,25 +17,25 @@ from nltk.stem import PorterStemmer
 import decorator_utils
 import string_utils
 
-
 logger = logging.getLogger(__name__)
 
 
 @decorator_utils.singleton
 class ProfanityFilter(object):
+    """A helper to identify and optionally obscure some bad words."""
+
     def __init__(self):
         self.bad_words = set(
             [
                 'acrotomophilia',
                 'anal',
-                'analingus',
+                'analingu',
                 'anally',
-                'anilingus',
+                'anilingu',
                 'anus',
                 'arsehol',
                 'arsehole',
                 'ass',
-                'asses',
                 'asshol',
                 'asshole',
                 'assmunch',
@@ -52,7 +57,6 @@ class ProfanityFilter(object):
                 'ball sucking',
                 'ball zack',
                 'bangbro',
-                'bangbros',
                 'bare legal',
                 'bareback',
                 'barely legal',
@@ -65,21 +69,16 @@ class ProfanityFilter(object):
                 'bdsm',
                 'beaver cleaver',
                 'beaver lip',
-                'beaver lips',
                 'bestial',
                 'bestiality',
                 'bi curiou',
-                'bi curious',
                 'big black',
-                'big breasts',
+                'big breast',
                 'big knocker',
-                'big knockers',
                 'big tit',
-                'big tits',
                 'bimbo',
                 'birdlock',
                 'bitch',
-                'bitches',
                 'black cock',
                 'blond action',
                 'blond on blond',
@@ -96,18 +95,14 @@ class ProfanityFilter(object):
                 'blue waffle',
                 'blumpkin',
                 'bollock',
-                'bollocks',
                 'bondag',
                 'bondage',
                 'boner',
                 'boob',
-                'boobs',
                 'booti call',
                 'booty call',
                 'breast',
-                'breasts',
                 'brown shower',
-                'brown showers',
                 'brunett action',
                 'brunette action',
                 'bukkak',
@@ -123,7 +118,6 @@ class ProfanityFilter(object):
                 'busty',
                 'butt',
                 'buttcheek',
-                'buttcheeks',
                 'butthol',
                 'butthole',
                 'camel toe',
@@ -133,18 +127,16 @@ class ProfanityFilter(object):
                 'carpet muncher',
                 'carpetmuncher',
                 'chocol rosebud',
-                'chocolate rosebuds',
                 'circlejerk',
                 'chink',
                 'cleveland steamer',
                 'clit',
+                'clitor',
                 'clitori',
-                'clitoris',
                 'clover clamp',
-                'clover clamps',
                 'clusterfuck',
+                'cluster fuck',
                 'cock',
-                'cocks',
                 'coprolagnia',
                 'coprophilia',
                 'cornhol',
@@ -155,7 +147,6 @@ class ProfanityFilter(object):
                 'cum',
                 'cumming',
                 'cunnilingu',
-                'cunnilingus',
                 'cunt',
                 'damn',
                 'darki',
@@ -208,12 +199,8 @@ class ProfanityFilter(object):
                 'female squirting',
                 'femdom',
                 'figging',
-                'fingered',
-                'fingering',
-                'fingers',
-                'fisted',
-                'fisting',
-                'fists',
+                'finger',
+                'fist',
                 'foot fetish',
                 'footjob',
                 'frotting',
@@ -238,6 +225,14 @@ class ProfanityFilter(object):
                 'girl gone wild',
                 'girl on top',
                 'girl on',
+                'give head',
+                'giving head',
+                'gave head',
+                'gave you head',
+                'gave him head',
+                'gave them head',
+                'gave us head',
+                'glori hole',
                 'goatcx',
                 'goatse',
                 'goddamn',
@@ -266,7 +261,7 @@ class ProfanityFilter(object):
                 'huge fat',
                 'humped',
                 'humping',
-                'humps',
+                'hump',
                 'incest',
                 'intercourse',
                 'jack off',
@@ -291,10 +286,11 @@ class ProfanityFilter(object):
                 'male squirting',
                 'masturb',
                 'menage a trois',
+                'menag a troi',
                 'milf',
                 'missionary position',
                 'motherfuck',
-                'mound of venus',
+                'mound of venu',
                 'mr hand',
                 'muff diver',
                 'muffdiv',
@@ -313,7 +309,6 @@ class ProfanityFilter(object):
                 'nsfl',
                 'nsfw',
                 'nude',
-                'nudes',
                 'nudity',
                 'nut sack',
                 'nutsack',
@@ -327,7 +322,7 @@ class ProfanityFilter(object):
                 'paedophil',
                 'paedophile',
                 'panties',
-                'panty',
+                'panti',
                 'pedobear',
                 'pedophil',
                 'pedophile',
@@ -385,8 +380,8 @@ class ProfanityFilter(object):
                 'sexi',
                 'sexo',
                 'sexy',
-                'shaved beaver',
-                'shaved pussy',
+                'shave beaver',
+                'shave pussi',
                 'shemale',
                 'shibari',
                 'shit',
@@ -403,7 +398,7 @@ class ProfanityFilter(object):
                 'sodomy',
                 'spic',
                 'spooge',
-                'spread legs',
+                'spread leg',
                 'squirting',
                 'strap on',
                 'strapon',
@@ -411,7 +406,7 @@ class ProfanityFilter(object):
                 'strip club',
                 'style doggy',
                 'suck',
-                'suicide girls',
+                'suicid girl',
                 'sultry women',
                 'swastika',
                 'swinger',
@@ -424,7 +419,6 @@ class ProfanityFilter(object):
                 'tied up',
                 'tight white',
                 'tit',
-                'tits',
                 'titti',
                 'titties',
                 'titty',
@@ -447,7 +441,7 @@ class ProfanityFilter(object):
                 'urophilia',
                 'vag',
                 'vagina',
-                'venus mound',
+                'venu mound',
                 'vibrator',
                 'violet blue',
                 'violet wand',
@@ -484,6 +478,9 @@ class ProfanityFilter(object):
         >>> _normalize('fucking a whore')
         'fuck a whore'
 
+        >>> _normalize('pu55y')
+        'pussy'
+
         """
         result = text.lower()
         result = result.replace("_", " ")
@@ -494,14 +491,14 @@ class ProfanityFilter(object):
         result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
-        chunks = [
-            self.stemmer.stem(word) for word in nltk.word_tokenize(result)
-        ]
+        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
         return ' '.join(chunks)
 
-    def tokenize(self, text: str):
+    @staticmethod
+    def tokenize(text: str):
+        """Tokenize text into word-like chunks"""
         for x in nltk.word_tokenize(text):
-            for y in re.split('\W+', x):
+            for y in re.split(r'\W+', x):
                 yield y
 
     def contains_bad_word(self, text: str) -> bool:
@@ -518,34 +515,34 @@ class ProfanityFilter(object):
         False
 
         """
-        words = [word for word in self.tokenize(text)]
+        words = list(self.tokenize(text))
         for word in words:
             if self.is_bad_word(word):
-                logger.debug(f'"{word}" is profanity')
+                logger.debug('"%s" is profanity', word)
                 return True
 
         if len(words) > 1:
             for bigram in string_utils.ngrams_presplit(words, 2):
                 bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug(f'"{bigram}" is profanity')
+                    logger.debug('"%s" is profanity', bigram)
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
                 trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug(f'"{trigram}" is profanity')
+                    logger.debug('"%s" is profanity', trigram)
                     return True
         return False
 
     def is_bad_word(self, word: str) -> bool:
+        """True if we think word is a bad word."""
         return word in self.bad_words or self._normalize(word) in self.bad_words
 
     def obscure_bad_words(self, text: str) -> str:
         """Obscure bad words that are detected by inserting random punctuation
         characters.
-
         """
 
         def obscure(word: str):
@@ -563,7 +560,7 @@ class ProfanityFilter(object):
                             break
             return out
 
-        words = self.tokenize(text)
+        words = list(self.tokenize(text))
         words.append('')
         words.append('')
         words.append('')