X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=37756bac99abdaaa298b92ab7ff4f984ec844d51;hb=e8fbbb7306430478dec55d2c963eed116d8330cc;hp=95540fa7b36f0bd8fcf813196e2f9f2390569fce;hpb=36fea7f15ed17150691b5b3ead75450e575229ef;p=python_utils.git

diff --git a/profanity_filter.py b/profanity_filter.py
index 95540fa..37756ba 100755
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+"""A helper to identify and optionally obscure some bad words."""
+
 import logging
 import random
 import re
@@ -12,12 +14,13 @@ from nltk.stem import PorterStemmer
 import decorator_utils
 import string_utils
 
-
 logger = logging.getLogger(__name__)
 
 
 @decorator_utils.singleton
 class ProfanityFilter(object):
+    """A helper to identify and optionally obscure some bad words."""
+
     def __init__(self):
         self.bad_words = set(
             [
@@ -238,6 +241,9 @@ class ProfanityFilter(object):
                 'girl gone wild',
                 'girl on top',
                 'girl on',
+                'give head',
+                'giving head',
+                'gave head',
                 'goatcx',
                 'goatse',
                 'goddamn',
@@ -494,14 +500,13 @@ class ProfanityFilter(object):
         result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
-        chunks = [
-            self.stemmer.stem(word) for word in nltk.word_tokenize(result)
-        ]
+        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
         return ' '.join(chunks)
 
-    def tokenize(self, text: str):
+    @staticmethod
+    def tokenize(text: str):
         for x in nltk.word_tokenize(text):
-            for y in re.split('\W+', x):
+            for y in re.split(r'\W+', x):
                 yield y
 
     def contains_bad_word(self, text: str) -> bool:
@@ -518,24 +523,24 @@
         False
 
         """
-        words = [word for word in self.tokenize(text)]
+        words = list(self.tokenize(text))
         for word in words:
             if self.is_bad_word(word):
-                logger.debug(f'"{word}" is profanity')
+                logger.debug('"%s" is profanity', word)
                 return True
 
         if len(words) > 1:
             for bigram in string_utils.ngrams_presplit(words, 2):
                 bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug(f'"{bigram}" is profanity')
+                    logger.debug('"%s" is profanity', bigram)
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
                 trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug(f'"{trigram}" is profanity')
+                    logger.debug('"%s" is profanity', trigram)
                     return True
         return False
@@ -563,7 +568,7 @@
                     break
            return out
 
-        words = self.tokenize(text)
+        words = list(self.tokenize(text))
        words.append('')
        words.append('')
        words.append('')