#!/usr/bin/env python3
"""A helper to identify and optionally obscure some bad words."""

import logging
import random
import re
import decorator_utils
import string_utils
logger = logging.getLogger(__name__)
@decorator_utils.singleton
class ProfanityFilter(object):
    """A helper to identify and optionally obscure some bad words."""

def __init__(self):
self.bad_words = set(
[
'girl gone wild',
'girl on top',
'girl on',
                'give head',
                'giving head',
                'gave head',
'goatcx',
'goatse',
'goddamn',
chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
return ' '.join(chunks)
- def tokenize(self, text: str):
+ @staticmethod
+ def tokenize(text: str):
for x in nltk.word_tokenize(text):
- for y in re.split('\W+', x):
+ for y in re.split(r'\W+', x):
yield y
def contains_bad_word(self, text: str) -> bool:
False
"""
- words = [word for word in self.tokenize(text)]
+ words = list(self.tokenize(text))
for word in words:
if self.is_bad_word(word):
- logger.debug(f'"{word}" is profanity')
+ logger.debug('"%s" is profanity', word)
return True
if len(words) > 1:
for bigram in string_utils.ngrams_presplit(words, 2):
bigram = ' '.join(bigram)
if self.is_bad_word(bigram):
- logger.debug(f'"{bigram}" is profanity')
+ logger.debug('"%s" is profanity', bigram)
return True
if len(words) > 2:
for trigram in string_utils.ngrams_presplit(words, 3):
trigram = ' '.join(trigram)
if self.is_bad_word(trigram):
- logger.debug(f'"{trigram}" is profanity')
+ logger.debug('"%s" is profanity', trigram)
return True
return False
break
return out
- words = self.tokenize(text)
+ words = list(self.tokenize(text))
words.append('')
words.append('')
words.append('')