Fix a couple of bugs in executors. Use run_tests.sh as a pre-commit hook.

[python_utils.git] / profanity_filter.py
diff --git a/profanity_filter.py b/profanity_filter.py

index e1b474323fb67823a0c1607e69df08ec41b5398d..fe5422179ba9a50c678188e088689184f139a14d 100755 (executable)
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -8,12 +8,14 @@ import sys
  import nltk
  from nltk.stem import PorterStemmer
  
+import decorator_utils
  import string_utils
  
  
  logger = logging.getLogger(__name__)
  
  
+@decorator_utils.singleton
  class ProfanityFilter(object):
      def __init__(self):
          self.bad_words = set([
@@ -82,6 +84,7 @@ class ProfanityFilter(object):
              'blonde action',
              'blow j',
              'blow job',
+            'blowjob',
              'blow my',
              'blow me',
              'blow ourselv',
@@ -344,6 +347,7 @@ class ProfanityFilter(object):
              'poop chute',
              'poopchute',
              'porn',
+            'pron',
              'pornhub',
              'porno',
              'pornographi',
@@ -468,6 +472,11 @@ class ProfanityFilter(object):
      def _normalize(self, text: str) -> str:
          result = text.lower()
          result = result.replace("_", " ")
+        result = result.replace('0', 'o')
+        result = result.replace('1', 'l')
+        result = result.replace('4', 'a')
+        result = result.replace('5', 's')
+        result = result.replace('3', 'e')
          for x in string.punctuation:
              result = result.replace(x, "")
          chunks = [
@@ -484,14 +493,16 @@ class ProfanityFilter(object):
  
          if len(words) > 1:
              for bigram in string_utils.ngrams_presplit(words, 2):
+                bigram = ' '.join(bigram)
                  if self.is_bad_word(bigram):
-                    logger.debug('"{bigram}" is profanity')
+                    logger.debug(f'"{bigram}" is profanity')
                      return True
  
          if len(words) > 2:
              for trigram in string_utils.ngrams_presplit(words, 3):
+                trigram = ' '.join(trigram)
                  if self.is_bad_word(trigram):
-                    logger.debug('"{trigram}" is profanity')
+                    logger.debug(f'"{trigram}" is profanity')
                      return True
          return False