Experiment with audit events in bootstrap.
[python_utils.git] / profanity_filter.py
index f238e7dbda085486f41483ca73ebdbe2b99569b3..fe5422179ba9a50c678188e088689184f139a14d 100755 (executable)
@@ -8,12 +8,14 @@ import sys
 import nltk
 from nltk.stem import PorterStemmer
 
+import decorator_utils
 import string_utils
 
 
 logger = logging.getLogger(__name__)
 
 
+@decorator_utils.singleton
 class ProfanityFilter(object):
     def __init__(self):
         self.bad_words = set([
@@ -82,6 +84,7 @@ class ProfanityFilter(object):
             'blonde action',
             'blow j',
             'blow job',
+            'blowjob',
             'blow my',
             'blow me',
             'blow ourselv',
@@ -344,6 +347,7 @@ class ProfanityFilter(object):
             'poop chute',
             'poopchute',
             'porn',
+            'pron',
             'pornhub',
             'porno',
             'pornographi',
@@ -468,6 +472,11 @@ class ProfanityFilter(object):
     def _normalize(self, text: str) -> str:
         result = text.lower()
         result = result.replace("_", " ")
+        result = result.replace('0', 'o')
+        result = result.replace('1', 'l')
+        result = result.replace('4', 'a')
+        result = result.replace('5', 's')
+        result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
         chunks = [
@@ -486,14 +495,14 @@ class ProfanityFilter(object):
             for bigram in string_utils.ngrams_presplit(words, 2):
                 bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug('"{bigram}" is profanity')
+                    logger.debug(f'"{bigram}" is profanity')
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
                 trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug('"{trigram}" is profanity')
+                    logger.debug(f'"{trigram}" is profanity')
                     return True
         return False