X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=db014e1704742c7cab01bc6e7ca1f6ca7f874de5;hb=36fe954a689c26e7082c61c1c8dbbf76dd7cf6c8;hp=e1b474323fb67823a0c1607e69df08ec41b5398d;hpb=b843703134a166013518c707fa5a77373f1bf0bf;p=python_utils.git

diff --git a/profanity_filter.py b/profanity_filter.py
index e1b4743..db014e1 100755
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -8,12 +8,14 @@ import sys
 import nltk
 from nltk.stem import PorterStemmer
 
+import decorator_utils
 import string_utils
 
 
 logger = logging.getLogger(__name__)
 
 
+@decorator_utils.singleton
 class ProfanityFilter(object):
     def __init__(self):
         self.bad_words = set([
@@ -82,6 +84,7 @@ class ProfanityFilter(object):
             'blonde action',
             'blow j',
             'blow job',
+            'blowjob',
             'blow my',
             'blow me',
             'blow ourselv',
@@ -344,6 +347,7 @@ class ProfanityFilter(object):
             'poop chute',
             'poopchute',
             'porn',
+            'pron',
             'pornhub',
             'porno',
             'pornographi',
@@ -466,8 +470,25 @@ class ProfanityFilter(object):
         self.stemmer = PorterStemmer()
 
     def _normalize(self, text: str) -> str:
+        """Normalize text.
+
+        >>> ProfanityFilter()._normalize('Tittie5')
+        'titties'
+
+        >>> ProfanityFilter()._normalize('Suck a Dick!')
+        'suck a dick'
+
+        >>> ProfanityFilter()._normalize('fucking a whore')
+        'fuck a whore'
+
+        """
         result = text.lower()
         result = result.replace("_", " ")
+        result = result.replace('0', 'o')
+        result = result.replace('1', 'l')
+        result = result.replace('4', 'a')
+        result = result.replace('5', 's')
+        result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
         chunks = [
@@ -476,6 +497,19 @@ class ProfanityFilter(object):
         return ' '.join(chunks)
 
     def contains_bad_word(self, text: str) -> bool:
+        """Returns True if text contains a bad word (or more than one)
+        and False if no bad words were detected.
+
+        >>> ProfanityFilter().contains_bad_word('fuck you')
+        True
+
+        >>> ProfanityFilter().contains_bad_word('FucK u')
+        True
+
+        >>> ProfanityFilter().contains_bad_word('FuK U')
+        False
+
+        """
         words = nltk.word_tokenize(text)
         for word in words:
             if self.is_bad_word(word):
@@ -484,14 +518,16 @@ class ProfanityFilter(object):
 
         if len(words) > 1:
             for bigram in string_utils.ngrams_presplit(words, 2):
+                bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug('"{bigram}" is profanity')
+                    logger.debug(f'"{bigram}" is profanity')
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
+                trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug('"{trigram}" is profanity')
+                    logger.debug(f'"{trigram}" is profanity')
                     return True
         return False
 
@@ -502,7 +538,10 @@ class ProfanityFilter(object):
         )
 
     def obscure_bad_words(self, text: str) -> str:
+        """Obscure any bad words detected in text by inserting random
+        punctuation characters.
 
+        """
         def obscure(word: str):
             out = ''
             last = ''
@@ -545,6 +584,8 @@ class ProfanityFilter(object):
 
 
 def main() -> None:
+    import doctest
+    doctest.testmod()
     pf = ProfanityFilter()
     phrase = ' '.join(sys.argv[1:])
     print(pf.contains_bad_word(phrase))