Make profanity filter catch compound tokens like "foo/bar" where foo and/or bar are bad words

author Scott <[email protected]>

Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)

committer Scott <[email protected]>

Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)
author Scott <[email protected]>
Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)
committer Scott <[email protected]>
Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)
diff --git a/profanity_filter.py b/profanity_filter.py

index db014e1704742c7cab01bc6e7ca1f6ca7f874de5..3109f166af211d0160aeca81ddf72e526ceaf2d3 100755 (executable)
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -2,6 +2,7 @@
  
  import logging
  import random
  
  import logging
  import random
+import re
  import string
  import sys
  
  import string
  import sys
  
@@ -496,6 +497,11 @@ class ProfanityFilter(object):
          ]
          return ' '.join(chunks)
  
          ]
          return ' '.join(chunks)
  
+    def tokenize(self, text: str):
+        """Split text into word-level tokens for bad-word matching.
+
+        Each token produced by nltk.word_tokenize is split again on runs
+        of non-word characters, so a compound such as "foo/bar" is
+        checked as the separate pieces "foo" and "bar".  Pieces are
+        returned unfiltered: a token made up only of punctuation
+        contributes empty strings.
+
+        Returns a list (not a generator) because callers in this class
+        call .append() on the result; plain iteration still works.
+        """
+        pieces = []
+        for token in nltk.word_tokenize(text):
+            # Raw string: '\W' in a plain literal is an invalid escape
+            # sequence and triggers a warning on modern Python.
+            pieces.extend(re.split(r'\W+', token))
+        return pieces
+
      def contains_bad_word(self, text: str) -> bool:
          """Returns True if text contains a bad word (or more than one) 
          and False if no bad words were detected.
      def contains_bad_word(self, text: str) -> bool:
          """Returns True if text contains a bad word (or more than one) 
          and False if no bad words were detected.
@@ -510,7 +516,7 @@ class ProfanityFilter(object):
          False
  
          """
          False
  
          """
-        words = nltk.word_tokenize(text)
+        words = [word for word in self.tokenize(text)]
          for word in words:
              if self.is_bad_word(word):
                  logger.debug(f'"{word}" is profanity')
          for word in words:
              if self.is_bad_word(word):
                  logger.debug(f'"{word}" is profanity')
@@ -557,7 +563,7 @@ class ProfanityFilter(object):
                              break
              return out
  
                              break
              return out
  
-        words = nltk.word_tokenize(text)
+        words = self.tokenize(text)
          words.append('')
          words.append('')
          words.append('')
          words.append('')
          words.append('')
          words.append('')
author	Scott <[email protected]>
	Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)
committer	Scott <[email protected]>
	Mon, 24 Jan 2022 00:13:44 +0000 (16:13 -0800)