import logging
import random
+import re
import string
import sys
self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
+ """Normalize text.
+
+ >>> _normalize('Tittie5')
+ 'titties'
+
+ >>> _normalize('Suck a Dick!')
+ 'suck a dick'
+
+ >>> _normalize('fucking a whore')
+ 'fuck a whore'
+
+ """
result = text.lower()
result = result.replace("_", " ")
result = result.replace('0', 'o')
]
return ' '.join(chunks)
+ def tokenize(self, text: str):
+ for x in nltk.word_tokenize(text):
+ for y in re.split('\W+', x):
+ yield y
+
def contains_bad_word(self, text: str) -> bool:
- words = nltk.word_tokenize(text)
+ """Returns True if text contains a bad word (or more than one)
+ and False if no bad words were detected.
+
+ >>> contains_bad_word('fuck you')
+ True
+
+ >>> contains_bad_word('FucK u')
+ True
+
+ >>> contains_bad_word('FuK U')
+ False
+
+ """
+ words = [word for word in self.tokenize(text)]
for word in words:
if self.is_bad_word(word):
logger.debug(f'"{word}" is profanity')
)
def obscure_bad_words(self, text: str) -> str:
+ """Obscure bad words that are detected by inserting random punctuation
+ characters.
+ """
def obscure(word: str):
out = ''
last = ''
break
return out
- words = nltk.word_tokenize(text)
+ words = self.tokenize(text)
words.append('')
words.append('')
words.append('')
def main() -> None:
    """Command-line entry point.

    Runs this module's doctests, then prints whether the phrase formed by
    joining the command-line arguments contains profanity.
    """
    import doctest
    doctest.testmod()
    profanity_filter = ProfanityFilter()
    cli_phrase = ' '.join(sys.argv[1:])
    verdict = profanity_filter.contains_bad_word(cli_phrase)
    print(verdict)