import logging
import string
import sys

import nltk
from nltk.stem import PorterStemmer

import decorator_utils
import string_utils
logger = logging.getLogger(__name__)
+@decorator_utils.singleton
class ProfanityFilter(object):
def __init__(self):
self.bad_words = set([
'blonde action',
'blow j',
'blow job',
+ 'blowjob',
'blow my',
'blow me',
'blow ourselv',
'poop chute',
'poopchute',
'porn',
+ 'pron',
'pornhub',
'porno',
'pornographi',
self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
+ """Normalize text.
+
+ >>> _normalize('Tittie5')
+ 'titties'
+
+ >>> _normalize('Suck a Dick!')
+ 'suck a dick'
+
+ >>> _normalize('fucking a whore')
+ 'fuck a whore'
+
+ """
result = text.lower()
result = result.replace("_", " ")
+ result = result.replace('0', 'o')
+ result = result.replace('1', 'l')
+ result = result.replace('4', 'a')
+ result = result.replace('5', 's')
+ result = result.replace('3', 'e')
for x in string.punctuation:
result = result.replace(x, "")
chunks = [
return ' '.join(chunks)
def contains_bad_word(self, text: str) -> bool:
+ """Returns True if text contains a bad word (or more than one)
+ and False if no bad words were detected.
+
+ >>> contains_bad_word('fuck you')
+ True
+
+ >>> contains_bad_word('FucK u')
+ True
+
+ >>> contains_bad_word('FuK U')
+ False
+
+ """
words = nltk.word_tokenize(text)
for word in words:
if self.is_bad_word(word):
if len(words) > 1:
for bigram in string_utils.ngrams_presplit(words, 2):
+ bigram = ' '.join(bigram)
if self.is_bad_word(bigram):
- logger.debug('"{bigram}" is profanity')
+ logger.debug(f'"{bigram}" is profanity')
return True
if len(words) > 2:
for trigram in string_utils.ngrams_presplit(words, 3):
+ trigram = ' '.join(trigram)
if self.is_bad_word(trigram):
- logger.debug('"{trigram}" is profanity')
+ logger.debug(f'"{trigram}" is profanity')
return True
return False
)
def obscure_bad_words(self, text: str) -> str:
+ """Obscure bad words that are detected by inserting random punctuation
+ characters.
+ """
def obscure(word: str):
out = ''
last = ''
def main() -> None:
    """Run the module doctests, then print whether the phrase formed by
    joining the command-line arguments contains profanity."""
    import doctest

    doctest.testmod()
    text = ' '.join(sys.argv[1:])
    print(ProfanityFilter().contains_bad_word(text))