3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to identify and optionally obscure some bad words. Not
6 perfect but decent. Uses a fuzzy block list rather than ML."""
15 from nltk.stem import PorterStemmer
17 import decorator_utils
20 logger = logging.getLogger(__name__)
23 @decorator_utils.singleton
24 class ProfanityFilter(object):
25 """A helper to identify and optionally obscure some bad words."""
182 'double penetration',
291 'missionary position',
467 self.stemmer = PorterStemmer()
469 def _normalize(self, text: str) -> str:
472 >>> _normalize('Tittie5')
475 >>> _normalize('Suck a Dick!')
478 >>> _normalize('fucking a whore')
481 >>> _normalize('pu55y')
485 result = text.lower()
486 result = result.replace("_", " ")
487 result = result.replace('0', 'o')
488 result = result.replace('1', 'l')
489 result = result.replace('4', 'a')
490 result = result.replace('5', 's')
491 result = result.replace('3', 'e')
492 for x in string.punctuation:
493 result = result.replace(x, "")
494 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
495 return ' '.join(chunks)
498 def tokenize(text: str):
499 """Tokenize text into word-like chunks"""
500 for x in nltk.word_tokenize(text):
501 for y in re.split(r'\W+', x):
504 def contains_bad_word(self, text: str) -> bool:
505 """Returns True if text contains a bad word (or more than one)
506 and False if no bad words were detected.
508 >>> contains_bad_word('fuck you')
511 >>> contains_bad_word('FucK u')
514 >>> contains_bad_word('FuK U')
518 words = list(self.tokenize(text))
520 if self.is_bad_word(word):
521 logger.debug('"%s" is profanity', word)
525 for bigram in string_utils.ngrams_presplit(words, 2):
526 bigram = ' '.join(bigram)
527 if self.is_bad_word(bigram):
528 logger.debug('"%s" is profanity', bigram)
532 for trigram in string_utils.ngrams_presplit(words, 3):
533 trigram = ' '.join(trigram)
534 if self.is_bad_word(trigram):
535 logger.debug('"%s" is profanity', trigram)
539 def is_bad_word(self, word: str) -> bool:
540 """True if we think word is a bad word."""
541 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.

        NOTE(review): several lines of this method are elided in this
        excerpt (initialization of ``out``/``cursor``/``word``, most of
        ``obscure``'s body, and the handling of the final words after the
        scan loop).  Comments below describe only the visible code.
        """

        def obscure(word: str):
            # Builds an obscured copy of `word`; a random punctuation
            # character is chosen for substitution.  Only part of this
            # helper is visible here.
            char = random.choice(['#', '%', '!', '@', '&', '*'])

        # Use the same token stream as detection so matches line up.
        words = list(self.tokenize(text))
        # Greedy scan: prefer obscuring a matching 3-word phrase, then a
        # 2-word phrase, then a single word, while walking a cursor
        # through the token list.
        while cursor < len(words) - 3:
            # presumably `word = words[cursor]` occurs here (elided) — verify
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
if __name__ == '__main__':
    # Simple CLI smoke test: treat all command-line arguments as one
    # phrase, report whether it contains profanity, then print an
    # obscured version of it.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))