3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to identify and optionally obscure some bad words."""
14 from nltk.stem import PorterStemmer
16 import decorator_utils
# Module-level logger, named after this module per the stdlib logging convention.
logger = logging.getLogger(__name__)
22 @decorator_utils.singleton
23 class ProfanityFilter(object):
24 """A helper to identify and optionally obscure some bad words."""
181 'double penetration',
290 'missionary position',
466 self.stemmer = PorterStemmer()
    def _normalize(self, text: str) -> str:
        """Normalize a candidate word or phrase for matching against the
        bad-words set: lowercase it, undo common digit-for-letter
        ("leet speak") substitutions, strip punctuation, and reduce each
        token to its Porter stem.

        >>> _normalize('Tittie5')
        'titti'

        >>> _normalize('Suck a Dick!')
        'suck a dick'

        >>> _normalize('fucking a whore')
        'fuck a whore'

        """
        result = text.lower()
        result = result.replace("_", " ")
        # Undo the usual digit-for-letter obfuscations (0=o, 1=l, 4=a, 5=s, 3=e).
        result = result.replace('0', 'o')
        result = result.replace('1', 'l')
        result = result.replace('4', 'a')
        result = result.replace('5', 's')
        result = result.replace('3', 'e')
        # Drop every punctuation character entirely.
        for x in string.punctuation:
            result = result.replace(x, "")
        # Stem each token so inflected forms (e.g. 'fucking') match the
        # stored stem (e.g. 'fuck').
        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
        return ' '.join(chunks)
    def tokenize(text: str):
        """Generator: split *text* into word tokens via NLTK, then split
        each token again on non-word characters.

        NOTE(review): the innermost loop body is not visible in this
        chunk -- presumably it yields each sub-token ``y`` (possibly
        skipping empty strings); confirm against the full file.
        """
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.

        >>> contains_bad_word('fuck you')
        True

        >>> contains_bad_word('FucK u')
        True

        >>> contains_bad_word('FuK U')
        True
        """
        # NOTE(review): the doctest outputs above were reconstructed; the
        # 'FuK U' case is assumed to match via _normalize -- verify against
        # the full file.
        # Check unigrams, then bigrams, then trigrams, because some entries
        # in bad_words are multi-word phrases.
        words = list(self.tokenize(text))
        # NOTE(review): the enclosing 'for word in words:' line and the
        # 'return True' after each logged match (plus the final
        # 'return False') are not visible in this chunk.
        if self.is_bad_word(word):
            logger.debug('"%s" is profanity', word)

        for bigram in string_utils.ngrams_presplit(words, 2):
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug('"%s" is profanity', bigram)

        for trigram in string_utils.ngrams_presplit(words, 3):
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug('"%s" is profanity', trigram)
534 def is_bad_word(self, word: str) -> bool:
535 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.  (Docstring continuation not visible in this
        chunk.)
        """

        def obscure(word: str):
            # Builds an obscured rendering of *word* using randomly chosen
            # punctuation characters.  NOTE(review): most of this helper's
            # body is not visible in this chunk.
            char = random.choice(['#', '%', '!', '@', '&', '*'])

        words = list(self.tokenize(text))
        # Scan with a cursor, preferring the longest (trigram) match first so
        # multi-word phrases are obscured as a unit.
        # NOTE(review): cursor/out initialization, the per-branch cursor
        # advancement, the no-match branch, the tail handling for the last
        # few words, and the final return are not visible in this chunk.
        while cursor < len(words) - 3:
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # Simple CLI driver: treat the command-line arguments as one phrase,
    # report whether it contains profanity, then print an obscured version.
    # NOTE(review): the enclosing 'def main()' line and the guard's body
    # (presumably a call to main()) are not visible in this chunk.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


if __name__ == '__main__':