3 """A helper to identify and optionally obscure some bad words."""
12 from nltk.stem import PorterStemmer
14 import decorator_utils
17 logger = logging.getLogger(__name__)
# Singleton: the (presumably large) bad-word list and the stemmer are
# built once and shared process-wide by every caller.
@decorator_utils.singleton
class ProfanityFilter(object):
    """A helper to identify and optionally obscure some bad words."""
179 'double penetration',
288 'missionary position',
464 self.stemmer = PorterStemmer()
466 def _normalize(self, text: str) -> str:
469 >>> _normalize('Tittie5')
472 >>> _normalize('Suck a Dick!')
475 >>> _normalize('fucking a whore')
479 result = text.lower()
480 result = result.replace("_", " ")
481 result = result.replace('0', 'o')
482 result = result.replace('1', 'l')
483 result = result.replace('4', 'a')
484 result = result.replace('5', 's')
485 result = result.replace('3', 'e')
486 for x in string.punctuation:
487 result = result.replace(x, "")
488 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
489 return ' '.join(chunks)
    # Generator: break *text* into NLTK word tokens, then further split
    # each token on runs of non-word characters.
    # NOTE(review): the innermost loop body (presumably a `yield`) is not
    # visible in this view of the file — confirm against the full source.
    def tokenize(text: str):
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.
        >>> contains_bad_word('fuck you')
        >>> contains_bad_word('FucK u')
        >>> contains_bad_word('FuK U')
        words = list(self.tokenize(text))  # tokenize once; reused for n-gram scans below
        if self.is_bad_word(word):  # single-word check (enclosing loop not visible in this view)
            logger.debug('"%s" is profanity', word)
        for bigram in string_utils.ngrams_presplit(words, 2):  # adjacent word pairs, so two-word phrases in the list can match
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug('"%s" is profanity', bigram)
        for trigram in string_utils.ngrams_presplit(words, 3):  # word triples, for three-word phrases
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug('"%s" is profanity', trigram)
532 def is_bad_word(self, word: str) -> bool:
533 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        def obscure(word: str):  # helper: mask a matched word/phrase (full body not visible here)
            char = random.choice(['#', '%', '!', '@', '&', '*'])  # random masking glyph
        words = list(self.tokenize(text))  # same tokenization as contains_bad_word
        while cursor < len(words) - 3:  # NOTE(review): this bound stops 3 words early; confirm the (unseen) code after the loop handles the tail
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):  # prefer the longest match at each position
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # Simple CLI driver: treat all command-line args as one phrase, report
    # whether it contains profanity, then print an obscured version.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


if __name__ == '__main__':