3 """A helper to identify and optionally obscure some bad words."""
12 from nltk.stem import PorterStemmer
14 import decorator_utils
17 logger = logging.getLogger(__name__)
20 @decorator_utils.singleton
21 class ProfanityFilter(object):
22 """A helper to identify and optionally obscure some bad words."""
139 'chocolate rosebuds',
194 'double penetration',
305 'missionary position',
483 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Canonicalize text so that disguised spellings compare equal.

    The text is lowercased, underscores become spaces, the digits
    0/1/3/4/5 are mapped to the letters o/l/e/a/s, and every other
    ``string.punctuation`` character is dropped.  What remains is
    tokenized with ``nltk.word_tokenize``, each token is passed through
    ``self.stemmer.stem``, and the stems are rejoined with single spaces.

    Representative inputs: 'Tittie5', 'Suck a Dick!', 'fucking a whore'.
    """
    # Register all punctuation for deletion first, then let the explicit
    # substitutions (including '_' -> ' ') override those entries.
    table = {mark: None for mark in string.punctuation}
    table.update({"_": " ", '0': 'o', '1': 'l', '4': 'a', '5': 's', '3': 'e'})
    cleaned = text.lower().translate(str.maketrans(table))

    stems = []
    for token in nltk.word_tokenize(cleaned):
        stems.append(self.stemmer.stem(token))
    return ' '.join(stems)
511 def tokenize(text: str):
512 for x in nltk.word_tokenize(text):
513 for y in re.split(r'\W+', x):
516 def contains_bad_word(self, text: str) -> bool:
517 """Returns True if text contains a bad word (or more than one)
518 and False if no bad words were detected.
520 >>> contains_bad_word('fuck you')
523 >>> contains_bad_word('FucK u')
526 >>> contains_bad_word('FuK U')
530 words = list(self.tokenize(text))
532 if self.is_bad_word(word):
533 logger.debug('"%s" is profanity', word)
537 for bigram in string_utils.ngrams_presplit(words, 2):
538 bigram = ' '.join(bigram)
539 if self.is_bad_word(bigram):
540 logger.debug('"%s" is profanity', bigram)
544 for trigram in string_utils.ngrams_presplit(words, 3):
545 trigram = ' '.join(trigram)
546 if self.is_bad_word(trigram):
547 logger.debug('"%s" is profanity', trigram)
def is_bad_word(self, word: str) -> bool:
    """Report whether *word* appears in ``self.bad_words``.

    The string is looked up exactly as given first; only if that misses
    is ``self._normalize(word)`` computed and looked up instead.
    """
    if word in self.bad_words:
        return True
    normalized = self._normalize(word)
    return normalized in self.bad_words
554 def obscure_bad_words(self, text: str) -> str:
555 """Obscure bad words that are detected by inserting random punctuation
560 def obscure(word: str):
568 char = random.choice(['#', '%', '!', '@', '&', '*'])
575 words = list(self.tokenize(text))
582 while cursor < len(words) - 3:
584 bigram = word + ' ' + words[cursor + 1]
585 trigram = bigram + ' ' + words[cursor + 2]
586 if self.is_bad_word(trigram):
587 out += obscure(trigram) + ' '
589 elif self.is_bad_word(bigram):
590 out += obscure(bigram) + ' '
592 elif self.is_bad_word(word):
593 out += obscure(word) + ' '
605 pf = ProfanityFilter()
606 phrase = ' '.join(sys.argv[1:])
607 print(pf.contains_bad_word(phrase))
608 print(pf.obscure_bad_words(phrase))
612 if __name__ == '__main__':