3 """A helper to identify and optionally obscure some bad words."""
12 from nltk.stem import PorterStemmer
14 import decorator_utils
17 logger = logging.getLogger(__name__)
20 @decorator_utils.singleton
21 class ProfanityFilter(object):
22 """A helper to identify and optionally obscure some bad words."""
139 'chocolate rosebuds',
194 'double penetration',
301 'missionary position',
479 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Reduce text to a canonical form for comparison with the bad word list.

    The text is lowercased, underscores are turned into spaces, the digits
    0, 1, 3, 4 and 5 are swapped for the letters o, l, e, a and s (so the
    characters of 'h3ll0' become 'hello'), and every other punctuation
    character is dropped.  The cleaned text is then split into tokens with
    nltk.word_tokenize, each token is run through self.stemmer, and the
    stems are returned joined by single spaces.
    """
    # Begin with "delete every punctuation character", then override the
    # characters that are substituted rather than removed.  A single
    # translate pass matches the chained replacements because none of the
    # substituted outputs is itself a key of the table.
    table = dict.fromkeys(string.punctuation)
    table.update({"_": " ", '0': 'o', '1': 'l', '4': 'a', '5': 's', '3': 'e'})
    cleaned = text.lower().translate(str.maketrans(table))

    stems = []
    for token in nltk.word_tokenize(cleaned):
        stems.append(self.stemmer.stem(token))
    return ' '.join(stems)
507 def tokenize(text: str):
508 for x in nltk.word_tokenize(text):
509 for y in re.split(r'\W+', x):
512 def contains_bad_word(self, text: str) -> bool:
513 """Returns True if text contains a bad word (or more than one)
514 and False if no bad words were detected.
516 >>> contains_bad_word('fuck you')
519 >>> contains_bad_word('FucK u')
522 >>> contains_bad_word('FuK U')
526 words = list(self.tokenize(text))
528 if self.is_bad_word(word):
529 logger.debug('"%s" is profanity', word)
533 for bigram in string_utils.ngrams_presplit(words, 2):
534 bigram = ' '.join(bigram)
535 if self.is_bad_word(bigram):
536 logger.debug('"%s" is profanity', bigram)
540 for trigram in string_utils.ngrams_presplit(words, 3):
541 trigram = ' '.join(trigram)
542 if self.is_bad_word(trigram):
543 logger.debug('"%s" is profanity', trigram)
def is_bad_word(self, word: str) -> bool:
    """Returns True if word is in self.bad_words, either exactly as given
    or after being passed through self._normalize, and False otherwise.
    """
    # Exact hit: answer without paying for normalization.
    if word in self.bad_words:
        return True
    normalized = self._normalize(word)
    return normalized in self.bad_words
550 def obscure_bad_words(self, text: str) -> str:
551 """Obscure bad words that are detected by inserting random punctuation
556 def obscure(word: str):
564 char = random.choice(['#', '%', '!', '@', '&', '*'])
571 words = list(self.tokenize(text))
578 while cursor < len(words) - 3:
580 bigram = word + ' ' + words[cursor + 1]
581 trigram = bigram + ' ' + words[cursor + 2]
582 if self.is_bad_word(trigram):
583 out += obscure(trigram) + ' '
585 elif self.is_bad_word(bigram):
586 out += obscure(bigram) + ' '
588 elif self.is_bad_word(word):
589 out += obscure(word) + ' '
601 pf = ProfanityFilter()
602 phrase = ' '.join(sys.argv[1:])
603 print(pf.contains_bad_word(phrase))
604 print(pf.obscure_bad_words(phrase))
608 if __name__ == '__main__':