9 from nltk.stem import PorterStemmer
11 import decorator_utils
15 logger = logging.getLogger(__name__)
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
21 self.bad_words = set([
134 'chocolate rosebuds',
189 'double penetration',
293 'missionary position',
470 self.stemmer = PorterStemmer()
472 def _normalize(self, text: str) -> str:
475 >>> _normalize('Tittie5')
478 >>> _normalize('Suck a Dick!')
481 >>> _normalize('fucking a whore')
485 result = text.lower()
486 result = result.replace("_", " ")
487 result = result.replace('0', 'o')
488 result = result.replace('1', 'l')
489 result = result.replace('4', 'a')
490 result = result.replace('5', 's')
491 result = result.replace('3', 'e')
492 for x in string.punctuation:
493 result = result.replace(x, "")
495 self.stemmer.stem(word) for word in nltk.word_tokenize(result)
497 return ' '.join(chunks)
499 def contains_bad_word(self, text: str) -> bool:
500 """Returns True if text contains a bad word (or more than one)
501 and False if no bad words were detected.
503 >>> contains_bad_word('fuck you')
506 >>> contains_bad_word('FucK u')
509 >>> contains_bad_word('FuK U')
513 words = nltk.word_tokenize(text)
515 if self.is_bad_word(word):
516 logger.debug(f'"{word}" is profanity')
520 for bigram in string_utils.ngrams_presplit(words, 2):
521 bigram = ' '.join(bigram)
522 if self.is_bad_word(bigram):
523 logger.debug(f'"{bigram}" is profanity')
527 for trigram in string_utils.ngrams_presplit(words, 3):
528 trigram = ' '.join(trigram)
529 if self.is_bad_word(trigram):
530 logger.debug(f'"{trigram}" is profanity')
534 def is_bad_word(self, word: str) -> bool:
536 word in self.bad_words or
537 self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        def obscure(word: str):
                # Replace each letter with a randomly chosen masking
                # character so the profanity is unreadable but its
                # rough length is preserved.
                char = random.choice(['#', '%', '!', '@', '&', '*'])
        # Tokenize, then scan left-to-right with a cursor, preferring
        # the longest profane n-gram at each position.
        words = nltk.word_tokenize(text)
        # NOTE(review): the bound `len(words) - 3` suggests sentinel
        # padding is appended to `words` before this loop — confirm
        # against the full source.
        while cursor < len(words) - 3:
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            # Longest match wins: a profane trigram takes priority
            # over its component bigram, which beats the single word.
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
589 pf = ProfanityFilter()
590 phrase = ' '.join(sys.argv[1:])
591 print(pf.contains_bad_word(phrase))
592 print(pf.obscure_bad_words(phrase))
596 if __name__ == '__main__':