9 from nltk.stem import PorterStemmer
11 import decorator_utils
15 logger = logging.getLogger(__name__)
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
21 self.bad_words = set([
134 'chocolate rosebuds',
189 'double penetration',
293 'missionary position',
469 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Normalize text for blocklist lookup.

    Lowercases, converts underscores to spaces, strips punctuation, and
    reduces every token to its Porter stem.

    Args:
        text: the raw word or phrase to normalize.

    Returns:
        The space-joined stemmed tokens, suitable for membership tests
        against self.bad_words.
    """
    result = text.lower()
    # Underscores become spaces *before* punctuation stripping so that
    # "foo_bar" normalizes as two tokens rather than collapsing to "foobar".
    result = result.replace("_", " ")
    # One C-level translate pass instead of len(string.punctuation)
    # chained .replace() calls; deletes the same set of characters.
    result = result.translate(str.maketrans('', '', string.punctuation))
    chunks = [
        self.stemmer.stem(word) for word in nltk.word_tokenize(result)
    ]
    return ' '.join(chunks)
def contains_bad_word(self, text: str) -> bool:
    """Return True iff text contains a profane word, bigram, or trigram.

    Checks single tokens first, then two- and three-word phrases, so
    multi-word entries in the blocklist are also caught.
    """
    words = nltk.word_tokenize(text)
    for word in words:
        if self.is_bad_word(word):
            logger.debug(f'"{word}" is profanity')
            return True
    # Same scan order as checking bigrams, then trigrams, separately.
    for n in (2, 3):
        for gram in string_utils.ngrams_presplit(words, n):
            phrase = ' '.join(gram)
            if self.is_bad_word(phrase):
                logger.debug(f'"{phrase}" is profanity')
                return True
    return False
def is_bad_word(self, word: str) -> bool:
    """Return True iff word (raw or in normalized/stemmed form) is in
    the blocklist.

    Short-circuits on the raw lookup, so _normalize only runs when the
    literal form is not already a known bad word.
    """
    return word in self.bad_words or self._normalize(word) in self.bad_words
def obscure_bad_words(self, text: str) -> str:
    """Return text with detected profane words/phrases masked by symbols."""

    def obscure(word: str):
        # Picks a random masking symbol; presumably one per character of
        # the word being obscured — TODO confirm against full body.
        char = random.choice(['#', '%', '!', '@', '&', '*'])

    words = nltk.word_tokenize(text)
    # Cursor-based scan so that two- and three-word phrases can be
    # masked as a single unit rather than token-by-token.
    # NOTE(review): the `- 3` bound stops three tokens short of the end,
    # so the last few words are presumably handled after the loop —
    # verify trailing bad words are not silently passed through.
    while cursor < len(words) - 3:
        bigram = word + ' ' + words[cursor + 1]
        trigram = bigram + ' ' + words[cursor + 2]
        # Prefer the longest profane match: trigram > bigram > unigram.
        if self.is_bad_word(trigram):
            out += obscure(trigram) + ' '
        elif self.is_bad_word(bigram):
            out += obscure(bigram) + ' '
        elif self.is_bad_word(word):
            out += obscure(word) + ' '
# CLI driver: treat all command-line arguments as one phrase and report
# whether it contains profanity, then print the masked version.
pf = ProfanityFilter()  # class is decorated as a singleton; calls share one instance
phrase = ' '.join(sys.argv[1:])
print(pf.contains_bad_word(phrase))
print(pf.obscure_bad_words(phrase))
560 if __name__ == '__main__':