9 from nltk.stem import PorterStemmer
11 import decorator_utils
15 logger = logging.getLogger(__name__)
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
# Detects profane words and phrases in text and can mask them.
# NOTE(review): the singleton decorator presumably makes every
# ProfanityFilter() call hand back one shared instance -- confirm in
# decorator_utils.
#
# bad_words holds lowercase words and multi-word phrases. is_bad_word
# tests membership with both the raw input and its _normalize()d form.
# NOTE(review): _normalize() returns Porter stems, so an entry can only
# match normalized input if it is stored in its stemmed form -- confirm
# the list accounts for this.
21 self.bad_words = set([
134 'chocolate rosebuds',
189 'double penetration',
293 'missionary position',
# Stemmer applied to every token by _normalize().
470 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Return a canonical, stemmed form of ``text`` for bad-word lookup.

    The text is lowercased, underscores become spaces, the digit
    look-alikes ``0 1 4 5 3`` are mapped to ``o l a s e``, and every
    other ASCII punctuation character is removed.  The cleaned text is
    split with ``nltk.word_tokenize``, each token is stemmed with
    ``self.stemmer``, and the stems are joined with single spaces.

    Args:
        text: The word or phrase to normalize.

    Returns:
        The space-joined stems of the cleaned text.
    """
    # One translate() pass replaces the chain of str.replace() calls.
    # '_' is left out of the deletion set because it must map to a
    # space; maketrans() would otherwise let the deletion win.
    table = str.maketrans(
        '_01453',
        ' olase',
        string.punctuation.replace('_', ''),
    )
    cleaned = text.lower().translate(table)
    return ' '.join(
        self.stemmer.stem(token) for token in nltk.word_tokenize(cleaned)
    )
def contains_bad_word(self, text: str) -> bool:
    """Report whether ``text`` contains profanity.

    The text is split with ``nltk.word_tokenize``.  Single tokens are
    checked first, then the 2-grams and finally the 3-grams produced by
    ``string_utils.ngrams_presplit``, each joined with single spaces
    before being passed to ``is_bad_word``.  The first hit is logged at
    debug level and ends the search.

    Args:
        text: The text to scan.

    Returns:
        True as soon as a token or phrase is judged profane, otherwise
        False.
    """
    tokens = nltk.word_tokenize(text)

    for token in tokens:
        if self.is_bad_word(token):
            logger.debug(f'"{token}" is profanity')
            return True

    # Multi-word phrases: pairs first, then triples.
    for size in (2, 3):
        if len(tokens) < size:
            continue
        for gram in string_utils.ngrams_presplit(tokens, size):
            phrase = ' '.join(gram)
            if self.is_bad_word(phrase):
                logger.debug(f'"{phrase}" is profanity')
                return True

    return False
def is_bad_word(self, word: str) -> bool:
    """Return True if ``word`` is a known bad word or phrase.

    The input is looked up in ``self.bad_words`` as given; only when
    that fails is it normalized with ``self._normalize`` and looked up
    again.

    Args:
        word: A single word or a space-joined phrase.

    Returns:
        True when either the raw or the normalized form is in
        ``self.bad_words``.
    """
    known = self.bad_words
    # Exact hit: skip the tokenize-and-stem work entirely.
    if word in known:
        return True
    return self._normalize(word) in known
515 def obscure_bad_words(self, text: str) -> str:
# Builds a copy of text in which profane words and phrases are replaced
# by the output of the nested obscure() helper.
# NOTE(review): the return statement is not visible in this excerpt;
# presumably the accumulated `out` string is returned -- confirm.
517 def obscure(word: str):
# Masking helper: draws a random character from this fixed symbol set.
# NOTE(review): how the drawn characters are assembled into the mask is
# not visible here.
525 char = random.choice(['#', '%', '!', '@', '&', '*'])
# Tokenize once; the loop below walks the tokens with `cursor`.
532 words = nltk.word_tokenize(text)
# The bound len(words) - 3 together with the cursor + 1 / cursor + 2
# lookups suggests `words` is padded before this loop.
# NOTE(review): the padding and the cursor updates are not visible in
# this excerpt -- confirm.
539 while cursor < len(words) - 3:
# Phrases starting at the current token, joined with single spaces.
541 bigram = word + ' ' + words[cursor + 1]
542 trigram = bigram + ' ' + words[cursor + 2]
# Longest match wins: the three-word phrase is tried first, then the
# two-word phrase, then the single word.  Each masked piece is followed
# by one space.
543 if self.is_bad_word(trigram):
544 out += obscure(trigram) + ' '
546 elif self.is_bad_word(bigram):
547 out += obscure(bigram) + ' '
549 elif self.is_bad_word(word):
550 out += obscure(word) + ' '
# Command-line driver: filter the phrase given on the command line.
559 pf = ProfanityFilter()
# Treat all command-line arguments as one space-separated phrase.
560 phrase = ' '.join(sys.argv[1:])
# Print whether the phrase contains profanity, then print the phrase
# with the profane parts masked.
561 print(pf.contains_bad_word(phrase))
562 print(pf.obscure_bad_words(phrase))
# Script entry-point guard; the guarded statement is outside this
# excerpt.
566 if __name__ == '__main__':