9 from nltk.stem import PorterStemmer
14 logger = logging.getLogger(__name__)
class ProfanityFilter(object):
    """Detects and masks profane words and phrases in free text.

    Matching is done against a large hard-coded set of words and
    multi-word phrases (``bad_words``); because the set contains
    multi-word entries, callers must also test bigrams/trigrams,
    not just single tokens.
    """

        # Hard-coded profanity list.  NOTE: many entries are multi-word
        # phrases, which is why the match methods below walk n-grams.
        self.bad_words = set([
            'chocolate rosebuds',
            'double penetration',
            'missionary position',

        # Porter stemmer used by _normalize() so inflected/plural forms
        # still match the canonical entries in bad_words.
        self.stemmer = PorterStemmer()
468 def _normalize(self, text: str) -> str:
469 result = text.lower()
470 result = result.replace("_", " ")
471 for x in string.punctuation:
472 result = result.replace(x, "")
474 self.stemmer.stem(word) for word in nltk.word_tokenize(result)
476 return ' '.join(chunks)
478 def contains_bad_word(self, text: str) -> bool:
479 words = nltk.word_tokenize(text)
481 if self.is_bad_word(word):
482 logger.debug(f'"{word}" is profanity')
486 for bigram in string_utils.ngrams_presplit(words, 2):
487 if self.is_bad_word(bigram):
488 logger.debug('"{bigram}" is profanity')
492 for trigram in string_utils.ngrams_presplit(words, 3):
493 if self.is_bad_word(trigram):
494 logger.debug('"{trigram}" is profanity')
498 def is_bad_word(self, word: str) -> bool:
500 word in self.bad_words or
501 self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Return *text* with profane unigrams/bigrams/trigrams masked.

        Walks the token list with a cursor, checking the longest n-gram
        first (trigram, then bigram, then the single word) so that a
        multi-word phrase is masked as a unit rather than word-by-word.
        """

        def obscure(word: str):
            # Builds the masked replacement for *word*; characters are
            # swapped for randomly chosen symbol characters.
                char = random.choice(['#', '%', '!', '@', '&', '*'])

        words = nltk.word_tokenize(text)

        # NOTE(review): the bound stops 3 tokens short of the end —
        # presumably the trailing tokens are handled after this loop;
        # confirm none are silently dropped.
        while cursor < len(words) - 3:
            # Longest-match-first: trigram beats bigram beats unigram.
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # CLI smoke test: treat the command-line arguments as one phrase,
    # report whether it contains profanity, then print the masked form.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


if __name__ == '__main__':