9 from nltk.stem import PorterStemmer
14 logger = logging.getLogger(__name__)
class ProfanityFilter(object):
    """Detects and obscures profane words and phrases in text.

    Holds a set of known-bad unigrams/bigrams/trigrams (``bad_words``)
    and a Porter stemmer used by ``_normalize`` so that lookups match
    both the verbatim and the stemmed/normalized form of input text.

    NOTE(review): this excerpt elides the ``def __init__(self)`` header
    and the vast majority of the bad-word list — only a few sample
    entries are visible below. Confirm against the full source.
    """

    # --- __init__ body (method header elided from this excerpt) ---
    self.bad_words = set([
        # Sample entries only; hundreds of list lines are elided here.
        'chocolate rosebuds',
        'double penetration',
        'missionary position',
    # (closing bracket of the list elided from this excerpt)
    self.stemmer = PorterStemmer()  # used by _normalize() for stemming
def _normalize(self, text: str) -> str:
    """Canonicalize *text* for bad-word lookup.

    Lowercases, maps underscores to spaces, strips all other
    punctuation, then Porter-stems every token.

    Args:
        text: the raw input string.

    Returns:
        The space-joined sequence of stemmed tokens.
    """
    result = text.lower()
    # Map '_' -> ' ' and delete the remaining punctuation in one
    # C-level pass via str.translate, instead of a chained .replace()
    # per punctuation character.  ('_' must be excluded from the delete
    # set or the mapping entry would be overridden.)
    table = str.maketrans('_', ' ', string.punctuation.replace('_', ''))
    result = result.translate(table)
    stemmed = [self.stemmer.stem(token) for token in nltk.word_tokenize(result)]
    return ' '.join(stemmed)
def contains_bad_word(self, text: str) -> bool:
    """Return True if *text* contains a known bad word or phrase.

    Checks every unigram, bigram, and trigram of the tokenized text
    against the bad-word set via :meth:`is_bad_word` (which tests both
    the verbatim and the normalized/stemmed form).

    Args:
        text: the text to scan.

    Returns:
        True as soon as any profane n-gram is found, else False.
    """
    words = nltk.word_tokenize(text)

    # Single words first.
    for word in words:
        if self.is_bad_word(word):
            logger.debug(f'"{word}" is profanity')
            return True

    # Two-word phrases.
    for bigram in string_utils.ngrams_presplit(words, 2):
        bigram = ' '.join(bigram)
        if self.is_bad_word(bigram):
            # Bug fix: this message was a plain string literal, so
            # "{bigram}" was logged verbatim; it is now an f-string
            # like the unigram case above.
            logger.debug(f'"{bigram}" is profanity')
            return True

    # Three-word phrases.
    for trigram in string_utils.ngrams_presplit(words, 3):
        trigram = ' '.join(trigram)
        if self.is_bad_word(trigram):
            # Bug fix: missing f-prefix, same as the bigram case.
            logger.debug(f'"{trigram}" is profanity')
            return True

    return False
def is_bad_word(self, word: str) -> bool:
    """Return True if *word* is in the bad-word set.

    The word is checked verbatim first (cheap set lookup), and only
    normalized/stemmed via :meth:`_normalize` if that misses.

    Args:
        word: a single word or a space-joined n-gram.

    Returns:
        True if either form is a known bad word.
    """
    return (
        word in self.bad_words
        or self._normalize(word) in self.bad_words
    )
def obscure_bad_words(self, text: str) -> str:
    """Return *text* with profane n-grams replaced by symbol characters.

    NOTE(review): large parts of this method are elided from this
    excerpt (the body of ``obscure``, the cursor initialization and
    advancement, and the post-loop tail handling) — the comments below
    describe only what is visible.
    """

    def obscure(word: str):
        # Builds the replacement by drawing random symbol characters;
        # presumably one per character of `word` — TODO confirm, the
        # surrounding loop body is elided.
        char = random.choice(['#', '%', '!', '@', '&', '*'])

    words = nltk.word_tokenize(text)
    # Scan with a cursor so longer phrases win: trigram match is tried
    # before bigram, bigram before unigram.
    # NOTE(review): the `len(words) - 3` bound never examines the last
    # three tokens inside this loop — presumably the elided lines after
    # the loop handle the tail; verify against the full source.
    while cursor < len(words) - 3:
        bigram = word + ' ' + words[cursor + 1]
        trigram = bigram + ' ' + words[cursor + 2]
        if self.is_bad_word(trigram):
            out += obscure(trigram) + ' '
        elif self.is_bad_word(bigram):
            out += obscure(bigram) + ' '
        elif self.is_bad_word(word):
            out += obscure(word) + ' '
# Simple CLI: treat all command-line arguments as one phrase, report
# whether it contains profanity, then print the obscured version.
# Fix: as excerpted, these statements ran at import time and the
# __main__ guard had no suite; the work now lives under the guard.
if __name__ == '__main__':
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))