10 from nltk.stem import PorterStemmer
12 import decorator_utils
# Module-level logger keyed to this module's import path.
logger = logging.getLogger(__name__)


# Singleton via project decorator -- presumably because the bad-word set and
# stemmer are expensive to build.  NOTE(review): confirm against decorator_utils.
@decorator_utils.singleton
class ProfanityFilter(object):
    # Detects and obscures profanity in text.  The visible fragment shows a
    # few entries of a large literal collection of bad words/phrases
    # (unigrams, bigrams and trigrams); most entries are elided from this view.
        'chocolate rosebuds',
        'double penetration',
        'missionary position',
        # __init__ fragment: keep a Porter stemmer so lookups can match
        # morphological variants (see _normalize / is_bad_word below).
        self.stemmer = PorterStemmer()
475 def _normalize(self, text: str) -> str:
478 >>> _normalize('Tittie5')
481 >>> _normalize('Suck a Dick!')
484 >>> _normalize('fucking a whore')
488 result = text.lower()
489 result = result.replace("_", " ")
490 result = result.replace('0', 'o')
491 result = result.replace('1', 'l')
492 result = result.replace('4', 'a')
493 result = result.replace('5', 's')
494 result = result.replace('3', 'e')
495 for x in string.punctuation:
496 result = result.replace(x, "")
497 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
498 return ' '.join(chunks)
    def tokenize(self, text: str):
        """Tokenize *text*: NLTK word tokenization, then a further split of
        each token on runs of non-word characters, so punctuation-glued
        words come apart.

        NOTE(review): the inner loop's body (the yield / empty-token filter
        step) is elided from this view.
        """
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.

        Scans unigrams first, then bigrams, then trigrams, so multi-word
        phrases in the bad-word set (e.g. visible entries like
        'double penetration') are also caught.

        >>> contains_bad_word('fuck you')
        True

        >>> contains_bad_word('FucK u')
        True

        >>> contains_bad_word('FuK U')
        True
        """
        words = [word for word in self.tokenize(text)]
        # (elided) unigram loop header; returns True on the first hit.
            if self.is_bad_word(word):
                logger.debug(f'"{word}" is profanity')

        # Bigram pass: join each adjacent pair and test it as a phrase.
        for bigram in string_utils.ngrams_presplit(words, 2):
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug(f'"{bigram}" is profanity')

        # Trigram pass, same shape as the bigram pass.
        for trigram in string_utils.ngrams_presplit(words, 3):
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug(f'"{trigram}" is profanity')
540 def is_bad_word(self, word: str) -> bool:
541 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.  (The remainder of this docstring, and several
        statements below, are elided from this view.)
        """

        def obscure(word: str):
            # (elided) builds the masked form of *word*; the visible line
            # draws one random masking character per replacement.
                char = random.choice(['#', '%', '!', '@', '&', '*'])

        words = self.tokenize(text)
        # (elided) initialization of `cursor` and the `out` accumulator.
        # NOTE(review): len() below requires a sequence, so `words` is
        # presumably materialized into a list in the elided code -- confirm.
        while cursor < len(words) - 3:
            # NOTE(review): `- 3` stops the scan before the last words; the
            # elided code after this loop presumably handles the tail.
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            # Prefer the longest profane n-gram: trigram, then bigram, then
            # the single word; each branch appends the obscured form.
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # Smoke-test driver: run both the detector and the obscurer over the
    # phrase supplied on the command line.
    # NOTE(review): these statements appear to sit inside a `def main()`
    # whose header is elided from this view -- confirm.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


# Script entry point (guard body elided from this view).
if __name__ == '__main__':