10 from nltk.stem import PorterStemmer
12 import decorator_utils
15 logger = logging.getLogger(__name__)
# Process-wide singleton (via decorator_utils.singleton): holds a set of
# bad words/phrases in self.bad_words (multi-word entries such as
# 'double penetration' appear below) plus a Porter stemmer, and exposes
# detection (contains_bad_word / is_bad_word) and redaction
# (obscure_bad_words) helpers.
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
# NOTE(review): fragment of the (elided) __init__ — sample entries from
# the bad-words collection, followed by the stemmer later used by
# _normalize(). The `def __init__` line and the bulk of the word list
# are not visible in this excerpt.
135 'chocolate rosebuds',
190 'double penetration',
297 'missionary position',
475 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Return a canonical form of *text* for bad-word comparison.

    Normalization steps (all character work done in one translation
    pass):
      * lowercase the text,
      * map common "leet" digits to letters: 0->o, 1->l, 3->e, 4->a, 5->s,
      * treat underscores as word separators (spaces),
      * delete all other punctuation,
    then stem every token with the Porter stemmer so that inflected
    forms (e.g. 'fucking') compare equal to their stems.

    Args:
        text: raw input string.

    Returns:
        Space-joined string of stemmed, normalized tokens.
    """
    # One precomputed translation table replaces the former chain of
    # six str.replace() calls plus a per-punctuation-character loop
    # (~38 full passes over the string collapsed into a single pass).
    # Delete every punctuation character, then override: '_' becomes a
    # space and the leet digits map to their letter look-alikes.
    table = {ord(ch): None for ch in string.punctuation}
    table.update(str.maketrans('_01453', ' olase'))
    cleaned = text.lower().translate(table)
    stems = [self.stemmer.stem(tok) for tok in nltk.word_tokenize(cleaned)]
    return ' '.join(stems)
502 def tokenize(self, text: str):
# Generator: NLTK-tokenize *text*, then split each token again on runs
# of non-word characters so punctuation-glued words come apart.
# NOTE(review): the line that yields each piece (orig. line ~505) is
# elided from this excerpt.
503 for x in nltk.word_tokenize(text):
504 for y in re.split(r'\W+', x):
507 def contains_bad_word(self, text: str) -> bool:
508 """Returns True if text contains a bad word (or more than one)
509 and False if no bad words were detected.
511 >>> contains_bad_word('fuck you')
514 >>> contains_bad_word('FucK u')
517 >>> contains_bad_word('FuK U')
# NOTE(review): the doctest expected-output lines and the docstring's
# closing quotes are elided from this excerpt.
# Strategy: tokenize the text, then test every unigram, bigram and
# trigram against the bad-word set — multi-word entries such as
# 'double penetration' can only match via the n-gram passes.
521 words = [word for word in self.tokenize(text)]
# (unigram loop header, orig. ~522, elided)
523 if self.is_bad_word(word):
524 logger.debug(f'"{word}" is profanity')
# (presumably an early `return True`, orig. ~525-527, elided)
528 for bigram in string_utils.ngrams_presplit(words, 2):
529 bigram = ' '.join(bigram)
530 if self.is_bad_word(bigram):
531 logger.debug(f'"{bigram}" is profanity')
# (presumably an early `return True`, orig. ~532-534, elided)
535 for trigram in string_utils.ngrams_presplit(words, 3):
536 trigram = ' '.join(trigram)
537 if self.is_bad_word(trigram):
538 logger.debug(f'"{trigram}" is profanity')
# (final return statements, orig. ~539-541, elided)
542 def is_bad_word(self, word: str) -> bool:
543 return word in self.bad_words or self._normalize(word) in self.bad_words
545 def obscure_bad_words(self, text: str) -> str:
546 """Obscure bad words that are detected by inserting random punctuation
# (remainder of the docstring, orig. ~547-550, elided from this excerpt)
551 def obscure(word: str):
# Inner helper: rewrites a detected word using randomly chosen symbol
# characters (most of its body, orig. ~552-558 and 560-565, is elided).
559 char = random.choice(['#', '%', '!', '@', '&', '*'])
566 words = [x for x in self.tokenize(text)]
# Greedy longest-match scan over the token stream: at each cursor
# position prefer obscuring a trigram over a bigram over a single word.
# (output-buffer and cursor initialization, orig. ~567-572, elided)
573 while cursor < len(words) - 3:
# NOTE(review): the loop stops 3 words short of the end; presumably the
# elided code after the loop (orig. ~585-595) handles the trailing
# words — confirm against the full file, otherwise this is an
# off-by-one that skips tail tokens.
575 bigram = word + ' ' + words[cursor + 1]
576 trigram = bigram + ' ' + words[cursor + 2]
577 if self.is_bad_word(trigram):
578 out += obscure(trigram) + ' '
# (cursor advancement, orig. ~579, elided)
580 elif self.is_bad_word(bigram):
581 out += obscure(bigram) + ' '
# (cursor advancement, orig. ~582, elided)
583 elif self.is_bad_word(word):
584 out += obscure(word) + ' '
# Command-line driver: checks the argv phrase for profanity and prints
# the detection result and the obscured text. NOTE(review): the
# enclosing function header (orig. ~595) is elided from this excerpt.
596 pf = ProfanityFilter()
597 phrase = ' '.join(sys.argv[1:])
598 print(pf.contains_bad_word(phrase))
599 print(pf.obscure_bad_words(phrase))
603 if __name__ == '__main__':
# (guard body, orig. 604+, elided from this excerpt)