10 from nltk.stem import PorterStemmer
12 import decorator_utils
15 logger = logging.getLogger(__name__)
@decorator_utils.singleton
class ProfanityFilter(object):
    # Singleton profanity detector / obscurer.  NOTE(review): the class body
    # is heavily elided in this view -- only a few sample entries of the
    # bad-words collection (1-3 word phrases, presumably self.bad_words) and
    # a single __init__ line are visible.  Matching elsewhere in the class is
    # done both on raw words and on their _normalize()d forms.
        'chocolate rosebuds',
        'double penetration',
        'missionary position',
        # Porter stemmer used by _normalize() so inflected forms still match.
        self.stemmer = PorterStemmer()
474 def _normalize(self, text: str) -> str:
477 >>> _normalize('Tittie5')
480 >>> _normalize('Suck a Dick!')
483 >>> _normalize('fucking a whore')
487 result = text.lower()
488 result = result.replace("_", " ")
489 result = result.replace('0', 'o')
490 result = result.replace('1', 'l')
491 result = result.replace('4', 'a')
492 result = result.replace('5', 's')
493 result = result.replace('3', 'e')
494 for x in string.punctuation:
495 result = result.replace(x, "")
496 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
497 return ' '.join(chunks)
    def tokenize(self, text: str):
        """Tokenize *text*: NLTK word tokens, each further split on runs of
        non-word characters.

        NOTE(review): the generator's yield statement is elided from this
        view -- presumably each non-empty piece ``y`` is yielded.
        """
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.

        >>> contains_bad_word('fuck you')

        >>> contains_bad_word('FucK u')

        >>> contains_bad_word('FuK U')
        """
        # Strategy: check every unigram, then every bigram, then every
        # trigram of the tokenized text against the bad-words set, so that
        # multi-word phrases (e.g. 'double penetration') are caught too.
        # NOTE(review): elided from this view are the unigram loop header
        # ('for word in words:'), the 'return True' after each logged hit,
        # and the final 'return False'.
        words = [word for word in self.tokenize(text)]
            if self.is_bad_word(word):
                logger.debug(f'"{word}" is profanity')
        for bigram in string_utils.ngrams_presplit(words, 2):
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug(f'"{bigram}" is profanity')
        for trigram in string_utils.ngrams_presplit(words, 3):
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug(f'"{trigram}" is profanity')
539 def is_bad_word(self, word: str) -> bool:
540 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.
        """

        def obscure(word: str):
            # Replaces characters of a detected bad word with random
            # punctuation.  NOTE(review): most of this helper's body is
            # elided from this view.
            char = random.choice(['#', '%', '!', '@', '&', '*'])

        words = self.tokenize(text)
        # Scan left-to-right with a cursor, preferring the longest match
        # (trigram over bigram over single word) at each position.
        # NOTE(review): tokenize() looks like a generator, yet len(words)
        # is taken below -- an elided line presumably materializes it into
        # a list; verify.  Also confirm the 'len(words) - 3' bound: the
        # final words must be handled by the elided tail code or they
        # would be skipped.
        while cursor < len(words) - 3:
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
# Command-line smoke test: treat argv as one phrase, report whether it
# contains profanity, then print the obscured form.
# NOTE(review): these statements likely sit inside an elided main()
# definition, and the body of the __main__ guard is also elided.
pf = ProfanityFilter()
phrase = ' '.join(sys.argv[1:])
print(pf.contains_bad_word(phrase))
print(pf.obscure_bad_words(phrase))

if __name__ == '__main__':