# NOTE(review): this view of the file is elided — other imports used below
# (logging, nltk, random, re, string, sys, string_utils) appear on lines
# not shown here.
from nltk.stem import PorterStemmer

import decorator_utils

# Module-level logger, named after the module per the logging convention;
# handlers/levels are configured by the application, not here.
logger = logging.getLogger(__name__)
@decorator_utils.singleton  # one shared instance per process
class ProfanityFilter(object):
    """Detects (and can obscure) profane words and multiword phrases in
    text, matching both verbatim tokens and normalized/stemmed forms.

    NOTE(review): the `def __init__(self):` line and the vast majority of
    the bad-word entries are elided from this view; only fragments of the
    constructor are visible below.
    """

        # Known bad words and phrases; multiword entries are matched via
        # the bigram/trigram checks in contains_bad_word().
        self.bad_words = set([
            'chocolate rosebuds',
            'double penetration',
            'missionary position',
        # ... hundreds more entries elided from this view ...

        # Porter stemmer used by _normalize() so inflected forms
        # ("fucking") match their stem entries ("fuck").
        self.stemmer = PorterStemmer()
473 def _normalize(self, text: str) -> str:
476 >>> _normalize('Tittie5')
479 >>> _normalize('Suck a Dick!')
482 >>> _normalize('fucking a whore')
486 result = text.lower()
487 result = result.replace("_", " ")
488 result = result.replace('0', 'o')
489 result = result.replace('1', 'l')
490 result = result.replace('4', 'a')
491 result = result.replace('5', 's')
492 result = result.replace('3', 'e')
493 for x in string.punctuation:
494 result = result.replace(x, "")
496 self.stemmer.stem(word) for word in nltk.word_tokenize(result)
498 return ' '.join(chunks)
500 def tokenize(self, text: str):
501 for x in nltk.word_tokenize(text):
502 for y in re.split('\W+', x):
505 def contains_bad_word(self, text: str) -> bool:
506 """Returns True if text contains a bad word (or more than one)
507 and False if no bad words were detected.
509 >>> contains_bad_word('fuck you')
512 >>> contains_bad_word('FucK u')
515 >>> contains_bad_word('FuK U')
519 words = [word for word in self.tokenize(text)]
521 if self.is_bad_word(word):
522 logger.debug(f'"{word}" is profanity')
526 for bigram in string_utils.ngrams_presplit(words, 2):
527 bigram = ' '.join(bigram)
528 if self.is_bad_word(bigram):
529 logger.debug(f'"{bigram}" is profanity')
533 for trigram in string_utils.ngrams_presplit(words, 3):
534 trigram = ' '.join(trigram)
535 if self.is_bad_word(trigram):
536 logger.debug(f'"{trigram}" is profanity')
540 def is_bad_word(self, word: str) -> bool:
542 word in self.bad_words or
543 self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters in place of (some of) their letters.

        NOTE(review): large parts of this method are elided from this view
        (the rest of obscure(), loop setup, cursor advancement, and tail
        handling); comments below annotate only the visible lines.
        """

        def obscure(word: str):
            # Pick a random masking character per position; remainder of
            # the helper is elided from this view.
            char = random.choice(['#', '%', '!', '@', '&', '*'])

        # tokenize() returns a generator, yet words is indexed below —
        # the elided setup presumably materializes it into a list; confirm.
        words = self.tokenize(text)

        # Sliding window preferring the longest match (trigram > bigram >
        # single word).  NOTE(review): the `- 3` bound looks like it skips
        # the final few words; elided code after the loop presumably
        # handles that tail — confirm against the full file.
        while cursor < len(words) - 3:
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
def main() -> None:
    """Check the words given on the command line for profanity: print
    whether any bad word was detected, then an obscured rendering of
    the phrase.
    """
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


# Guarded entry point so importing this module has no side effects.
if __name__ == '__main__':
    main()