X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=db014e1704742c7cab01bc6e7ca1f6ca7f874de5;hb=36fe954a689c26e7082c61c1c8dbbf76dd7cf6c8;hp=e1b474323fb67823a0c1607e69df08ec41b5398d;hpb=b843703134a166013518c707fa5a77373f1bf0bf;p=python_utils.git diff --git a/profanity_filter.py b/profanity_filter.py index e1b4743..db014e1 100755 --- a/profanity_filter.py +++ b/profanity_filter.py @@ -8,12 +8,14 @@ import sys import nltk from nltk.stem import PorterStemmer +import decorator_utils import string_utils logger = logging.getLogger(__name__) +@decorator_utils.singleton class ProfanityFilter(object): def __init__(self): self.bad_words = set([ @@ -82,6 +84,7 @@ class ProfanityFilter(object): 'blonde action', 'blow j', 'blow job', + 'blowjob', 'blow my', 'blow me', 'blow ourselv', @@ -344,6 +347,7 @@ class ProfanityFilter(object): 'poop chute', 'poopchute', 'porn', + 'pron', 'pornhub', 'porno', 'pornographi', @@ -466,8 +470,25 @@ class ProfanityFilter(object): self.stemmer = PorterStemmer() def _normalize(self, text: str) -> str: + """Normalize text. + + >>> _normalize('Tittie5') + 'titties' + + >>> _normalize('Suck a Dick!') + 'suck a dick' + + >>> _normalize('fucking a whore') + 'fuck a whore' + + """ result = text.lower() result = result.replace("_", " ") + result = result.replace('0', 'o') + result = result.replace('1', 'l') + result = result.replace('4', 'a') + result = result.replace('5', 's') + result = result.replace('3', 'e') for x in string.punctuation: result = result.replace(x, "") chunks = [ @@ -476,6 +497,19 @@ class ProfanityFilter(object): return ' '.join(chunks) def contains_bad_word(self, text: str) -> bool: + """Returns True if text contains a bad word (or more than one) + and False if no bad words were detected. + + >>> contains_bad_word('fuck you') + True + + >>> contains_bad_word('FucK u') + True + + >>> contains_bad_word('FuK U') + False + + """ words = nltk.word_tokenize(text) for word in words: if self.is_bad_word(word): @@ -484,14 +518,16 @@ class ProfanityFilter(object): if len(words) > 1: for bigram in string_utils.ngrams_presplit(words, 2): + bigram = ' '.join(bigram) if self.is_bad_word(bigram): - logger.debug('"{bigram}" is profanity') + logger.debug(f'"{bigram}" is profanity') return True if len(words) > 2: for trigram in string_utils.ngrams_presplit(words, 3): + trigram = ' '.join(trigram) if self.is_bad_word(trigram): - logger.debug('"{trigram}" is profanity') + logger.debug(f'"{trigram}" is profanity') return True return False @@ -502,7 +538,10 @@ class ProfanityFilter(object): ) def obscure_bad_words(self, text: str) -> str: + """Obscure bad words that are detected by inserting random punctuation + characters. + """ def obscure(word: str): out = '' last = '' @@ -545,6 +584,8 @@ class ProfanityFilter(object): def main() -> None: + import doctest + doctest.testmod() pf = ProfanityFilter() phrase = ' '.join(sys.argv[1:]) print(pf.contains_bad_word(phrase))