X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=1a855857478089f010a16115166c3ea488922259;hb=a9bdfd8fc9f84b7b2c09a57cd12ba32259e84d1c;hp=a1f0c0b9adaa8971dfd243694cd096a2e84a077d;hpb=532df2c5b57c7517dfb3dddd8c1358fbadf8baf3;p=python_utils.git diff --git a/profanity_filter.py b/profanity_filter.py index a1f0c0b..1a85585 100755 --- a/profanity_filter.py +++ b/profanity_filter.py @@ -2,7 +2,8 @@ # © Copyright 2021-2022, Scott Gasch -"""A helper to identify and optionally obscure some bad words.""" +"""A helper to identify and optionally obscure some bad words. Not +perfect but decent. Uses a fuzzy block list rather than ML.""" import logging import random @@ -477,6 +478,9 @@ class ProfanityFilter(object): >>> _normalize('fucking a whore') 'fuck a whore' + >>> _normalize('pu55y') + 'pussy' + """ result = text.lower() result = result.replace("_", " ") @@ -492,6 +496,7 @@ class ProfanityFilter(object): @staticmethod def tokenize(text: str): + """Tokenize text into word-like chunks""" for x in nltk.word_tokenize(text): for y in re.split(r'\W+', x): yield y @@ -532,12 +537,12 @@ class ProfanityFilter(object): return False def is_bad_word(self, word: str) -> bool: + """True if we think word is a bad word.""" return word in self.bad_words or self._normalize(word) in self.bad_words def obscure_bad_words(self, text: str) -> str: """Obscure bad words that are detected by inserting random punctuation characters. - """ def obscure(word: str):