X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=1a855857478089f010a16115166c3ea488922259;hb=a9bdfd8fc9f84b7b2c09a57cd12ba32259e84d1c;hp=c1767bf16370bae17b63b56d426c7fc2e9e49519;hpb=1f9d550895e0112b1e0a1eb4a5d725deace8e810;p=python_utils.git diff --git a/profanity_filter.py b/profanity_filter.py index c1767bf..1a85585 100755 --- a/profanity_filter.py +++ b/profanity_filter.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 -"""A helper to identify and optionally obscure some bad words.""" +# © Copyright 2021-2022, Scott Gasch + +"""A helper to identify and optionally obscure some bad words. Not +perfect but decent. Uses a fuzzy block list rather than ML.""" import logging import random @@ -475,6 +478,9 @@ class ProfanityFilter(object): >>> _normalize('fucking a whore') 'fuck a whore' + >>> _normalize('pu55y') + 'pussy' + """ result = text.lower() result = result.replace("_", " ") @@ -490,6 +496,7 @@ class ProfanityFilter(object): @staticmethod def tokenize(text: str): + """Tokenize text into word-like chunks""" for x in nltk.word_tokenize(text): for y in re.split(r'\W+', x): yield y @@ -530,12 +537,12 @@ class ProfanityFilter(object): return False def is_bad_word(self, word: str) -> bool: + """True if we think word is a bad word.""" return word in self.bad_words or self._normalize(word) in self.bad_words def obscure_bad_words(self, text: str) -> str: """Obscure bad words that are detected by inserting random punctuation characters. - """ def obscure(word: str):