# © Copyright 2021-2022, Scott Gasch
-"""A helper to identify and optionally obscure some bad words."""
+"""A helper to identify and optionally obscure some bad words. Not
+perfect but decent. Uses a fuzzy block list rather than ML."""
import logging
import random
>>> _normalize('fucking a whore')
'fuck a whore'
>>> _normalize('pu55y')
'pussy'

"""
result = text.lower()
result = result.replace("_", " ")
@staticmethod
def tokenize(text: str):
    """Tokenize text into word-like chunks.

    Runs *text* through nltk's word tokenizer, then further splits each
    token on runs of non-word characters so punctuation-glued words come
    apart.  Yields each resulting chunk; note that a token consisting of
    pure punctuation yields an empty string.

    Args:
        text: the text to tokenize.

    Yields:
        Word-like string chunks from text.
    """
    for token in nltk.word_tokenize(text):
        # nltk can leave punctuation attached to a token; re.split
        # on \W+ separates those pieces.
        for chunk in re.split(r'\W+', token):
            yield chunk
return False
def is_bad_word(self, word: str) -> bool:
    """True if we think word is a bad word.

    Checks the word against the block list both verbatim and in its
    normalized form (see _normalize) so that simple obfuscations still
    match.

    Args:
        word: the word to check.

    Returns:
        True iff the word (or its normalization) is on the block list.
    """
    return word in self.bad_words or self._normalize(word) in self.bad_words
def obscure_bad_words(self, text: str) -> str:
"""Obscure bad words that are detected by inserting random punctuation
characters.
-
"""
def obscure(word: str):