X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=37756bac99abdaaa298b92ab7ff4f984ec844d51;hb=e8fbbb7306430478dec55d2c963eed116d8330cc;hp=95540fa7b36f0bd8fcf813196e2f9f2390569fce;hpb=36fea7f15ed17150691b5b3ead75450e575229ef;p=python_utils.git

diff --git a/profanity_filter.py b/profanity_filter.py
index 95540fa..37756ba 100755
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 
+"""A helper to identify and optionally obscure some bad words."""
+
 import logging
 import random
 import re
@@ -12,12 +14,13 @@ from nltk.stem import PorterStemmer
 import decorator_utils
 import string_utils
 
-
 logger = logging.getLogger(__name__)
 
 
 @decorator_utils.singleton
 class ProfanityFilter(object):
+    """A helper to identify and optionally obscure some bad words."""
+
     def __init__(self):
         self.bad_words = set(
             [
@@ -238,6 +241,9 @@ class ProfanityFilter(object):
                 'girl gone wild',
                 'girl on top',
                 'girl on',
+                'give head',
+                'giving head',
+                'gave head',
                 'goatcx',
                 'goatse',
                 'goddamn',
@@ -494,14 +500,13 @@ class ProfanityFilter(object):
         result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
-        chunks = [
-            self.stemmer.stem(word) for word in nltk.word_tokenize(result)
-        ]
+        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
         return ' '.join(chunks)
 
-    def tokenize(self, text: str):
+    @staticmethod
+    def tokenize(text: str):
         for x in nltk.word_tokenize(text):
-            for y in re.split('\W+', x):
+            for y in re.split(r'\W+', x):
                 yield y
 
     def contains_bad_word(self, text: str) -> bool:
@@ -518,24 +523,24 @@
         False
 
         """
-        words = [word for word in self.tokenize(text)]
+        words = list(self.tokenize(text))
         for word in words:
             if self.is_bad_word(word):
-                logger.debug(f'"{word}" is profanity')
+                logger.debug('"%s" is profanity', word)
                 return True
 
         if len(words) > 1:
             for bigram in string_utils.ngrams_presplit(words, 2):
                 bigram = ' '.join(bigram)
                 if self.is_bad_word(bigram):
-                    logger.debug(f'"{bigram}" is profanity')
+                    logger.debug('"%s" is profanity', bigram)
                     return True
 
         if len(words) > 2:
             for trigram in string_utils.ngrams_presplit(words, 3):
                 trigram = ' '.join(trigram)
                 if self.is_bad_word(trigram):
-                    logger.debug(f'"{trigram}" is profanity')
+                    logger.debug('"%s" is profanity', trigram)
                     return True
         return False
@@ -563,7 +568,7 @@
                     break
            return out
 
-        words = self.tokenize(text)
+        words = list(self.tokenize(text))
        words.append('')
        words.append('')
        words.append('')