X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=1a855857478089f010a16115166c3ea488922259;hb=02302bbd9363facb59c4df2c1f4013087702cfa6;hp=95540fa7b36f0bd8fcf813196e2f9f2390569fce;hpb=36fea7f15ed17150691b5b3ead75450e575229ef;p=python_utils.git diff --git a/profanity_filter.py b/profanity_filter.py index 95540fa..1a85585 100755 --- a/profanity_filter.py +++ b/profanity_filter.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""A helper to identify and optionally obscure some bad words. Not +perfect but decent. Uses a fuzzy block list rather than ML.""" + import logging import random import re @@ -12,25 +17,25 @@ from nltk.stem import PorterStemmer import decorator_utils import string_utils - logger = logging.getLogger(__name__) @decorator_utils.singleton class ProfanityFilter(object): + """A helper to identify and optionally obscure some bad words.""" + def __init__(self): self.bad_words = set( [ 'acrotomophilia', 'anal', - 'analingus', + 'analingu', 'anally', - 'anilingus', + 'anilingu', 'anus', 'arsehol', 'arsehole', 'ass', - 'asses', 'asshol', 'asshole', 'assmunch', @@ -52,7 +57,6 @@ class ProfanityFilter(object): 'ball sucking', 'ball zack', 'bangbro', - 'bangbros', 'bare legal', 'bareback', 'barely legal', @@ -65,21 +69,16 @@ class ProfanityFilter(object): 'bdsm', 'beaver cleaver', 'beaver lip', - 'beaver lips', 'bestial', 'bestiality', 'bi curiou', - 'bi curious', 'big black', - 'big breasts', + 'big breast', 'big knocker', - 'big knockers', 'big tit', - 'big tits', 'bimbo', 'birdlock', 'bitch', - 'bitches', 'black cock', 'blond action', 'blond on blond', @@ -96,18 +95,14 @@ class ProfanityFilter(object): 'blue waffle', 'blumpkin', 'bollock', - 'bollocks', 'bondag', 'bondage', 'boner', 'boob', - 'boobs', 'booti call', 'booty call', 'breast', - 'breasts', 'brown shower', - 'brown showers', 'brunett action', 'brunette action', 'bukkak', @@ -123,7 +118,6 @@ class ProfanityFilter(object): 'busty', 'butt', 'buttcheek', - 'buttcheeks', 'butthol', 'butthole', 'camel toe', @@ -133,18 +127,16 @@ class ProfanityFilter(object): 'carpet muncher', 'carpetmuncher', 'chocol rosebud', - 'chocolate rosebuds', 'circlejerk', 'chink', 'cleveland steamer', 'clit', + 'clitor', 'clitori', - 'clitoris', 'clover clamp', - 'clover clamps', 'clusterfuck', + 'cluster fuck', 'cock', - 'cocks', 'coprolagnia', 'coprophilia', 'cornhol', @@ -155,7 +147,6 @@ class ProfanityFilter(object): 'cum', 'cumming', 'cunnilingu', - 'cunnilingus', 'cunt', 'damn', 'darki', @@ -208,12 +199,8 @@ class ProfanityFilter(object): 'female squirting', 'femdom', 'figging', - 'fingered', - 'fingering', - 'fingers', - 'fisted', - 'fisting', - 'fists', + 'finger', + 'fist', 'foot fetish', 'footjob', 'frotting', @@ -238,6 +225,14 @@ class ProfanityFilter(object): 'girl gone wild', 'girl on top', 'girl on', + 'give head', + 'giving head', + 'gave head', + 'gave you head', + 'gave him head', + 'gave them head', + 'gave us head', + 'glori hole', 'goatcx', 'goatse', 'goddamn', @@ -266,7 +261,7 @@ class ProfanityFilter(object): 'huge fat', 'humped', 'humping', - 'humps', + 'hump', 'incest', 'intercourse', 'jack off', @@ -291,10 +286,11 @@ class ProfanityFilter(object): 'male squirting', 'masturb', 'menage a trois', + 'menag a troi', 'milf', 'missionary position', 'motherfuck', - 'mound of venus', + 'mound of venu', 'mr hand', 'muff diver', 'muffdiv', @@ -313,7 +309,6 @@ class ProfanityFilter(object): 'nsfl', 'nsfw', 'nude', - 'nudes', 'nudity', 'nut sack', 'nutsack', @@ -327,7 +322,7 @@ class ProfanityFilter(object): 'paedophil', 'paedophile', 'panties', - 'panty', + 'panti', 'pedobear', 'pedophil', 'pedophile', @@ -385,8 +380,8 @@ class ProfanityFilter(object): 'sexi', 'sexo', 'sexy', - 'shaved beaver', - 'shaved pussy', + 'shave beaver', + 'shave pussi', 'shemale', 'shibari', 'shit', @@ -403,7 +398,7 @@ class ProfanityFilter(object): 'sodomy', 'spic', 'spooge', - 'spread legs', + 'spread leg', 'squirting', 'strap on', 'strapon', @@ -411,7 +406,7 @@ class ProfanityFilter(object): 'strip club', 'style doggy', 'suck', - 'suicide girls', + 'suicid girl', 'sultry women', 'swastika', 'swinger', @@ -424,7 +419,6 @@ class ProfanityFilter(object): 'tied up', 'tight white', 'tit', - 'tits', 'titti', 'titties', 'titty', @@ -447,7 +441,7 @@ class ProfanityFilter(object): 'urophilia', 'vag', 'vagina', - 'venus mound', + 'venu mound', 'vibrator', 'violet blue', 'violet wand', @@ -484,6 +478,9 @@ class ProfanityFilter(object): >>> _normalize('fucking a whore') 'fuck a whore' + >>> _normalize('pu55y') + 'pussy' + """ result = text.lower() result = result.replace("_", " ") @@ -494,14 +491,14 @@ class ProfanityFilter(object): result = result.replace('3', 'e') for x in string.punctuation: result = result.replace(x, "") - chunks = [ - self.stemmer.stem(word) for word in nltk.word_tokenize(result) - ] + chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)] return ' '.join(chunks) - def tokenize(self, text: str): + @staticmethod + def tokenize(text: str): + """Tokenize text into word-like chunks""" for x in nltk.word_tokenize(text): - for y in re.split('\W+', x): + for y in re.split(r'\W+', x): yield y def contains_bad_word(self, text: str) -> bool: @@ -518,34 +515,34 @@ class ProfanityFilter(object): False """ - words = [word for word in self.tokenize(text)] + words = list(self.tokenize(text)) for word in words: if self.is_bad_word(word): - logger.debug(f'"{word}" is profanity') + logger.debug('"%s" is profanity', word) return True if len(words) > 1: for bigram in string_utils.ngrams_presplit(words, 2): bigram = ' '.join(bigram) if self.is_bad_word(bigram): - logger.debug(f'"{bigram}" is profanity') + logger.debug('"%s" is profanity', bigram) return True if len(words) > 2: for trigram in string_utils.ngrams_presplit(words, 3): trigram = ' '.join(trigram) if self.is_bad_word(trigram): - logger.debug(f'"{trigram}" is profanity') + logger.debug('"%s" is profanity', trigram) return True return False def is_bad_word(self, word: str) -> bool: + """True if we think word is a bad word.""" return word in self.bad_words or self._normalize(word) in self.bad_words def obscure_bad_words(self, text: str) -> str: """Obscure bad words that are detected by inserting random punctuation characters. - """ def obscure(word: str): @@ -563,7 +560,7 @@ class ProfanityFilter(object): break return out - words = self.tokenize(text) + words = list(self.tokenize(text)) words.append('') words.append('') words.append('')