X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=profanity_filter.py;h=a1f0c0b9adaa8971dfd243694cd096a2e84a077d;hb=532df2c5b57c7517dfb3dddd8c1358fbadf8baf3;hp=fe5422179ba9a50c678188e088689184f139a14d;hpb=e8671a716da868332d3ac1f66d4d2f7f8d33fc28;p=python_utils.git diff --git a/profanity_filter.py b/profanity_filter.py index fe54221..a1f0c0b 100755 --- a/profanity_filter.py +++ b/profanity_filter.py @@ -1,7 +1,12 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + +"""A helper to identify and optionally obscure some bad words.""" + import logging import random +import re import string import sys @@ -11,465 +16,468 @@ from nltk.stem import PorterStemmer import decorator_utils import string_utils - logger = logging.getLogger(__name__) @decorator_utils.singleton class ProfanityFilter(object): + """A helper to identify and optionally obscure some bad words.""" + def __init__(self): - self.bad_words = set([ - 'acrotomophilia', - 'anal', - 'analingus', - 'anally', - 'anilingus', - 'anus', - 'arsehol', - 'arsehole', - 'ass', - 'asses', - 'asshol', - 'asshole', - 'assmunch', - 'auto erot', - 'auto erotic', - 'autoerotic', - 'babeland', - 'babi batter', - 'baby batter', - 'ball gag', - 'ball gravi', - 'ball gravy', - 'ball kick', - 'ball kicking', - 'ball lick', - 'ball licking', - 'ball sack', - 'ball suck', - 'ball sucking', - 'ball zack', - 'bangbro', - 'bangbros', - 'bare legal', - 'bareback', - 'barely legal', - 'barenak', - 'barenaked', - 'bastardo', - 'bastinado', - 'bbc', - 'bbw', - 'bdsm', - 'beaver cleaver', - 'beaver lip', - 'beaver lips', - 'bestial', - 'bestiality', - 'bi curiou', - 'bi curious', - 'big black', - 'big breasts', - 'big knocker', - 'big knockers', - 'big tit', - 'big tits', - 'bimbo', - 'birdlock', - 'bitch', - 'bitches', - 'black cock', - 'blond action', - 'blond on blond', - 'blonde action', - 'blow j', - 'blow job', - 'blowjob', - 'blow my', - 'blow me', - 'blow ourselv', - 'blow ourselves', - 'blow your load', - 'blue waffl', - 'blue waffle', - 'blumpkin', - 'bollock', - 'bollocks', - 'bondag', - 'bondage', - 'boner', - 'boob', - 'boobs', - 'booti call', - 'booty call', - 'breast', - 'breasts', - 'brown shower', - 'brown showers', - 'brunett action', - 'brunette action', - 'bukkak', - 'bukkake', - 'bulldyk', - 'bulldyke', - 'bullet vibe', - 'bullshit', - 'bung hole', - 'bunghol', - 'bunghole', - 'busti', - 'busty', - 'butt', - 'buttcheek', - 'buttcheeks', - 'butthol', - 'butthole', - 'camel toe', - 'camgirl', - 'camslut', - 'camwhore', - 'carpet muncher', - 'carpetmuncher', - 'chocol rosebud', - 'chocolate rosebuds', - 'circlejerk', - 'chink', - 'cleveland steamer', - 'clit', - 'clitori', - 'clitoris', - 'clover clamp', - 'clover clamps', - 'clusterfuck', - 'cock', - 'cocks', - 'coprolagnia', - 'coprophilia', - 'cornhol', - 'cornhole', - 'cream pie', - 'creampi', - 'creampie', - 'cum', - 'cumming', - 'cunnilingu', - 'cunnilingus', - 'cunt', - 'damn', - 'darki', - 'darkie', - 'date rape', - 'daterap', - 'daterape', - 'deep throat', - 'deepthroat', - 'dick', - 'dildo', - 'dirti pillow', - 'dirti sanchez', - 'dirty pillow', - 'dirty sanchez', - 'dog style', - 'doggi style', - 'doggie style', - 'doggiestyl', - 'doggiestyle', - 'doggystyle', - 'dolcett', - 'domination', - 'dominatrix', - 'domm', - 'dommes', - 'donkey punch', - 'doubl dick', - 'doubl dong', - 'doubl penetr', - 'double dick', - 'double dong', - 'double penetration', - 'dp action', - 'dtf', - 'eat my ass', - 'ecchi', - 'ejacul', - 'erection', - 'erotic', - 'erotism', - 'escort', - 'ethical slut', - 'eunuch', - 'faggot', - 'fecal', - 'felch', - 'fellatio', - 'feltch', - 'female squirting', - 'femdom', - 'figging', - 'fingered', - 'fingering', - 'fingers', - 'fisted', - 'fisting', - 'fists', - 'foot fetish', - 'footjob', - 'frotting', - 'fuck button', - 'fuck', - 'fucked', - 'fucker', - 'fuckhead', - 'fuckin', - 'fucking', - 'fudge packer', - 'fudgepack', - 'fudgepacker', - 'futanari', - 'g spot', - 'g-spot', - 'gang bang', - 'gay sex', - 'gee spot', - 'genital', - 'giant cock', - 'girl gone wild', - 'girl on top', - 'girl on', - 'goatcx', - 'goatse', - 'goddamn', - 'gokkun', - 'golden shower', - 'goo girl', - 'goodpoop', - 'goregasm', - 'grope', - 'group sex', - 'gspot', - 'guro', - 'hand job', - 'handjob', - 'hard core', - 'hardcore', - 'hentai', - 'homoerotic', - 'honkey', - 'hooker', - 'horni', - 'horny', - 'hot chick', - 'how to kill', - 'how to murder', - 'huge fat', - 'humped', - 'humping', - 'humps', - 'incest', - 'intercourse', - 'jack off', - 'jail bait', - 'jailbait', - 'jerk off', - 'jigaboo', - 'jiggaboo', - 'jiggerboo', - 'jizz', - 'jugg', - 'kike', - 'kinbaku', - 'kinkster', - 'kinky', - 'knobbing', - 'leather restraint', - 'lemon party', - 'lolita', - 'lovemaking', - 'make me come', - 'male squirting', - 'masturb', - 'menage a trois', - 'milf', - 'missionary position', - 'motherfuck', - 'mound of venus', - 'mr hand', - 'muff diver', - 'muffdiv', - 'muffdiving', - 'nambla', - 'nawashi', - 'negro', - 'neonazi', - 'nig nog', - 'nigga', - 'nigger', - 'nimphomania', - 'nipple', - 'nip', - 'not safe for', - 'nsfl', - 'nsfw', - 'nude', - 'nudes', - 'nudity', - 'nut sack', - 'nutsack', - 'nympho', - 'nymphomania', - 'octopussy', - 'omorashi', - 'one night stand', - 'orgasm', - 'orgy', - 'paedophil', - 'paedophile', - 'panties', - 'panty', - 'pedobear', - 'pedophil', - 'pedophile', - 'pee', - 'pegging', - 'peni', - 'penis', - 'phone sex', - 'pigfucker', - 'piss pig', - 'piss', - 'pissing', - 'pisspig', - 'playboy', - 'pleasure chest', - 'pole smoker', - 'ponyplay', - 'poof', - 'poop chute', - 'poopchute', - 'porn', - 'pron', - 'pornhub', - 'porno', - 'pornographi', - 'pornography', - 'prince albert', - 'pthc', - 'pube', - 'pussi', - 'pussies', - 'pussy', - 'queaf', - 'queer', - 'raghead', - 'raging boner', - 'rape', - 'raping', - 'rapist', - 'rectum', - 'reverse cowgirl', - 'rimjob', - 'rimming', - 'rosy palm', - 'rusty trombone', - 's & m', - 's&m', - 's+m', - 'sadism', - 'scat', - 'schlong', - 'scissoring', - 'semen', - 'sex', - 'sexi', - 'sexo', - 'sexy', - 'shaved beaver', - 'shaved pussy', - 'shemale', - 'shibari', - 'shit', - 'shota', - 'shrimping', - 'slanteye', - 'slut', - 'smut', - 'snatch', - 'snm', - 'snowballing', - 'sodomi', - 'sodomize', - 'sodomy', - 'spic', - 'spooge', - 'spread legs', - 'squirting', - 'strap on', - 'strapon', - 'strappado', - 'strip club', - 'style doggy', - 'suck', - 'suicide girls', - 'sultry women', - 'swastika', - 'swinger', - 'taint', - 'tainted love', - 'taste my', - 'tea bagging', - 'threesome', - 'throating', - 'tied up', - 'tight white', - 'tit', - 'tits', - 'titti', - 'titties', - 'titty', - 'tongue in', - 'topless', - 'tosser', - 'towelhead', - 'tranny', - 'tribadism', - 'tub girl', - 'tubgirl', - 'tushy', - 'twat', - 'twink', - 'twinki', - 'twinkie', - 'undress', - 'upskirt', - 'urethra play', - 'urophilia', - 'vag', - 'vagina', - 'venus mound', - 'vibrator', - 'violet blue', - 'violet wand', - 'vorarephilia', - 'voyeur', - 'vulva', - 'wank', - 'wet dream', - 'wetback', - 'white power', - 'whore', - 'women rapping', - 'wrapping men', - 'wrinkled starfish', - 'xx', - 'xxx', - 'yaoi', - 'yellow shower', - 'yiffy', - 'zoophilia', - ]) + self.bad_words = set( + [ + 'acrotomophilia', + 'anal', + 'analingu', + 'anally', + 'anilingu', + 'anus', + 'arsehol', + 'arsehole', + 'ass', + 'asshol', + 'asshole', + 'assmunch', + 'auto erot', + 'auto erotic', + 'autoerotic', + 'babeland', + 'babi batter', + 'baby batter', + 'ball gag', + 'ball gravi', + 'ball gravy', + 'ball kick', + 'ball kicking', + 'ball lick', + 'ball licking', + 'ball sack', + 'ball suck', + 'ball sucking', + 'ball zack', + 'bangbro', + 'bare legal', + 'bareback', + 'barely legal', + 'barenak', + 'barenaked', + 'bastardo', + 'bastinado', + 'bbc', + 'bbw', + 'bdsm', + 'beaver cleaver', + 'beaver lip', + 'bestial', + 'bestiality', + 'bi curiou', + 'big black', + 'big breast', + 'big knocker', + 'big tit', + 'bimbo', + 'birdlock', + 'bitch', + 'black cock', + 'blond action', + 'blond on blond', + 'blonde action', + 'blow j', + 'blow job', + 'blowjob', + 'blow my', + 'blow me', + 'blow ourselv', + 'blow ourselves', + 'blow your load', + 'blue waffl', + 'blue waffle', + 'blumpkin', + 'bollock', + 'bondag', + 'bondage', + 'boner', + 'boob', + 'booti call', + 'booty call', + 'breast', + 'brown shower', + 'brunett action', + 'brunette action', + 'bukkak', + 'bukkake', + 'bulldyk', + 'bulldyke', + 'bullet vibe', + 'bullshit', + 'bung hole', + 'bunghol', + 'bunghole', + 'busti', + 'busty', + 'butt', + 'buttcheek', + 'butthol', + 'butthole', + 'camel toe', + 'camgirl', + 'camslut', + 'camwhore', + 'carpet muncher', + 'carpetmuncher', + 'chocol rosebud', + 'circlejerk', + 'chink', + 'cleveland steamer', + 'clit', + 'clitor', + 'clitori', + 'clover clamp', + 'clusterfuck', + 'cluster fuck', + 'cock', + 'coprolagnia', + 'coprophilia', + 'cornhol', + 'cornhole', + 'cream pie', + 'creampi', + 'creampie', + 'cum', + 'cumming', + 'cunnilingu', + 'cunt', + 'damn', + 'darki', + 'darkie', + 'date rape', + 'daterap', + 'daterape', + 'deep throat', + 'deepthroat', + 'dick', + 'dildo', + 'dirti pillow', + 'dirti sanchez', + 'dirty pillow', + 'dirty sanchez', + 'dog style', + 'doggi style', + 'doggie style', + 'doggiestyl', + 'doggiestyle', + 'doggystyle', + 'dolcett', + 'domination', + 'dominatrix', + 'domm', + 'dommes', + 'donkey punch', + 'doubl dick', + 'doubl dong', + 'doubl penetr', + 'double dick', + 'double dong', + 'double penetration', + 'dp action', + 'dtf', + 'eat my ass', + 'ecchi', + 'ejacul', + 'erection', + 'erotic', + 'erotism', + 'escort', + 'ethical slut', + 'eunuch', + 'faggot', + 'fecal', + 'felch', + 'fellatio', + 'feltch', + 'female squirting', + 'femdom', + 'figging', + 'finger', + 'fist', + 'foot fetish', + 'footjob', + 'frotting', + 'fuck button', + 'fuck', + 'fucked', + 'fucker', + 'fuckhead', + 'fuckin', + 'fucking', + 'fudge packer', + 'fudgepack', + 'fudgepacker', + 'futanari', + 'g spot', + 'g-spot', + 'gang bang', + 'gay sex', + 'gee spot', + 'genital', + 'giant cock', + 'girl gone wild', + 'girl on top', + 'girl on', + 'give head', + 'giving head', + 'gave head', + 'gave you head', + 'gave him head', + 'gave them head', + 'gave us head', + 'glori hole', + 'goatcx', + 'goatse', + 'goddamn', + 'gokkun', + 'golden shower', + 'goo girl', + 'goodpoop', + 'goregasm', + 'grope', + 'group sex', + 'gspot', + 'guro', + 'hand job', + 'handjob', + 'hard core', + 'hardcore', + 'hentai', + 'homoerotic', + 'honkey', + 'hooker', + 'horni', + 'horny', + 'hot chick', + 'how to kill', + 'how to murder', + 'huge fat', + 'humped', + 'humping', + 'hump', + 'incest', + 'intercourse', + 'jack off', + 'jail bait', + 'jailbait', + 'jerk off', + 'jigaboo', + 'jiggaboo', + 'jiggerboo', + 'jizz', + 'jugg', + 'kike', + 'kinbaku', + 'kinkster', + 'kinky', + 'knobbing', + 'leather restraint', + 'lemon party', + 'lolita', + 'lovemaking', + 'make me come', + 'male squirting', + 'masturb', + 'menage a trois', + 'menag a troi', + 'milf', + 'missionary position', + 'motherfuck', + 'mound of venu', + 'mr hand', + 'muff diver', + 'muffdiv', + 'muffdiving', + 'nambla', + 'nawashi', + 'negro', + 'neonazi', + 'nig nog', + 'nigga', + 'nigger', + 'nimphomania', + 'nipple', + 'nip', + 'not safe for', + 'nsfl', + 'nsfw', + 'nude', + 'nudity', + 'nut sack', + 'nutsack', + 'nympho', + 'nymphomania', + 'octopussy', + 'omorashi', + 'one night stand', + 'orgasm', + 'orgy', + 'paedophil', + 'paedophile', + 'panties', + 'panti', + 'pedobear', + 'pedophil', + 'pedophile', + 'pee', + 'pegging', + 'peni', + 'penis', + 'phone sex', + 'pigfucker', + 'piss pig', + 'piss', + 'pissing', + 'pisspig', + 'playboy', + 'pleasure chest', + 'pole smoker', + 'ponyplay', + 'poof', + 'poop chute', + 'poopchute', + 'porn', + 'pron', + 'pornhub', + 'porno', + 'pornographi', + 'pornography', + 'prince albert', + 'pthc', + 'pube', + 'pussi', + 'pussies', + 'pussy', + 'queaf', + 'queer', + 'raghead', + 'raging boner', + 'rape', + 'raping', + 'rapist', + 'rectum', + 'reverse cowgirl', + 'rimjob', + 'rimming', + 'rosy palm', + 'rusty trombone', + 's & m', + 's&m', + 's+m', + 'sadism', + 'scat', + 'schlong', + 'scissoring', + 'semen', + 'sex', + 'sexi', + 'sexo', + 'sexy', + 'shave beaver', + 'shave pussi', + 'shemale', + 'shibari', + 'shit', + 'shota', + 'shrimping', + 'slanteye', + 'slut', + 'smut', + 'snatch', + 'snm', + 'snowballing', + 'sodomi', + 'sodomize', + 'sodomy', + 'spic', + 'spooge', + 'spread leg', + 'squirting', + 'strap on', + 'strapon', + 'strappado', + 'strip club', + 'style doggy', + 'suck', + 'suicid girl', + 'sultry women', + 'swastika', + 'swinger', + 'taint', + 'tainted love', + 'taste my', + 'tea bagging', + 'threesome', + 'throating', + 'tied up', + 'tight white', + 'tit', + 'titti', + 'titties', + 'titty', + 'tongue in', + 'topless', + 'tosser', + 'towelhead', + 'tranny', + 'tribadism', + 'tub girl', + 'tubgirl', + 'tushy', + 'twat', + 'twink', + 'twinki', + 'twinkie', + 'undress', + 'upskirt', + 'urethra play', + 'urophilia', + 'vag', + 'vagina', + 'venu mound', + 'vibrator', + 'violet blue', + 'violet wand', + 'vorarephilia', + 'voyeur', + 'vulva', + 'wank', + 'wet dream', + 'wetback', + 'white power', + 'whore', + 'women rapping', + 'wrapping men', + 'wrinkled starfish', + 'xx', + 'xxx', + 'yaoi', + 'yellow shower', + 'yiffy', + 'zoophilia', + ] + ) self.stemmer = PorterStemmer() def _normalize(self, text: str) -> str: + """Normalize text. + + >>> _normalize('Tittie5') + 'titties' + + >>> _normalize('Suck a Dick!') + 'suck a dick' + + >>> _normalize('fucking a whore') + 'fuck a whore' + + """ result = text.lower() result = result.replace("_", " ") result = result.replace('0', 'o') @@ -479,40 +487,58 @@ class ProfanityFilter(object): result = result.replace('3', 'e') for x in string.punctuation: result = result.replace(x, "") - chunks = [ - self.stemmer.stem(word) for word in nltk.word_tokenize(result) - ] + chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)] return ' '.join(chunks) + @staticmethod + def tokenize(text: str): + for x in nltk.word_tokenize(text): + for y in re.split(r'\W+', x): + yield y + def contains_bad_word(self, text: str) -> bool: - words = nltk.word_tokenize(text) + """Returns True if text contains a bad word (or more than one) + and False if no bad words were detected. + + >>> contains_bad_word('fuck you') + True + + >>> contains_bad_word('FucK u') + True + + >>> contains_bad_word('FuK U') + False + + """ + words = list(self.tokenize(text)) for word in words: if self.is_bad_word(word): - logger.debug(f'"{word}" is profanity') + logger.debug('"%s" is profanity', word) return True if len(words) > 1: for bigram in string_utils.ngrams_presplit(words, 2): bigram = ' '.join(bigram) if self.is_bad_word(bigram): - logger.debug(f'"{bigram}" is profanity') + logger.debug('"%s" is profanity', bigram) return True if len(words) > 2: for trigram in string_utils.ngrams_presplit(words, 3): trigram = ' '.join(trigram) if self.is_bad_word(trigram): - logger.debug(f'"{trigram}" is profanity') + logger.debug('"%s" is profanity', trigram) return True return False def is_bad_word(self, word: str) -> bool: - return ( - word in self.bad_words or - self._normalize(word) in self.bad_words - ) + return word in self.bad_words or self._normalize(word) in self.bad_words def obscure_bad_words(self, text: str) -> str: + """Obscure bad words that are detected by inserting random punctuation + characters. + + """ def obscure(word: str): out = '' @@ -529,7 +555,7 @@ class ProfanityFilter(object): break return out - words = nltk.word_tokenize(text) + words = list(self.tokenize(text)) words.append('') words.append('') words.append('') @@ -556,6 +582,9 @@ class ProfanityFilter(object): def main() -> None: + import doctest + + doctest.testmod() pf = ProfanityFilter() phrase = ' '.join(sys.argv[1:]) print(pf.contains_bad_word(phrase))