#!/usr/bin/env python3

# © Copyright 2021-2022, Scott Gasch

"""A helper to identify and optionally obscure some bad words.  Not
perfect but decent.  Uses a fuzzy block list rather than ML."""

import logging
import random
import re
import string
import sys

import nltk
from nltk.stem import PorterStemmer

import decorator_utils
import string_utils

logger = logging.getLogger(__name__)


@decorator_utils.singleton
class ProfanityFilter(object):
    """A helper to identify and optionally obscure some bad words."""

    def __init__(self):
        self.bad_words = set(
            [
                'acrotomophilia', 'anal', 'analingu', 'anally', 'anilingu', 'anus',
                'arsehol', 'arsehole', 'ass', 'asshol', 'asshole', 'assmunch',
                'auto erot', 'auto erotic', 'autoerotic', 'babeland', 'babi batter',
                'baby batter', 'ball gag', 'ball gravi', 'ball gravy', 'ball kick',
                'ball kicking', 'ball lick', 'ball licking', 'ball sack', 'ball suck',
                'ball sucking', 'ball zack', 'bangbro', 'bare legal', 'bareback',
                'barely legal', 'barenak', 'barenaked', 'bastardo', 'bastinado',
                'bbc', 'bbw', 'bdsm', 'beaver cleaver', 'beaver lip', 'bestial',
                'bestiality', 'bi curiou', 'big black', 'big breast', 'big knocker',
                'big tit', 'bimbo', 'birdlock', 'bitch', 'black cock', 'blond action',
                'blond on blond', 'blonde action', 'blow j', 'blow job', 'blowjob',
                'blow my', 'blow me', 'blow ourselv', 'blow ourselves',
                'blow your load', 'blue waffl', 'blue waffle', 'blumpkin', 'bollock',
                'bondag', 'bondage', 'boner', 'boob', 'booti call', 'booty call',
                'breast', 'brown shower', 'brunett action', 'brunette action',
                'bukkak', 'bukkake', 'bulldyk', 'bulldyke', 'bullet vibe', 'bullshit',
                'bung hole', 'bunghol', 'bunghole', 'busti', 'busty', 'butt',
                'buttcheek', 'butthol', 'butthole', 'camel toe', 'camgirl', 'camslut',
                'camwhore', 'carpet muncher', 'carpetmuncher', 'chocol rosebud',
                'circlejerk', 'chink', 'cleveland steamer', 'clit', 'clitor',
                'clitori', 'clover clamp', 'clusterfuck', 'cluster fuck', 'cock',
                'coprolagnia', 'coprophilia', 'cornhol', 'cornhole', 'cream pie',
                'creampi', 'creampie', 'cum', 'cumming', 'cunnilingu', 'cunt', 'damn',
                'darki', 'darkie', 'date rape', 'daterap', 'daterape', 'deep throat',
                'deepthroat', 'dick', 'dildo', 'dirti pillow', 'dirti sanchez',
                'dirty pillow', 'dirty sanchez', 'dog style', 'doggi style',
                'doggie style', 'doggiestyl', 'doggiestyle', 'doggystyle', 'dolcett',
                'domination', 'dominatrix', 'domm', 'dommes', 'donkey punch',
                'doubl dick', 'doubl dong', 'doubl penetr', 'double dick',
                'double dong', 'double penetration', 'dp action', 'dtf', 'eat my ass',
                'ecchi', 'ejacul', 'erection', 'erotic', 'erotism', 'escort',
                'ethical slut', 'eunuch', 'faggot', 'fecal', 'felch', 'fellatio',
                'feltch', 'female squirting', 'femdom', 'figging', 'finger', 'fist',
                'foot fetish', 'footjob', 'frotting', 'fuck button', 'fuck', 'fucked',
                'fucker', 'fuckhead', 'fuckin', 'fucking', 'fudge packer',
                'fudgepack', 'fudgepacker', 'futanari', 'g spot', 'g-spot',
                'gang bang', 'gay sex', 'gee spot', 'genital', 'giant cock',
                'girl gone wild', 'girl on top', 'girl on', 'give head',
                'giving head', 'gave head', 'gave you head', 'gave him head',
                'gave them head', 'gave us head', 'glori hole', 'goatcx', 'goatse',
                'goddamn', 'gokkun', 'golden shower', 'goo girl', 'goodpoop',
                'goregasm', 'grope', 'group sex', 'gspot', 'guro', 'hand job',
                'handjob', 'hard core', 'hardcore', 'hentai', 'homoerotic', 'honkey',
                'hooker', 'horni', 'horny', 'hot chick', 'how to kill',
                'how to murder', 'huge fat', 'humped', 'humping', 'hump', 'incest',
                'intercourse', 'jack off', 'jail bait', 'jailbait', 'jerk off',
                'jigaboo', 'jiggaboo', 'jiggerboo', 'jizz', 'jugg', 'kike',
                'kinbaku', 'kinkster', 'kinky', 'knobbing', 'leather restraint',
                'lemon party', 'lolita', 'lovemaking', 'make me come',
                'male squirting', 'masturb', 'menage a trois', 'menag a troi',
                'milf', 'missionary position', 'motherfuck', 'mound of venu',
                'mr hand', 'muff diver', 'muffdiv', 'muffdiving', 'nambla',
                'nawashi', 'negro', 'neonazi', 'nig nog', 'nigga', 'nigger',
                'nimphomania', 'nipple', 'nip', 'not safe for', 'nsfl', 'nsfw',
                'nude', 'nudity', 'nut sack', 'nutsack', 'nympho', 'nymphomania',
                'octopussy', 'omorashi', 'one night stand', 'orgasm', 'orgy',
                'paedophil', 'paedophile', 'panties', 'panti', 'pedobear',
                'pedophil', 'pedophile', 'pee', 'pegging', 'peni', 'penis',
                'phone sex', 'pigfucker', 'piss pig', 'piss', 'pissing', 'pisspig',
                'playboy', 'pleasure chest', 'pole smoker', 'ponyplay', 'poof',
                'poop chute', 'poopchute', 'porn', 'pron', 'pornhub', 'porno',
                'pornographi', 'pornography', 'prince albert', 'pthc', 'pube',
                'pussi', 'pussies', 'pussy', 'queaf', 'queer', 'raghead',
                'raging boner', 'rape', 'raping', 'rapist', 'rectum',
                'reverse cowgirl', 'rimjob', 'rimming', 'rosy palm',
                'rusty trombone', 's & m', 's&m', 's+m', 'sadism', 'scat',
                'schlong', 'scissoring', 'semen', 'sex', 'sexi', 'sexo', 'sexy',
                'shave beaver', 'shave pussi', 'shemale', 'shibari', 'shit',
                'shota', 'shrimping', 'slanteye', 'slut', 'smut', 'snatch', 'snm',
                'snowballing', 'sodomi', 'sodomize', 'sodomy', 'spic', 'spooge',
                'spread leg', 'squirting', 'strap on', 'strapon', 'strappado',
                'strip club', 'style doggy', 'suck', 'suicid girl', 'sultry women',
                'swastika', 'swinger', 'taint', 'tainted love', 'taste my',
                'tea bagging', 'threesome', 'throating', 'tied up', 'tight white',
                'tit', 'titti', 'titties', 'titty', 'tongue in', 'topless',
                'tosser', 'towelhead', 'tranny', 'tribadism', 'tub girl', 'tubgirl',
                'tushy', 'twat', 'twink', 'twinki', 'twinkie', 'undress', 'upskirt',
                'urethra play', 'urophilia', 'vag', 'vagina', 'venu mound',
                'vibrator', 'violet blue', 'violet wand', 'vorarephilia', 'voyeur',
                'vulva', 'wank', 'wet dream', 'wetback', 'white power', 'whore',
                'women rapping', 'wrapping men', 'wrinkled starfish', 'xx', 'xxx',
                'yaoi', 'yellow shower', 'yiffy', 'zoophilia',
            ]
        )
        self.stemmer = PorterStemmer()

    def _normalize(self, text: str) -> str:
        """Normalize text: lowercase it, undo some common leet-speak
        digit-for-letter substitutions, strip punctuation, and stem each
        word with the Porter stemmer.

        >>> pf = ProfanityFilter()
        >>> pf._normalize('Tittie5')
        'titti'

        >>> pf._normalize('Suck a Dick!')
        'suck a dick'

        >>> pf._normalize('fucking a whore')
        'fuck a whore'

        >>> pf._normalize('pu55y')
        'pussi'
        """
        result = text.lower()
        result = result.replace("_", " ")

        # Undo simple leet-speak substitutions before stemming.
        result = result.replace('0', 'o')
        result = result.replace('1', 'l')
        result = result.replace('4', 'a')
        result = result.replace('5', 's')
        result = result.replace('3', 'e')
        for x in string.punctuation:
            result = result.replace(x, "")
        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
        return ' '.join(chunks)

    @staticmethod
    def tokenize(text: str):
        """Tokenize text into word-like chunks."""
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
                yield y

    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.

        >>> pf = ProfanityFilter()
        >>> pf.contains_bad_word('fuck you')
        True

        >>> pf.contains_bad_word('FucK u')
        True

        >>> pf.contains_bad_word('FuK U')
        False
        """
        words = list(self.tokenize(text))

        # Check single words first, then bigrams and trigrams, because
        # some entries in the block list are multi-word phrases.
        for word in words:
            if self.is_bad_word(word):
                logger.debug('"%s" is profanity', word)
                return True

        if len(words) > 1:
            for bigram in string_utils.ngrams_presplit(words, 2):
                bigram = ' '.join(bigram)
                if self.is_bad_word(bigram):
                    logger.debug('"%s" is profanity', bigram)
                    return True

        if len(words) > 2:
            for trigram in string_utils.ngrams_presplit(words, 3):
                trigram = ' '.join(trigram)
                if self.is_bad_word(trigram):
                    logger.debug('"%s" is profanity', trigram)
                    return True
        return False

    def is_bad_word(self, word: str) -> bool:
        """True if we think word is a bad word."""
        return word in self.bad_words or self._normalize(word) in self.bad_words

    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by replacing them with
        random punctuation characters."""

        def obscure(word: str):
            # Replace every non-space character with a random punctuation
            # mark, never repeating the same mark twice in a row.
            out = ''
            last = ''
            for letter in word:
                if letter.isspace():
                    out += letter
                else:
                    while True:
                        char = random.choice(['#', '%', '!', '@', '&', '*'])
                        if last != char:
                            last = char
                            out += char
                            break
            return out

        words = list(self.tokenize(text))

        # Pad with empty strings so the bigram/trigram lookahead below
        # never runs off the end of the list.
        words.append('')
        words.append('')
        words.append('')

        out = ''
        cursor = 0
        while cursor < len(words) - 3:
            word = words[cursor]
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
                cursor += 3
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
                cursor += 2
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
                cursor += 1
            else:
                out += word + ' '
                cursor += 1
        return out.strip()


def main() -> None:
    import doctest

    doctest.testmod()
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))
    sys.exit(0)


if __name__ == '__main__':
    main()
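
# Example of using ProfanityFilter as a library rather than via the CLI
# above.  This is a minimal sketch: the import path below is an assumption
# (use whatever module name this file has in your tree), and
# nltk.word_tokenize typically requires the NLTK 'punkt' tokenizer data to
# have been downloaded first (e.g. nltk.download('punkt')).
#
#     from profanity_filter import ProfanityFilter  # hypothetical module name
#
#     pf = ProfanityFilter()
#     comment = 'some user-supplied text'
#     if pf.contains_bad_word(comment):
#         comment = pf.obscure_bad_words(comment)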