#!/usr/bin/env python3
import string
import re


class profanity_filter:
    """Detect and mask blocklisted words and phrases in free text.

    Text is split on whitespace and every run of one, two or three
    consecutive tokens is compared against ``arrBad``.  A run matches when
    any of the following is in the blocklist:

    * the run exactly as written,
    * the run lower-cased with ASCII punctuation removed, or
    * that cleaned form with a trailing ``s``/``es`` removed
      (what :meth:`normalize` returns).

    ``arrBad`` is a plain list and is re-read on every call, so entries may
    be added or removed after construction.
    """

    # Pattern used to build masks; repeated when a token is longer than it.
    _MASK = "!@#$%!@#$%^~!@%^~@#$%!@#$%^~!"
    # A token is a maximal run of non-whitespace (same pieces as str.split()).
    _TOKEN_RE = re.compile(r"\S+")
    # Crude plural stripping applied by normalize().
    _PLURAL_RE = re.compile(r"e?s$")
    # "_" becomes a space; every other ASCII punctuation character is dropped.
    _PUNCT_TABLE = str.maketrans("_", " ", string.punctuation.replace("_", ""))
    # Longest phrase (in tokens) that is looked up in the blocklist.
    _MAX_PHRASE_WORDS = 3

    def __init__(self):
        """Initialise ``arrBad`` with the built-in blocklist."""
        self.arrBad = [
            "acrotomophilia", "anal", "anally", "anilingus", "anus",
            "arsehole", "ass", "asses", "asshole", "assmunch", "auto erotic",
            "autoerotic", "babeland", "baby batter", "ball gag", "ball gravy",
            "ball kicking", "ball licking", "ball sack", "ball zack",
            "ball sucking", "bangbros", "bareback", "barely legal",
            "barenaked", "bastardo", "bastinado", "bbw", "bdsm",
            "beaver cleaver", "beaver lips", "bestiality", "bi curious",
            "big black", "big breasts", "big knockers", "big tits", "bimbos",
            "birdlock", "bitch", "bitches", "black cock", "blonde action",
            "blonde on blonde", "blow j", "blow your l", "blow ourselves",
            "blow m", "blue waffle", "blumpkin", "bollocks", "bondage",
            "boner", "boob", "boobs", "booty call", "breasts",
            "brown showers", "brunette action", "bukkake", "bulldyke",
            "bullshit", "bullet vibe", "bung hole", "bunghole", "busty",
            "butt", "buttcheeks", "butthole", "camel toe", "camgirl",
            "camslut", "camwhore", "carpet muncher", "carpetmuncher",
            "chocolate rosebuds", "circlejerk", "cleveland steamer", "clit",
            "clitoris", "clover clamps", "clusterfuck", "cock", "cocks",
            "coprolagnia", "coprophilia", "cornhole", "creampie", "cream pie",
            "cum", "cumming", "cunnilingus", "cunt", "damn", "darkie",
            "date rape", "daterape", "deep throat", "deepthroat", "dick",
            "dildo", "dirty pillows", "dirty sanchez", "dog style",
            "doggie style", "doggiestyle", "doggy style", "doggystyle",
            "dolcett", "domination", "dominatrix", "dommes", "donkey punch",
            "double dick", "double dong", "double penetration", "dp action",
            "dtf", "eat my ass", "ecchi", "ejaculation", "erection", "erotic",
            "erotism", "escort", "ethical slut", "eunuch", "faggot",
            "posts each week", "fecal", "felch", "fellatio", "feltch",
            "female squirting", "femdom", "figging", "fingering", "fisting",
            "foot fetish", "footjob", "frotting", "fuck", "fucking", "fuckin",
            "fuckin'", "fucked", "fuckers", "fuck buttons", "fuckhead",
            "fudge packer", "fudgepacker", "futanari", "g-spot", "gspot",
            "gang bang", "gay sex", "genitals", "giant cock", "girl on",
            "girl on top", "girls gone wild", "goatcx", "goatse", "goddamn",
            "gokkun", "golden shower", "goo girl", "goodpoop", "goregasm",
            "grope", "group sex", "guro", "hand job", "handjob", "hard core",
            "hardcore", "hentai", "homoerotic", "honkey", "hooker", "horny",
            "hot chick", "how to kill", "how to murder", "huge fat",
            "humping", "incest", "intercourse", "jack off", "jail bait",
            "jailbait", "jerk off", "jerking off", "jigaboo", "jiggaboo",
            "jiggerboo", "jizz", "juggs", "kike", "kinbaku", "kinkster",
            "kinky", "knobbing", "leather restraint", "lemon party", "lolita",
            "lovemaking", "lpt request", "make me come", "male squirting",
            "masturbate", "masturbated", "masturbating", "menage a trois",
            "milf", "milfs", "missionary position", "motherfucker",
            "mound of venus", "mr hands", "muff diver", "muffdiving",
            "nambla", "nawashi", "negro", "neonazi", "nig nog", "nigga",
            "nigger", "nimphomania", "nipple", "not safe for", "nsfw",
            "nsfw images", "nude", "nudity", "nutsack", "nut sack", "nympho",
            "nymphomania", "octopussy", "omorashi", "one night stand",
            "orgasm", "orgy", "paedophile", "panties", "panty", "pedobear",
            "pedophile", "pegging", "pee", "penis", "phone sex", "piss pig",
            "pissing", "pisspig", "playboy", "pleasure chest", "pole smoker",
            "ponyplay", "poof", "poop chute", "poopchute", "porn", "pornhub",
            "porno", "pornography", "prince albert", "pthc", "pube", "pubes",
            "pussy", "pussies", "queaf", "queer", "raghead", "raging boner",
            "rape", "raping", "rapist", "rectum", "reverse cowgirl", "rimjob",
            "rimming", "rosy palm", "rusty trombone", "s&m", "sadism", "scat",
            "schlong", "scissoring", "semen", "sex", "sexo", "sexy",
            "shaved beaver", "shaved pussy", "shemale", "shibari", "shit",
            "shota", "shrimping", "slanteye", "slut", "smut", "snatch",
            "snowballing", "sodomize", "sodomy", "spic", "spooge",
            "spread legs", "strap on", "strapon", "strappado", "strip club",
            "style doggy", "suck", "sucks", "suicide girls", "sultry women",
            "swastika", "swinger", "tainted love", "taste my", "tea bagging",
            "threesome", "throating", "tied up", "tight white", "tit", "tits",
            "titties", "titty", "tongue in a", "topless", "tosser",
            "towelhead", "tranny", "tribadism", "tub girl", "tubgirl",
            "tushy", "twat", "twink", "twinkie", "undressing", "upskirt",
            "urethra play", "urophilia", "vagina", "venus mound", "vibrator",
            "violet blue", "violet wand", "vorarephilia", "voyeur", "vulva",
            "wank", "wet dream", "wetback", "white power", "whore",
            "women rapping", "wrapping men", "wrinkled starfish", "xx", "xxx",
            "yaoi", "yellow showers", "yiffy", "zoophilia",
        ]

    def _clean(self, text: str) -> str:
        """Lower-case *text*, turn ``_`` into a space and drop ASCII punctuation."""
        return text.lower().translate(self._PUNCT_TABLE)

    def normalize(self, text: str) -> str:
        """Return *text* reduced to the form used for blocklist lookups.

        The text is lower-cased, underscores become spaces, every other
        character in ``string.punctuation`` is removed, and finally a
        trailing ``s`` or ``es`` is stripped (``"dickes"`` -> ``"dick"``).
        """
        return self._PLURAL_RE.sub("", self._clean(text))

    def _is_bad(self, phrase: str, bad) -> bool:
        """Return True if *phrase* matches the blocklist set *bad* in any form."""
        if phrase in bad:
            return True
        cleaned = self._clean(phrase)
        # The cleaned form is tested before plural stripping so that listed
        # words which themselves end in "s" ("ass", "penis", ...) still match
        # when capitalised or followed by punctuation.
        return cleaned in bad or self._PLURAL_RE.sub("", cleaned) in bad

    def _find_matches(self, words):
        """Yield ``(start, size, phrase)`` for every blocklisted token run.

        All single tokens are reported first, then all pairs, then all
        triples, each group in left-to-right order.
        """
        # Built once per scan: O(1) lookups while still honouring any edits
        # callers have made to ``arrBad``.
        bad = frozenset(self.arrBad)
        for size in range(1, self._MAX_PHRASE_WORDS + 1):
            for start in range(len(words) - size + 1):
                phrase = " ".join(words[start:start + size])
                if self._is_bad(phrase, bad):
                    yield start, size, phrase

    def _mask(self, length: int) -> str:
        """Return a mask of exactly *length* characters."""
        repeats = -(-length // len(self._MASK))  # ceiling division
        return (self._MASK * repeats)[:length]

    def filter_bad_words(self, text: str) -> str:
        """Return *text* with every blocklisted token replaced by a mask.

        Only the whitespace-delimited tokens that form a match are masked;
        other words that merely contain the same characters are left alone,
        and the original whitespace is preserved.  Each masked token keeps
        its length (attached punctuation included).  Every match is also
        reported on stdout.
        """
        spans = [m.span() for m in self._TOKEN_RE.finditer(text)]
        words = [text[begin:end] for begin, end in spans]

        flagged = set()
        for start, size, phrase in self._find_matches(words):
            label = "WORD" if size == 1 else "PHRASE"
            print(f'***** PROFANITY {label}="{phrase}"')
            flagged.update(range(start, start + size))

        if not flagged:
            return text

        # Rebuild the text piecewise so masking is positional rather than a
        # global substring replacement.
        pieces = []
        cursor = 0
        for index, (begin, end) in enumerate(spans):
            pieces.append(text[cursor:begin])
            if index in flagged:
                pieces.append(self._mask(end - begin))
            else:
                pieces.append(text[begin:end])
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def contains_bad_words(self, text: str) -> bool:
        """Return True if *text* contains any blocklisted word or phrase.

        Scanning stops at the first match, which is reported on stdout.
        """
        for _start, size, phrase in self._find_matches(text.split()):
            label = "WORD" if size == 1 else "PHRASE"
            print(f'***** PROFANITY {label}="{phrase}"')
            return True
        return False


# x = profanity_filter()
# print(x.filter_bad_words("Fuck this auto erotic shit, it's not safe for work."))
# print(x.contains_bad_words("cream pie their daughter."))
# print(x.contains_bad_words("If you tell someone your penis is 6 inches it's pretty believable. If you say it's half a foot no one will believe you."))
# print(x.normalize("dickes"));