profanity_filter.py

   1 #!/usr/bin/env python3
   2
   3 import string
   4 import re
   5
   6
   7 class profanity_filter:
   8     def __init__(self):
   9         self.arrBad = [
  10             "acrotomophilia",
  11             "anal",
  12             "anally",
  13             "anilingus",
  14             "anus",
  15             "arsehole",
  16             "ass",
  17             "asses",
  18             "asshole",
  19             "assmunch",
  20             "auto erotic",
  21             "autoerotic",
  22             "babeland",
  23             "baby batter",
  24             "ball gag",
  25             "ball gravy",
  26             "ball kicking",
  27             "ball licking",
  28             "ball sack",
  29             "ball zack",
  30             "ball sucking",
  31             "bangbros",
  32             "bareback",
  33             "barely legal",
  34             "barenaked",
  35             "bastardo",
  36             "bastinado",
  37             "bbw",
  38             "bdsm",
  39             "beaver cleaver",
  40             "beaver lips",
  41             "bestiality",
  42             "bi curious",
  43             "big black",
  44             "big breasts",
  45             "big knockers",
  46             "big tits",
  47             "bimbos",
  48             "birdlock",
  49             "bitch",
  50             "bitches",
  51             "black cock",
  52             "blonde action",
  53             "blonde on blonde",
  54             "blow j",
  55             "blow your l",
  56             "blow ourselves",
  57             "blow m",
  58             "blue waffle",
  59             "blumpkin",
  60             "bollocks",
  61             "bondage",
  62             "boner",
  63             "boob",
  64             "boobs",
  65             "booty call",
  66             "breasts",
  67             "brown showers",
  68             "brunette action",
  69             "bukkake",
  70             "bulldyke",
  71             "bullshit",
  72             "bullet vibe",
  73             "bung hole",
  74             "bunghole",
  75             "busty",
  76             "butt",
  77             "buttcheeks",
  78             "butthole",
  79             "camel toe",
  80             "camgirl",
  81             "camslut",
  82             "camwhore",
  83             "carpet muncher",
  84             "carpetmuncher",
  85             "chocolate rosebuds",
  86             "circlejerk",
  87             "cleveland steamer",
  88             "clit",
  89             "clitoris",
  90             "clover clamps",
  91             "clusterfuck",
  92             "cock",
  93             "cocks",
  94             "coprolagnia",
  95             "coprophilia",
  96             "cornhole",
  97             "creampie",
  98             "cream pie",
  99             "cum",
 100             "cumming",
 101             "cunnilingus",
 102             "cunt",
 103             "damn",
 104             "darkie",
 105             "date rape",
 106             "daterape",
 107             "deep throat",
 108             "deepthroat",
 109             "dick",
 110             "dildo",
 111             "dirty pillows",
 112             "dirty sanchez",
 113             "dog style",
 114             "doggie style",
 115             "doggiestyle",
 116             "doggy style",
 117             "doggystyle",
 118             "dolcett",
 119             "domination",
 120             "dominatrix",
 121             "dommes",
 122             "donkey punch",
 123             "double dick",
 124             "double dong",
 125             "double penetration",
 126             "dp action",
 127             "dtf",
 128             "eat my ass",
 129             "ecchi",
 130             "ejaculation",
 131             "erection",
 132             "erotic",
 133             "erotism",
 134             "escort",
 135             "ethical slut",
 136             "eunuch",
 137             "faggot",
 138             "posts each week",
 139             "fecal",
 140             "felch",
 141             "fellatio",
 142             "feltch",
 143             "female squirting",
 144             "femdom",
 145             "figging",
 146             "fingering",
 147             "fisting",
 148             "foot fetish",
 149             "footjob",
 150             "frotting",
 151             "fuck",
 152             "fucking",
 153             "fuckin",
 154             "fuckin'",
 155             "fucked",
 156             "fuckers",
 157             "fuck buttons",
 158             "fuckhead",
 159             "fudge packer",
 160             "fudgepacker",
 161             "futanari",
 162             "g-spot",
 163             "gspot",
 164             "gang bang",
 165             "gay sex",
 166             "genitals",
 167             "giant cock",
 168             "girl on",
 169             "girl on top",
 170             "girls gone wild",
 171             "goatcx",
 172             "goatse",
 173             "goddamn",
 174             "gokkun",
 175             "golden shower",
 176             "goo girl",
 177             "goodpoop",
 178             "goregasm",
 179             "grope",
 180             "group sex",
 181             "guro",
 182             "hand job",
 183             "handjob",
 184             "hard core",
 185             "hardcore",
 186             "hentai",
 187             "homoerotic",
 188             "honkey",
 189             "hooker",
 190             "horny",
 191             "hot chick",
 192             "how to kill",
 193             "how to murder",
 194             "huge fat",
 195             "humping",
 196             "incest",
 197             "intercourse",
 198             "jack off",
 199             "jail bait",
 200             "jailbait",
 201             "jerk off",
 202             "jerking off",
 203             "jigaboo",
 204             "jiggaboo",
 205             "jiggerboo",
 206             "jizz",
 207             "juggs",
 208             "kike",
 209             "kinbaku",
 210             "kinkster",
 211             "kinky",
 212             "knobbing",
 213             "leather restraint",
 214             "lemon party",
 215             "lolita",
 216             "lovemaking",
 217             "lpt request",
 218             "make me come",
 219             "male squirting",
 220             "masturbate",
 221             "masturbated",
 222             "masturbating",
 223             "menage a trois",
 224             "milf",
 225             "milfs",
 226             "missionary position",
 227             "motherfucker",
 228             "mound of venus",
 229             "mr hands",
 230             "muff diver",
 231             "muffdiving",
 232             "nambla",
 233             "nawashi",
 234             "negro",
 235             "neonazi",
 236             "nig nog",
 237             "nigga",
 238             "nigger",
 239             "nimphomania",
 240             "nipple",
 241             "not safe for",
 242             "nsfw",
 243             "nsfw images",
 244             "nude",
 245             "nudity",
 246             "nutsack",
 247             "nut sack",
 248             "nympho",
 249             "nymphomania",
 250             "octopussy",
 251             "omorashi",
 252             "one night stand",
 253             "orgasm",
 254             "orgy",
 255             "paedophile",
 256             "panties",
 257             "panty",
 258             "pedobear",
 259             "pedophile",
 260             "pegging",
 261             "pee",
 262             "penis",
 263             "phone sex",
 264             "piss pig",
 265             "pissing",
 266             "pisspig",
 267             "playboy",
 268             "pleasure chest",
 269             "pole smoker",
 270             "ponyplay",
 271             "poof",
 272             "poop chute",
 273             "poopchute",
 274             "porn",
 275             "pornhub",
 276             "porno",
 277             "pornography",
 278             "prince albert",
 279             "pthc",
 280             "pube",
 281             "pubes",
 282             "pussy",
 283             "pussies",
 284             "queaf",
 285             "queer",
 286             "raghead",
 287             "raging boner",
 288             "rape",
 289             "raping",
 290             "rapist",
 291             "rectum",
 292             "reverse cowgirl",
 293             "rimjob",
 294             "rimming",
 295             "rosy palm",
 296             "rusty trombone",
 297             "s&m",
 298             "sadism",
 299             "scat",
 300             "schlong",
 301             "scissoring",
 302             "semen",
 303             "sex",
 304             "sexo",
 305             "sexy",
 306             "shaved beaver",
 307             "shaved pussy",
 308             "shemale",
 309             "shibari",
 310             "shit",
 311             "shota",
 312             "shrimping",
 313             "slanteye",
 314             "slut",
 315             "smut",
 316             "snatch",
 317             "snowballing",
 318             "sodomize",
 319             "sodomy",
 320             "spic",
 321             "spooge",
 322             "spread legs",
 323             "strap on",
 324             "strapon",
 325             "strappado",
 326             "strip club",
 327             "style doggy",
 328             "suck",
 329             "sucks",
 330             "suicide girls",
 331             "sultry women",
 332             "swastika",
 333             "swinger",
 334             "tainted love",
 335             "taste my",
 336             "tea bagging",
 337             "threesome",
 338             "throating",
 339             "tied up",
 340             "tight white",
 341             "tit",
 342             "tits",
 343             "titties",
 344             "titty",
 345             "tongue in a",
 346             "topless",
 347             "tosser",
 348             "towelhead",
 349             "tranny",
 350             "tribadism",
 351             "tub girl",
 352             "tubgirl",
 353             "tushy",
 354             "twat",
 355             "twink",
 356             "twinkie",
 357             "undressing",
 358             "upskirt",
 359             "urethra play",
 360             "urophilia",
 361             "vagina",
 362             "venus mound",
 363             "vibrator",
 364             "violet blue",
 365             "violet wand",
 366             "vorarephilia",
 367             "voyeur",
 368             "vulva",
 369             "wank",
 370             "wet dream",
 371             "wetback",
 372             "white power",
 373             "whore",
 374             "women rapping",
 375             "wrapping men",
 376             "wrinkled starfish",
 377             "xx",
 378             "xxx",
 379             "yaoi",
 380             "yellow showers",
 381             "yiffy",
 382             "zoophilia",
 383         ]
 384
 385     def normalize(self, text: str) -> str:
 386         result = text.lower()
 387         result = result.replace("_", " ")
 388         for x in string.punctuation:
 389             result = result.replace(x, "")
 390         result = re.sub(r"e?s$", "", result)
 391         return result
 392
 393     def filter_bad_words(self, text: str) -> str:
 394         badWordMask = "!@#$%!@#$%^~!@%^~@#$%!@#$%^~!"
 395
 396         brokenStr1 = text.split()
 397         for word in brokenStr1:
 398             if self.normalize(word) in self.arrBad or word in self.arrBad:
 399                 print(f'***** PROFANITY WORD="{word}"')
 400                 text = text.replace(word, badWordMask[: len(word)])
 401
 402         if len(brokenStr1) > 1:
 403             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 404             for bigram in bigrams:
 405                 phrase = f"{bigram[0]} {bigram[1]}"
 406                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 407                     print(f'***** PROFANITY PHRASE="{phrase}"')
 408                     text = text.replace(bigram[0], badWordMask[: len(bigram[0])])
 409                     text = text.replace(bigram[1], badWordMask[: len(bigram[1])])
 410
 411         if len(brokenStr1) > 2:
 412             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 413             for trigram in trigrams:
 414                 phrase = f"{trigram[0]} {trigram[1]} {trigram[2]}"
 415                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 416                     print(f'***** PROFANITY PHRASE="{phrase}"')
 417                     text = text.replace(trigram[0], badWordMask[: len(trigram[0])])
 418                     text = text.replace(trigram[1], badWordMask[: len(trigram[1])])
 419                     text = text.replace(trigram[2], badWordMask[: len(trigram[2])])
 420         return text
 421
 422     def contains_bad_words(self, text: str) -> bool:
 423         brokenStr1 = text.split()
 424         for word in brokenStr1:
 425             if self.normalize(word) in self.arrBad or word in self.arrBad:
 426                 print(f'***** PROFANITY WORD="{word}"')
 427                 return True
 428
 429         if len(brokenStr1) > 1:
 430             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 431             for bigram in bigrams:
 432                 phrase = f"{bigram[0]} {bigram[1]}"
 433                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 434                     print(f'***** PROFANITY PHRASE="{phrase}"')
 435                     return True
 436
 437         if len(brokenStr1) > 2:
 438             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 439             for trigram in trigrams:
 440                 phrase = f"{trigram[0]} {trigram[1]} {trigram[2]}"
 441                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 442                     print(f'***** PROFANITY PHRASE="{phrase}"')
 443                     return True
 444         return False
 445
 446
 447 # x = profanity_filter()
 448 # print(x.filter_bad_words("Fuck this auto erotic shit, it's not safe for work."))
 449 # print(x.contains_bad_words("cream pie their daughter."))
 450 # print(x.contains_bad_words("If you tell someone your penis is 6 inches it's pretty believable.  If you say it's half a foot no one will believe you."))
 451 # print(x.normalize("dickes"));