profanity_filter.py

   1 import string
   2 import re
   3
   4
   5 class profanity_filter:
   6     def __init__(self):
   7         self.arrBad = [
   8             "acrotomophilia",
   9             "anal",
  10             "anally",
  11             "anilingus",
  12             "anus",
  13             "arsehole",
  14             "ass",
  15             "asses",
  16             "asshole",
  17             "assmunch",
  18             "auto erotic",
  19             "autoerotic",
  20             "babeland",
  21             "baby batter",
  22             "ball gag",
  23             "ball gravy",
  24             "ball kicking",
  25             "ball licking",
  26             "ball sack",
  27             "ball zack",
  28             "ball sucking",
  29             "bangbros",
  30             "bareback",
  31             "barely legal",
  32             "barenaked",
  33             "bastardo",
  34             "bastinado",
  35             "bbw",
  36             "bdsm",
  37             "beaver cleaver",
  38             "beaver lips",
  39             "bestiality",
  40             "bi curious",
  41             "big black",
  42             "big breasts",
  43             "big knockers",
  44             "big tits",
  45             "bimbos",
  46             "birdlock",
  47             "bitch",
  48             "bitches",
  49             "black cock",
  50             "blonde action",
  51             "blonde on blonde",
  52             "blow j",
  53             "blow your l",
  54             "blow ourselves",
  55             "blow m",
  56             "blue waffle",
  57             "blumpkin",
  58             "bollocks",
  59             "bondage",
  60             "boner",
  61             "boob",
  62             "boobs",
  63             "booty call",
  64             "breasts",
  65             "brown showers",
  66             "brunette action",
  67             "bukkake",
  68             "bulldyke",
  69             "bullshit",
  70             "bullet vibe",
  71             "bung hole",
  72             "bunghole",
  73             "busty",
  74             "butt",
  75             "buttcheeks",
  76             "butthole",
  77             "camel toe",
  78             "camgirl",
  79             "camslut",
  80             "camwhore",
  81             "carpet muncher",
  82             "carpetmuncher",
  83             "chocolate rosebuds",
  84             "circlejerk",
  85             "cleveland steamer",
  86             "clit",
  87             "clitoris",
  88             "clover clamps",
  89             "clusterfuck",
  90             "cock",
  91             "cocks",
  92             "coprolagnia",
  93             "coprophilia",
  94             "cornhole",
  95             "creampie",
  96             "cream pie",
  97             "cum",
  98             "cumming",
  99             "cunnilingus",
 100             "cunt",
 101             "damn",
 102             "darkie",
 103             "date rape",
 104             "daterape",
 105             "deep throat",
 106             "deepthroat",
 107             "dick",
 108             "dildo",
 109             "dirty pillows",
 110             "dirty sanchez",
 111             "dog style",
 112             "doggie style",
 113             "doggiestyle",
 114             "doggy style",
 115             "doggystyle",
 116             "dolcett",
 117             "domination",
 118             "dominatrix",
 119             "dommes",
 120             "donkey punch",
 121             "double dick",
 122             "double dong",
 123             "double penetration",
 124             "dp action",
 125             "dtf",
 126             "eat my ass",
 127             "ecchi",
 128             "ejaculation",
 129             "erection",
 130             "erotic",
 131             "erotism",
 132             "escort",
 133             "ethical slut",
 134             "eunuch",
 135             "faggot",
 136             "posts each week",
 137             "fecal",
 138             "felch",
 139             "fellatio",
 140             "feltch",
 141             "female squirting",
 142             "femdom",
 143             "figging",
 144             "fingering",
 145             "fisting",
 146             "foot fetish",
 147             "footjob",
 148             "frotting",
 149             "fuck",
 150             "fucking",
 151             "fuckin",
 152             "fuckin'",
 153             "fucked",
 154             "fuckers",
 155             "fuck buttons",
 156             "fuckhead",
 157             "fudge packer",
 158             "fudgepacker",
 159             "futanari",
 160             "g-spot",
 161             "gspot",
 162             "gang bang",
 163             "gay sex",
 164             "genitals",
 165             "giant cock",
 166             "girl on",
 167             "girl on top",
 168             "girls gone wild",
 169             "goatcx",
 170             "goatse",
 171             "goddamn",
 172             "gokkun",
 173             "golden shower",
 174             "goo girl",
 175             "goodpoop",
 176             "goregasm",
 177             "grope",
 178             "group sex",
 179             "guro",
 180             "hand job",
 181             "handjob",
 182             "hard core",
 183             "hardcore",
 184             "hentai",
 185             "homoerotic",
 186             "honkey",
 187             "hooker",
 188             "horny",
 189             "hot chick",
 190             "how to kill",
 191             "how to murder",
 192             "huge fat",
 193             "humping",
 194             "incest",
 195             "intercourse",
 196             "jack off",
 197             "jail bait",
 198             "jailbait",
 199             "jerk off",
 200             "jerking off",
 201             "jigaboo",
 202             "jiggaboo",
 203             "jiggerboo",
 204             "jizz",
 205             "juggs",
 206             "kike",
 207             "kinbaku",
 208             "kinkster",
 209             "kinky",
 210             "knobbing",
 211             "leather restraint",
 212             "lemon party",
 213             "lolita",
 214             "lovemaking",
 215             "lpt request",
 216             "make me come",
 217             "male squirting",
 218             "masturbate",
 219             "masturbated",
 220             "masturbating",
 221             "menage a trois",
 222             "milf",
 223             "milfs",
 224             "missionary position",
 225             "motherfucker",
 226             "mound of venus",
 227             "mr hands",
 228             "muff diver",
 229             "muffdiving",
 230             "nambla",
 231             "nawashi",
 232             "negro",
 233             "neonazi",
 234             "nig nog",
 235             "nigga",
 236             "nigger",
 237             "nimphomania",
 238             "nipple",
 239             "not safe for",
 240             "nsfw",
 241             "nsfw images",
 242             "nude",
 243             "nudity",
 244             "nutsack",
 245             "nut sack",
 246             "nympho",
 247             "nymphomania",
 248             "octopussy",
 249             "omorashi",
 250             "one night stand",
 251             "orgasm",
 252             "orgy",
 253             "paedophile",
 254             "panties",
 255             "panty",
 256             "pedobear",
 257             "pedophile",
 258             "pegging",
 259             "pee",
 260             "penis",
 261             "phone sex",
 262             "piss pig",
 263             "pissing",
 264             "pisspig",
 265             "playboy",
 266             "pleasure chest",
 267             "pole smoker",
 268             "ponyplay",
 269             "poof",
 270             "poop chute",
 271             "poopchute",
 272             "porn",
 273             "pornhub",
 274             "porno",
 275             "pornography",
 276             "prince albert",
 277             "pthc",
 278             "pube",
 279             "pubes",
 280             "pussy",
 281             "pussies",
 282             "queaf",
 283             "queer",
 284             "raghead",
 285             "raging boner",
 286             "rape",
 287             "raping",
 288             "rapist",
 289             "rectum",
 290             "reverse cowgirl",
 291             "rimjob",
 292             "rimming",
 293             "rosy palm",
 294             "rusty trombone",
 295             "s&m",
 296             "sadism",
 297             "scat",
 298             "schlong",
 299             "scissoring",
 300             "semen",
 301             "sex",
 302             "sexo",
 303             "sexy",
 304             "shaved beaver",
 305             "shaved pussy",
 306             "shemale",
 307             "shibari",
 308             "shit",
 309             "shota",
 310             "shrimping",
 311             "slanteye",
 312             "slut",
 313             "smut",
 314             "snatch",
 315             "snowballing",
 316             "sodomize",
 317             "sodomy",
 318             "spic",
 319             "spooge",
 320             "spread legs",
 321             "strap on",
 322             "strapon",
 323             "strappado",
 324             "strip club",
 325             "style doggy",
 326             "suck",
 327             "sucks",
 328             "suicide girls",
 329             "sultry women",
 330             "swastika",
 331             "swinger",
 332             "tainted love",
 333             "taste my",
 334             "tea bagging",
 335             "threesome",
 336             "throating",
 337             "tied up",
 338             "tight white",
 339             "tit",
 340             "tits",
 341             "titties",
 342             "titty",
 343             "tongue in a",
 344             "topless",
 345             "tosser",
 346             "towelhead",
 347             "tranny",
 348             "tribadism",
 349             "tub girl",
 350             "tubgirl",
 351             "tushy",
 352             "twat",
 353             "twink",
 354             "twinkie",
 355             "undressing",
 356             "upskirt",
 357             "urethra play",
 358             "urophilia",
 359             "vagina",
 360             "venus mound",
 361             "vibrator",
 362             "violet blue",
 363             "violet wand",
 364             "vorarephilia",
 365             "voyeur",
 366             "vulva",
 367             "wank",
 368             "wet dream",
 369             "wetback",
 370             "white power",
 371             "whore",
 372             "women rapping",
 373             "wrapping men",
 374             "wrinkled starfish",
 375             "xx",
 376             "xxx",
 377             "yaoi",
 378             "yellow showers",
 379             "yiffy",
 380             "zoophilia",
 381         ]
 382
 383     def normalize(self, text):
 384         result = text.lower()
 385         result = result.replace("_", " ")
 386         for x in string.punctuation:
 387             result = result.replace(x, "")
 388         result = re.sub(r"e?s$", "", result)
 389         return result
 390
 391     def filter_bad_words(self, text):
 392         badWordMask = "!@#$%!@#$%^~!@%^~@#$%!@#$%^~!"
 393
 394         brokenStr1 = text.split()
 395         for word in brokenStr1:
 396             if self.normalize(word) in self.arrBad or word in self.arrBad:
 397                 print(('***** PROFANITY WORD="%s"' % word))
 398                 text = text.replace(word, badWordMask[: len(word)])
 399
 400         if len(brokenStr1) > 1:
 401             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 402             for bigram in bigrams:
 403                 phrase = "%s %s" % (bigram[0], bigram[1])
 404                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 405                     print(('***** PROFANITY PHRASE="%s"' % phrase))
 406                     text = text.replace(bigram[0], badWordMask[: len(bigram[0])])
 407                     text = text.replace(bigram[1], badWordMask[: len(bigram[1])])
 408
 409         if len(brokenStr1) > 2:
 410             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 411             for trigram in trigrams:
 412                 phrase = "%s %s %s" % (trigram[0], trigram[1], trigram[2])
 413                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 414                     print(('***** PROFANITY PHRASE="%s"' % phrase))
 415                     text = text.replace(trigram[0], badWordMask[: len(trigram[0])])
 416                     text = text.replace(trigram[1], badWordMask[: len(trigram[1])])
 417                     text = text.replace(trigram[2], badWordMask[: len(trigram[2])])
 418         return text
 419
 420     def contains_bad_words(self, text):
 421         brokenStr1 = text.split()
 422         for word in brokenStr1:
 423             if self.normalize(word) in self.arrBad or word in self.arrBad:
 424                 print(('***** PROFANITY WORD="%s"' % word))
 425                 return True
 426
 427         if len(brokenStr1) > 1:
 428             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 429             for bigram in bigrams:
 430                 phrase = "%s %s" % (bigram[0], bigram[1])
 431                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 432                     print(('***** PROFANITY PHRASE="%s"' % phrase))
 433                     return True
 434
 435         if len(brokenStr1) > 2:
 436             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 437             for trigram in trigrams:
 438                 phrase = "%s %s %s" % (trigram[0], trigram[1], trigram[2])
 439                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 440                     print(('***** PROFANITY PHRASE="%s"' % phrase))
 441                     return True
 442
 443         return False
 444
 445
 446 # x = profanity_filter()
 447 # print(x.filter_bad_words("Fuck this auto erotic shit, it's not safe for work."))
 448 # print(x.contains_bad_words("cream pie their daughter."))
 449 # print(x.contains_bad_words("If you tell someone your penis is 6 inches it's pretty believable.  If you say it's half a foot no one will believe you."))
 450 # print(x.normalize("dickes"));