pf.py

   1 #!/usr/bin/env python3
   2
   3 import string
   4 import re
   5
   6
   7 class profanity_filter:
   8     def __init__(self):
   9         self.arrBad = [
  10             "acrotomophilia",
  11             "anal",
  12             "anally",
  13             "anilingus",
  14             "anus",
  15             "arsehole",
  16             "ass",
  17             "asses",
  18             "asshole",
  19             "assmunch",
  20             "auto erotic",
  21             "autoerotic",
  22             "babeland",
  23             "baby batter",
  24             "ball gag",
  25             "ball gravy",
  26             "ball kicking",
  27             "ball licking",
  28             "ball sack",
  29             "ball zack",
  30             "ball sucking",
  31             "bangbros",
  32             "bareback",
  33             "barely legal",
  34             "barenaked",
  35             "bastardo",
  36             "bastinado",
  37             "bbw",
  38             "bdsm",
  39             "beaver cleaver",
  40             "beaver lips",
  41             "bestiality",
  42             "bi curious",
  43             "big black",
  44             "big breasts",
  45             "big knockers",
  46             "big tits",
  47             "bimbos",
  48             "birdlock",
  49             "bitch",
  50             "bitches",
  51             "black cock",
  52             "blonde action",
  53             "blonde on blonde",
  54             "blow j",
  55             "blow your l",
  56             "blow ourselves",
  57             "blow m",
  58             "blue waffle",
  59             "blumpkin",
  60             "bollocks",
  61             "bondage",
  62             "boner",
  63             "boob",
  64             "boobs",
  65             "booty call",
  66             "breasts",
  67             "brown showers",
  68             "brunette action",
  69             "bukkake",
  70             "bulldyke",
  71             "bullshit",
  72             "bullet vibe",
  73             "bung hole",
  74             "bunghole",
  75             "busty",
  76             "butt",
  77             "buttcheeks",
  78             "butthole",
  79             "camel toe",
  80             "camgirl",
  81             "camslut",
  82             "camwhore",
  83             "carpet muncher",
  84             "carpetmuncher",
  85             "chocolate rosebuds",
  86             "circlejerk",
  87             "cleveland steamer",
  88             "clit",
  89             "clitoris",
  90             "clover clamps",
  91             "clusterfuck",
  92             "cock",
  93             "cocks",
  94             "coprolagnia",
  95             "coprophilia",
  96             "cornhole",
  97             "creampie",
  98             "cream pie",
  99             "cum",
 100             "cumming",
 101             "cunnilingus",
 102             "cunt",
 103             "damn",
 104             "darkie",
 105             "date rape",
 106             "daterape",
 107             "deep throat",
 108             "deepthroat",
 109             "dick",
 110             "dildo",
 111             "dirty pillows",
 112             "dirty sanchez",
 113             "dog style",
 114             "doggie style",
 115             "doggiestyle",
 116             "doggy style",
 117             "doggystyle",
 118             "dolcett",
 119             "domination",
 120             "dominatrix",
 121             "dommes",
 122             "donkey punch",
 123             "double dick",
 124             "double dong",
 125             "double penetration",
 126             "dp action",
 127             "dtf",
 128             "eat my ass",
 129             "ecchi",
 130             "ejaculation",
 131             "erection",
 132             "erotic",
 133             "erotism",
 134             "escort",
 135             "ethical slut",
 136             "eunuch",
 137             "faggot",
 138             "posts each week",
 139             "fecal",
 140             "felch",
 141             "fellatio",
 142             "feltch",
 143             "female squirting",
 144             "femdom",
 145             "figging",
 146             "fingering",
 147             "fisting",
 148             "foot fetish",
 149             "footjob",
 150             "frotting",
 151             "fuck",
 152             "fucking",
 153             "fuckin",
 154             "fuckin'",
 155             "fucked",
 156             "fuckers",
 157             "fuck buttons",
 158             "fuckhead",
 159             "fudge packer",
 160             "fudgepacker",
 161             "futanari",
 162             "g-spot",
 163             "gspot",
 164             "gang bang",
 165             "gay sex",
 166             "genitals",
 167             "giant cock",
 168             "girl on",
 169             "girl on top",
 170             "girls gone wild",
 171             "goatcx",
 172             "goatse",
 173             "goddamn",
 174             "gokkun",
 175             "golden shower",
 176             "goo girl",
 177             "goodpoop",
 178             "goregasm",
 179             "grope",
 180             "group sex",
 181             "guro",
 182             "hand job",
 183             "handjob",
 184             "hard core",
 185             "hardcore",
 186             "hentai",
 187             "homoerotic",
 188             "honkey",
 189             "hooker",
 190             "horny",
 191             "hot chick",
 192             "how to kill",
 193             "how to murder",
 194             "huge fat",
 195             "humping",
 196             "incest",
 197             "intercourse",
 198             "jack off",
 199             "jail bait",
 200             "jailbait",
 201             "jerk off",
 202             "jerking off",
 203             "jigaboo",
 204             "jiggaboo",
 205             "jiggerboo",
 206             "jizz",
 207             "juggs",
 208             "kike",
 209             "kinbaku",
 210             "kinkster",
 211             "kinky",
 212             "knobbing",
 213             "leather restraint",
 214             "lemon party",
 215             "lolita",
 216             "lovemaking",
 217             "lpt request",
 218             "make me come",
 219             "male squirting",
 220             "masturbate",
 221             "masturbated",
 222             "masturbating",
 223             "menage a trois",
 224             "milf",
 225             "milfs",
 226             "missionary position",
 227             "motherfucker",
 228             "mound of venus",
 229             "mr hands",
 230             "muff diver",
 231             "muffdiving",
 232             "nambla",
 233             "nawashi",
 234             "negro",
 235             "neonazi",
 236             "nig nog",
 237             "nigga",
 238             "nigger",
 239             "nimphomania",
 240             "nipple",
 241             "not safe for",
 242             "nsfw",
 243             "nsfw images",
 244             "nude",
 245             "nudity",
 246             "nutsack",
 247             "nut sack",
 248             "nympho",
 249             "nymphomania",
 250             "octopussy",
 251             "omorashi",
 252             "one night stand",
 253             "onlyfans",
 254             "orgasm",
 255             "orgy",
 256             "paedophile",
 257             "panties",
 258             "panty",
 259             "pedobear",
 260             "pedophile",
 261             "pegging",
 262             "pee",
 263             "penis",
 264             "phone sex",
 265             "piss pig",
 266             "pissing",
 267             "pisspig",
 268             "playboy",
 269             "pleasure chest",
 270             "pole smoker",
 271             "ponyplay",
 272             "poof",
 273             "poop chute",
 274             "poopchute",
 275             "porn",
 276             "pornhub",
 277             "porno",
 278             "pornography",
 279             "prince albert",
 280             "pthc",
 281             "pube",
 282             "pubes",
 283             "pussy",
 284             "pussies",
 285             "queaf",
 286             "queer",
 287             "raghead",
 288             "raging boner",
 289             "rape",
 290             "raping",
 291             "rapist",
 292             "rectum",
 293             "reverse cowgirl",
 294             "rimjob",
 295             "rimming",
 296             "rosy palm",
 297             "rusty trombone",
 298             "s&m",
 299             "sadism",
 300             "scat",
 301             "schlong",
 302             "scissoring",
 303             "semen",
 304             "sex",
 305             "sexo",
 306             "sexy",
 307             "shaved beaver",
 308             "shaved pussy",
 309             "shemale",
 310             "shibari",
 311             "shit",
 312             "shota",
 313             "shrimping",
 314             "slanteye",
 315             "slut",
 316             "smut",
 317             "snatch",
 318             "snowballing",
 319             "sodomize",
 320             "sodomy",
 321             "spic",
 322             "spooge",
 323             "spread legs",
 324             "strap on",
 325             "strapon",
 326             "strappado",
 327             "strip club",
 328             "style doggy",
 329             "suck",
 330             "sucks",
 331             "suicide girls",
 332             "sultry women",
 333             "swastika",
 334             "swinger",
 335             "tainted love",
 336             "taste my",
 337             "tea bagging",
 338             "threesome",
 339             "throating",
 340             "tied up",
 341             "tight white",
 342             "tit",
 343             "tits",
 344             "titties",
 345             "titty",
 346             "tongue in a",
 347             "topless",
 348             "tosser",
 349             "towelhead",
 350             "tranny",
 351             "tribadism",
 352             "tub girl",
 353             "tubgirl",
 354             "tushy",
 355             "twat",
 356             "twink",
 357             "twinkie",
 358             "undressing",
 359             "upskirt",
 360             "urethra play",
 361             "urophilia",
 362             "vagina",
 363             "venus mound",
 364             "vibrator",
 365             "violet blue",
 366             "violet wand",
 367             "vorarephilia",
 368             "voyeur",
 369             "vulva",
 370             "wank",
 371             "wet dream",
 372             "wetback",
 373             "white power",
 374             "whore",
 375             "women rapping",
 376             "wrapping men",
 377             "wrinkled starfish",
 378             "xx",
 379             "xxx",
 380             "yaoi",
 381             "yellow showers",
 382             "yiffy",
 383             "zoophilia",
 384         ]
 385
 386     def normalize(self, text: str) -> str:
 387         result = text.lower()
 388         result = result.replace("_", " ")
 389         for x in string.punctuation:
 390             result = result.replace(x, "")
 391         result = re.sub(r"e?s$", "", result)
 392         return result
 393
 394     def filter_bad_words(self, text: str) -> str:
 395         badWordMask = "!@#$%!@#$%^~!@%^~@#$%!@#$%^~!"
 396
 397         brokenStr1 = text.split()
 398         for word in brokenStr1:
 399             if self.normalize(word) in self.arrBad or word in self.arrBad:
 400                 print(f'***** PROFANITY WORD="{word}"')
 401                 text = text.replace(word, badWordMask[: len(word)])
 402
 403         if len(brokenStr1) > 1:
 404             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 405             for bigram in bigrams:
 406                 phrase = f"{bigram[0]} {bigram[1]}"
 407                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 408                     print(f'***** PROFANITY PHRASE="{phrase}"')
 409                     text = text.replace(bigram[0], badWordMask[: len(bigram[0])])
 410                     text = text.replace(bigram[1], badWordMask[: len(bigram[1])])
 411
 412         if len(brokenStr1) > 2:
 413             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 414             for trigram in trigrams:
 415                 phrase = f"{trigram[0]} {trigram[1]} {trigram[2]}"
 416                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 417                     print(f'***** PROFANITY PHRASE="{phrase}"')
 418                     text = text.replace(trigram[0], badWordMask[: len(trigram[0])])
 419                     text = text.replace(trigram[1], badWordMask[: len(trigram[1])])
 420                     text = text.replace(trigram[2], badWordMask[: len(trigram[2])])
 421         return text
 422
 423     def contains_bad_words(self, text: str) -> bool:
 424         brokenStr1 = text.split()
 425         for word in brokenStr1:
 426             if self.normalize(word) in self.arrBad or word in self.arrBad:
 427                 print(f'***** PROFANITY WORD="{word}"')
 428                 return True
 429
 430         if len(brokenStr1) > 1:
 431             bigrams = list(zip(brokenStr1, brokenStr1[1:]))
 432             for bigram in bigrams:
 433                 phrase = f"{bigram[0]} {bigram[1]}"
 434                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 435                     print(f'***** PROFANITY PHRASE="{phrase}"')
 436                     return True
 437
 438         if len(brokenStr1) > 2:
 439             trigrams = list(zip(brokenStr1, brokenStr1[1:], brokenStr1[2:]))
 440             for trigram in trigrams:
 441                 phrase = f"{trigram[0]} {trigram[1]} {trigram[2]}"
 442                 if self.normalize(phrase) in self.arrBad or phrase in self.arrBad:
 443                     print(f'***** PROFANITY PHRASE="{phrase}"')
 444                     return True
 445         return False
 446
 447
 448 # x = profanity_filter()
 449 # print(x.filter_bad_words("Fuck this auto erotic shit, it's not safe for work."))
 450 # print(x.contains_bad_words("cream pie their daughter."))
 451 # print(x.contains_bad_words("If you tell someone your penis is 6 inches it's pretty believable.  If you say it's half a foot no one will believe you."))
 452 # print(x.normalize("dickes"));