9 from nltk.stem import PorterStemmer
11 import decorator_utils
15 logger = logging.getLogger(__name__)
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
21 self.bad_words = set([
134 'chocolate rosebuds',
189 'double penetration',
293 'missionary position',
469 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Normalize text for blocklist lookup.

    Lowercases, converts underscores to spaces, strips punctuation, and
    reduces every token to its Porter stem.

    Args:
        text: the raw word or phrase to normalize.

    Returns:
        The space-joined stemmed tokens, suitable for membership tests
        against self.bad_words.
    """
    result = text.lower()
    # Underscores become spaces *before* punctuation stripping so that
    # "foo_bar" normalizes as two tokens rather than collapsing to "foobar".
    result = result.replace("_", " ")
    # One C-level translate pass instead of len(string.punctuation)
    # chained .replace() calls; deletes the same set of characters.
    result = result.translate(str.maketrans('', '', string.punctuation))
    chunks = [
        self.stemmer.stem(word) for word in nltk.word_tokenize(result)
    ]
    return ' '.join(chunks)
def contains_bad_word(self, text: str) -> bool:
    """Return True iff text contains a profane word, bigram, or trigram.

    Checks single tokens first, then two- and three-word phrases, so
    multi-word entries in the blocklist are also caught.
    """
    words = nltk.word_tokenize(text)
    for word in words:
        if self.is_bad_word(word):
            logger.debug(f'"{word}" is profanity')
            return True
    # Same scan order as checking bigrams, then trigrams, separately.
    for n in (2, 3):
        for gram in string_utils.ngrams_presplit(words, n):
            phrase = ' '.join(gram)
            if self.is_bad_word(phrase):
                logger.debug(f'"{phrase}" is profanity')
                return True
    return False
def is_bad_word(self, word: str) -> bool:
    """Return True iff word (raw or in normalized/stemmed form) is in
    the blocklist.

    Short-circuits on the raw lookup, so _normalize only runs when the
    literal form is not already a known bad word.
    """
    return word in self.bad_words or self._normalize(word) in self.bad_words
def obscure_bad_words(self, text: str) -> str:
    """Return text with detected profane words/phrases masked by symbols."""

    def obscure(word: str):
        # Picks a random masking symbol; presumably one per character of
        # the word being obscured — TODO confirm against full body.
        char = random.choice(['#', '%', '!', '@', '&', '*'])

    words = nltk.word_tokenize(text)
    # Cursor-based scan so that two- and three-word phrases can be
    # masked as a single unit rather than token-by-token.
    # NOTE(review): the `- 3` bound stops three tokens short of the end,
    # so the last few words are presumably handled after the loop —
    # verify trailing bad words are not silently passed through.
    while cursor < len(words) - 3:
        bigram = word + ' ' + words[cursor + 1]
        trigram = bigram + ' ' + words[cursor + 2]
        # Prefer the longest profane match: trigram > bigram > unigram.
        if self.is_bad_word(trigram):
            out += obscure(trigram) + ' '
        elif self.is_bad_word(bigram):
            out += obscure(bigram) + ' '
        elif self.is_bad_word(word):
            out += obscure(word) + ' '
# CLI driver: treat all command-line arguments as one phrase and report
# whether it contains profanity, then print the masked version.
pf = ProfanityFilter()  # class is decorated as a singleton; calls share one instance
phrase = ' '.join(sys.argv[1:])
print(pf.contains_bad_word(phrase))
print(pf.obscure_bad_words(phrase))
560 if __name__ == '__main__':