9 from nltk.stem import PorterStemmer
11 import decorator_utils
15 logger = logging.getLogger(__name__)
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
21 self.bad_words = set([
134 'chocolate rosebuds',
189 'double penetration',
293 'missionary position',
470 self.stemmer = PorterStemmer()
472 def _normalize(self, text: str) -> str:
475 >>> _normalize('Tittie5')
478 >>> _normalize('Suck a Dick!')
481 >>> _normalize('fucking a whore')
485 result = text.lower()
486 result = result.replace("_", " ")
487 result = result.replace('0', 'o')
488 result = result.replace('1', 'l')
489 result = result.replace('4', 'a')
490 result = result.replace('5', 's')
491 result = result.replace('3', 'e')
492 for x in string.punctuation:
493 result = result.replace(x, "")
495 self.stemmer.stem(word) for word in nltk.word_tokenize(result)
497 return ' '.join(chunks)
499 def contains_bad_word(self, text: str) -> bool:
500 """Returns True if text contains a bad word (or more than one)
501 and False if no bad words were detected.
503 >>> contains_bad_word('fuck you')
506 >>> contains_bad_word('FucK u')
509 >>> contains_bad_word('FuK U')
513 words = nltk.word_tokenize(text)
515 if self.is_bad_word(word):
516 logger.debug(f'"{word}" is profanity')
520 for bigram in string_utils.ngrams_presplit(words, 2):
521 bigram = ' '.join(bigram)
522 if self.is_bad_word(bigram):
523 logger.debug(f'"{bigram}" is profanity')
527 for trigram in string_utils.ngrams_presplit(words, 3):
528 trigram = ' '.join(trigram)
529 if self.is_bad_word(trigram):
530 logger.debug(f'"{trigram}" is profanity')
534 def is_bad_word(self, word: str) -> bool:
536 word in self.bad_words or
537 self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        def obscure(word: str):
                # Replace each letter with a randomly chosen masking
                # character so the profanity is unreadable but its
                # rough length is preserved.
                char = random.choice(['#', '%', '!', '@', '&', '*'])
        # Tokenize, then scan left-to-right with a cursor, preferring
        # the longest profane n-gram at each position.
        words = nltk.word_tokenize(text)
        # NOTE(review): the bound `len(words) - 3` suggests sentinel
        # padding is appended to `words` before this loop — confirm
        # against the full source.
        while cursor < len(words) - 3:
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            # Longest match wins: a profane trigram takes priority
            # over its component bigram, which beats the single word.
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
589 pf = ProfanityFilter()
590 phrase = ' '.join(sys.argv[1:])
591 print(pf.contains_bad_word(phrase))
592 print(pf.obscure_bad_words(phrase))
596 if __name__ == '__main__':