10 from nltk.stem import PorterStemmer
12 import decorator_utils
# Module-level logger keyed to this module's import path.
logger = logging.getLogger(__name__)


# Singleton via project decorator -- presumably because the bad-word set and
# stemmer are expensive to build.  NOTE(review): confirm against decorator_utils.
@decorator_utils.singleton
class ProfanityFilter(object):
    # Detects and obscures profanity in text.  The visible fragment shows a
    # few entries of a large literal collection of bad words/phrases
    # (unigrams, bigrams and trigrams); most entries are elided from this view.
        'chocolate rosebuds',
        'double penetration',
        'missionary position',
        # __init__ fragment: keep a Porter stemmer so lookups can match
        # morphological variants (see _normalize / is_bad_word below).
        self.stemmer = PorterStemmer()
475 def _normalize(self, text: str) -> str:
478 >>> _normalize('Tittie5')
481 >>> _normalize('Suck a Dick!')
484 >>> _normalize('fucking a whore')
488 result = text.lower()
489 result = result.replace("_", " ")
490 result = result.replace('0', 'o')
491 result = result.replace('1', 'l')
492 result = result.replace('4', 'a')
493 result = result.replace('5', 's')
494 result = result.replace('3', 'e')
495 for x in string.punctuation:
496 result = result.replace(x, "")
497 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
498 return ' '.join(chunks)
    def tokenize(self, text: str):
        """Tokenize *text*: NLTK word tokenization, then a further split of
        each token on runs of non-word characters, so punctuation-glued
        words come apart.

        NOTE(review): the inner loop's body (the yield / empty-token filter
        step) is elided from this view.
        """
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.

        Scans unigrams first, then bigrams, then trigrams, so multi-word
        phrases in the bad-word set (e.g. visible entries like
        'double penetration') are also caught.

        >>> contains_bad_word('fuck you')
        True

        >>> contains_bad_word('FucK u')
        True

        >>> contains_bad_word('FuK U')
        True
        """
        words = [word for word in self.tokenize(text)]
        # (elided) unigram loop header; returns True on the first hit.
            if self.is_bad_word(word):
                logger.debug(f'"{word}" is profanity')

        # Bigram pass: join each adjacent pair and test it as a phrase.
        for bigram in string_utils.ngrams_presplit(words, 2):
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug(f'"{bigram}" is profanity')

        # Trigram pass, same shape as the bigram pass.
        for trigram in string_utils.ngrams_presplit(words, 3):
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug(f'"{trigram}" is profanity')
540 def is_bad_word(self, word: str) -> bool:
541 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.  (The remainder of this docstring, and several
        statements below, are elided from this view.)
        """

        def obscure(word: str):
            # (elided) builds the masked form of *word*; the visible line
            # draws one random masking character per replacement.
                char = random.choice(['#', '%', '!', '@', '&', '*'])

        words = self.tokenize(text)
        # (elided) initialization of `cursor` and the `out` accumulator.
        # NOTE(review): len() below requires a sequence, so `words` is
        # presumably materialized into a list in the elided code -- confirm.
        while cursor < len(words) - 3:
            # NOTE(review): `- 3` stops the scan before the last words; the
            # elided code after this loop presumably handles the tail.
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            # Prefer the longest profane n-gram: trigram, then bigram, then
            # the single word; each branch appends the obscured form.
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # Smoke-test driver: run both the detector and the obscurer over the
    # phrase supplied on the command line.
    # NOTE(review): these statements appear to sit inside a `def main()`
    # whose header is elided from this view -- confirm.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


# Script entry point (guard body elided from this view).
if __name__ == '__main__':