10 from nltk.stem import PorterStemmer
12 import decorator_utils
15 logger = logging.getLogger(__name__)
# Process-wide singleton (via decorator_utils.singleton): holds a set of
# bad words/phrases in self.bad_words (multi-word entries such as
# 'double penetration' appear below) plus a Porter stemmer, and exposes
# detection (contains_bad_word / is_bad_word) and redaction
# (obscure_bad_words) helpers.
18 @decorator_utils.singleton
19 class ProfanityFilter(object):
# NOTE(review): fragment of the (elided) __init__ — sample entries from
# the bad-words collection, followed by the stemmer later used by
# _normalize(). The `def __init__` line and the bulk of the word list
# are not visible in this excerpt.
135 'chocolate rosebuds',
190 'double penetration',
297 'missionary position',
475 self.stemmer = PorterStemmer()
def _normalize(self, text: str) -> str:
    """Return a canonical form of *text* for bad-word comparison.

    Normalization steps (all character work done in one translation
    pass):
      * lowercase the text,
      * map common "leet" digits to letters: 0->o, 1->l, 3->e, 4->a, 5->s,
      * treat underscores as word separators (spaces),
      * delete all other punctuation,
    then stem every token with the Porter stemmer so that inflected
    forms (e.g. 'fucking') compare equal to their stems.

    Args:
        text: raw input string.

    Returns:
        Space-joined string of stemmed, normalized tokens.
    """
    # One precomputed translation table replaces the former chain of
    # six str.replace() calls plus a per-punctuation-character loop
    # (~38 full passes over the string collapsed into a single pass).
    # Delete every punctuation character, then override: '_' becomes a
    # space and the leet digits map to their letter look-alikes.
    table = {ord(ch): None for ch in string.punctuation}
    table.update(str.maketrans('_01453', ' olase'))
    cleaned = text.lower().translate(table)
    stems = [self.stemmer.stem(tok) for tok in nltk.word_tokenize(cleaned)]
    return ' '.join(stems)
502 def tokenize(self, text: str):
# Generator: NLTK-tokenize *text*, then split each token again on runs
# of non-word characters so punctuation-glued words come apart.
# NOTE(review): the line that yields each piece (orig. line ~505) is
# elided from this excerpt.
503 for x in nltk.word_tokenize(text):
504 for y in re.split(r'\W+', x):
507 def contains_bad_word(self, text: str) -> bool:
508 """Returns True if text contains a bad word (or more than one)
509 and False if no bad words were detected.
511 >>> contains_bad_word('fuck you')
514 >>> contains_bad_word('FucK u')
517 >>> contains_bad_word('FuK U')
# NOTE(review): the doctest expected-output lines and the docstring's
# closing quotes are elided from this excerpt.
# Strategy: tokenize the text, then test every unigram, bigram and
# trigram against the bad-word set — multi-word entries such as
# 'double penetration' can only match via the n-gram passes.
521 words = [word for word in self.tokenize(text)]
# (unigram loop header, orig. ~522, elided)
523 if self.is_bad_word(word):
524 logger.debug(f'"{word}" is profanity')
# (presumably an early `return True`, orig. ~525-527, elided)
528 for bigram in string_utils.ngrams_presplit(words, 2):
529 bigram = ' '.join(bigram)
530 if self.is_bad_word(bigram):
531 logger.debug(f'"{bigram}" is profanity')
# (presumably an early `return True`, orig. ~532-534, elided)
535 for trigram in string_utils.ngrams_presplit(words, 3):
536 trigram = ' '.join(trigram)
537 if self.is_bad_word(trigram):
538 logger.debug(f'"{trigram}" is profanity')
# (final return statements, orig. ~539-541, elided)
542 def is_bad_word(self, word: str) -> bool:
543 return word in self.bad_words or self._normalize(word) in self.bad_words
545 def obscure_bad_words(self, text: str) -> str:
546 """Obscure bad words that are detected by inserting random punctuation
# (remainder of the docstring, orig. ~547-550, elided from this excerpt)
551 def obscure(word: str):
# Inner helper: rewrites a detected word using randomly chosen symbol
# characters (most of its body, orig. ~552-558 and 560-565, is elided).
559 char = random.choice(['#', '%', '!', '@', '&', '*'])
566 words = [x for x in self.tokenize(text)]
# Greedy longest-match scan over the token stream: at each cursor
# position prefer obscuring a trigram over a bigram over a single word.
# (output-buffer and cursor initialization, orig. ~567-572, elided)
573 while cursor < len(words) - 3:
# NOTE(review): the loop stops 3 words short of the end; presumably the
# elided code after the loop (orig. ~585-595) handles the trailing
# words — confirm against the full file, otherwise this is an
# off-by-one that skips tail tokens.
575 bigram = word + ' ' + words[cursor + 1]
576 trigram = bigram + ' ' + words[cursor + 2]
577 if self.is_bad_word(trigram):
578 out += obscure(trigram) + ' '
# (cursor advancement, orig. ~579, elided)
580 elif self.is_bad_word(bigram):
581 out += obscure(bigram) + ' '
# (cursor advancement, orig. ~582, elided)
583 elif self.is_bad_word(word):
584 out += obscure(word) + ' '
# Command-line driver: checks the argv phrase for profanity and prints
# the detection result and the obscured text. NOTE(review): the
# enclosing function header (orig. ~595) is elided from this excerpt.
596 pf = ProfanityFilter()
597 phrase = ' '.join(sys.argv[1:])
598 print(pf.contains_bad_word(phrase))
599 print(pf.obscure_bad_words(phrase))
603 if __name__ == '__main__':
# (guard body, orig. 604+, elided from this excerpt)