3 """A helper to identify and optionally obscure some bad words."""
12 from nltk.stem import PorterStemmer
14 import decorator_utils
17 logger = logging.getLogger(__name__)
# Singleton: the (presumably large) bad-word list and the stemmer are
# built once and shared process-wide by every caller.
@decorator_utils.singleton
class ProfanityFilter(object):
    """A helper to identify and optionally obscure some bad words."""
179 'double penetration',
288 'missionary position',
464 self.stemmer = PorterStemmer()
466 def _normalize(self, text: str) -> str:
469 >>> _normalize('Tittie5')
472 >>> _normalize('Suck a Dick!')
475 >>> _normalize('fucking a whore')
479 result = text.lower()
480 result = result.replace("_", " ")
481 result = result.replace('0', 'o')
482 result = result.replace('1', 'l')
483 result = result.replace('4', 'a')
484 result = result.replace('5', 's')
485 result = result.replace('3', 'e')
486 for x in string.punctuation:
487 result = result.replace(x, "")
488 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
489 return ' '.join(chunks)
    # Generator: break *text* into NLTK word tokens, then further split
    # each token on runs of non-word characters.
    # NOTE(review): the innermost loop body (presumably a `yield`) is not
    # visible in this view of the file — confirm against the full source.
    def tokenize(text: str):
        for x in nltk.word_tokenize(text):
            for y in re.split(r'\W+', x):
    def contains_bad_word(self, text: str) -> bool:
        """Returns True if text contains a bad word (or more than one)
        and False if no bad words were detected.
        >>> contains_bad_word('fuck you')
        >>> contains_bad_word('FucK u')
        >>> contains_bad_word('FuK U')
        words = list(self.tokenize(text))  # tokenize once; reused for n-gram scans below
        if self.is_bad_word(word):  # single-word check (enclosing loop not visible in this view)
            logger.debug('"%s" is profanity', word)
        for bigram in string_utils.ngrams_presplit(words, 2):  # adjacent word pairs, so two-word phrases in the list can match
            bigram = ' '.join(bigram)
            if self.is_bad_word(bigram):
                logger.debug('"%s" is profanity', bigram)
        for trigram in string_utils.ngrams_presplit(words, 3):  # word triples, for three-word phrases
            trigram = ' '.join(trigram)
            if self.is_bad_word(trigram):
                logger.debug('"%s" is profanity', trigram)
532 def is_bad_word(self, word: str) -> bool:
533 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        def obscure(word: str):  # helper: mask a matched word/phrase (full body not visible here)
            char = random.choice(['#', '%', '!', '@', '&', '*'])  # random masking glyph
        words = list(self.tokenize(text))  # same tokenization as contains_bad_word
        while cursor < len(words) - 3:  # NOTE(review): this bound stops 3 words early; confirm the (unseen) code after the loop handles the tail
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):  # prefer the longest match at each position
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
    # Simple CLI driver: treat all command-line args as one phrase, report
    # whether it contains profanity, then print an obscured version.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))


if __name__ == '__main__':