profanity_filter.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2021-2022, Scott Gasch
   4
   5 """A helper to identify and optionally obscure some bad words."""
   6
   7 import logging
   8 import random
   9 import re
  10 import string
  11 import sys
  12
  13 import nltk
  14 from nltk.stem import PorterStemmer
  15
  16 import decorator_utils
  17 import string_utils
  18
  19 logger = logging.getLogger(__name__)
  20
  21
  22 @decorator_utils.singleton
  23 class ProfanityFilter(object):
  24     """A helper to identify and optionally obscure some bad words."""
  25
  26     def __init__(self):
  27         self.bad_words = set(
  28             [
  29                 'acrotomophilia',
  30                 'anal',
  31                 'analingu',
  32                 'anally',
  33                 'anilingu',
  34                 'anus',
  35                 'arsehol',
  36                 'arsehole',
  37                 'ass',
  38                 'asshol',
  39                 'asshole',
  40                 'assmunch',
  41                 'auto erot',
  42                 'auto erotic',
  43                 'autoerotic',
  44                 'babeland',
  45                 'babi batter',
  46                 'baby batter',
  47                 'ball gag',
  48                 'ball gravi',
  49                 'ball gravy',
  50                 'ball kick',
  51                 'ball kicking',
  52                 'ball lick',
  53                 'ball licking',
  54                 'ball sack',
  55                 'ball suck',
  56                 'ball sucking',
  57                 'ball zack',
  58                 'bangbro',
  59                 'bare legal',
  60                 'bareback',
  61                 'barely legal',
  62                 'barenak',
  63                 'barenaked',
  64                 'bastardo',
  65                 'bastinado',
  66                 'bbc',
  67                 'bbw',
  68                 'bdsm',
  69                 'beaver cleaver',
  70                 'beaver lip',
  71                 'bestial',
  72                 'bestiality',
  73                 'bi curiou',
  74                 'big black',
  75                 'big breast',
  76                 'big knocker',
  77                 'big tit',
  78                 'bimbo',
  79                 'birdlock',
  80                 'bitch',
  81                 'black cock',
  82                 'blond action',
  83                 'blond on blond',
  84                 'blonde action',
  85                 'blow j',
  86                 'blow job',
  87                 'blowjob',
  88                 'blow my',
  89                 'blow me',
  90                 'blow ourselv',
  91                 'blow ourselves',
  92                 'blow your load',
  93                 'blue waffl',
  94                 'blue waffle',
  95                 'blumpkin',
  96                 'bollock',
  97                 'bondag',
  98                 'bondage',
  99                 'boner',
 100                 'boob',
 101                 'booti call',
 102                 'booty call',
 103                 'breast',
 104                 'brown shower',
 105                 'brunett action',
 106                 'brunette action',
 107                 'bukkak',
 108                 'bukkake',
 109                 'bulldyk',
 110                 'bulldyke',
 111                 'bullet vibe',
 112                 'bullshit',
 113                 'bung hole',
 114                 'bunghol',
 115                 'bunghole',
 116                 'busti',
 117                 'busty',
 118                 'butt',
 119                 'buttcheek',
 120                 'butthol',
 121                 'butthole',
 122                 'camel toe',
 123                 'camgirl',
 124                 'camslut',
 125                 'camwhore',
 126                 'carpet muncher',
 127                 'carpetmuncher',
 128                 'chocol rosebud',
 129                 'circlejerk',
 130                 'chink',
 131                 'cleveland steamer',
 132                 'clit',
 133                 'clitor',
 134                 'clitori',
 135                 'clover clamp',
 136                 'clusterfuck',
 137                 'cluster fuck',
 138                 'cock',
 139                 'coprolagnia',
 140                 'coprophilia',
 141                 'cornhol',
 142                 'cornhole',
 143                 'cream pie',
 144                 'creampi',
 145                 'creampie',
 146                 'cum',
 147                 'cumming',
 148                 'cunnilingu',
 149                 'cunt',
 150                 'damn',
 151                 'darki',
 152                 'darkie',
 153                 'date rape',
 154                 'daterap',
 155                 'daterape',
 156                 'deep throat',
 157                 'deepthroat',
 158                 'dick',
 159                 'dildo',
 160                 'dirti pillow',
 161                 'dirti sanchez',
 162                 'dirty pillow',
 163                 'dirty sanchez',
 164                 'dog style',
 165                 'doggi style',
 166                 'doggie style',
 167                 'doggiestyl',
 168                 'doggiestyle',
 169                 'doggystyle',
 170                 'dolcett',
 171                 'domination',
 172                 'dominatrix',
 173                 'domm',
 174                 'dommes',
 175                 'donkey punch',
 176                 'doubl dick',
 177                 'doubl dong',
 178                 'doubl penetr',
 179                 'double dick',
 180                 'double dong',
 181                 'double penetration',
 182                 'dp action',
 183                 'dtf',
 184                 'eat my ass',
 185                 'ecchi',
 186                 'ejacul',
 187                 'erection',
 188                 'erotic',
 189                 'erotism',
 190                 'escort',
 191                 'ethical slut',
 192                 'eunuch',
 193                 'faggot',
 194                 'fecal',
 195                 'felch',
 196                 'fellatio',
 197                 'feltch',
 198                 'female squirting',
 199                 'femdom',
 200                 'figging',
 201                 'finger',
 202                 'fist',
 203                 'foot fetish',
 204                 'footjob',
 205                 'frotting',
 206                 'fuck button',
 207                 'fuck',
 208                 'fucked',
 209                 'fucker',
 210                 'fuckhead',
 211                 'fuckin',
 212                 'fucking',
 213                 'fudge packer',
 214                 'fudgepack',
 215                 'fudgepacker',
 216                 'futanari',
 217                 'g spot',
 218                 'g-spot',
 219                 'gang bang',
 220                 'gay sex',
 221                 'gee spot',
 222                 'genital',
 223                 'giant cock',
 224                 'girl gone wild',
 225                 'girl on top',
 226                 'girl on',
 227                 'give head',
 228                 'giving head',
 229                 'gave head',
 230                 'gave you head',
 231                 'gave him head',
 232                 'gave them head',
 233                 'gave us head',
 234                 'glori hole',
 235                 'goatcx',
 236                 'goatse',
 237                 'goddamn',
 238                 'gokkun',
 239                 'golden shower',
 240                 'goo girl',
 241                 'goodpoop',
 242                 'goregasm',
 243                 'grope',
 244                 'group sex',
 245                 'gspot',
 246                 'guro',
 247                 'hand job',
 248                 'handjob',
 249                 'hard core',
 250                 'hardcore',
 251                 'hentai',
 252                 'homoerotic',
 253                 'honkey',
 254                 'hooker',
 255                 'horni',
 256                 'horny',
 257                 'hot chick',
 258                 'how to kill',
 259                 'how to murder',
 260                 'huge fat',
 261                 'humped',
 262                 'humping',
 263                 'hump',
 264                 'incest',
 265                 'intercourse',
 266                 'jack off',
 267                 'jail bait',
 268                 'jailbait',
 269                 'jerk off',
 270                 'jigaboo',
 271                 'jiggaboo',
 272                 'jiggerboo',
 273                 'jizz',
 274                 'jugg',
 275                 'kike',
 276                 'kinbaku',
 277                 'kinkster',
 278                 'kinky',
 279                 'knobbing',
 280                 'leather restraint',
 281                 'lemon party',
 282                 'lolita',
 283                 'lovemaking',
 284                 'make me come',
 285                 'male squirting',
 286                 'masturb',
 287                 'menage a trois',
 288                 'menag a troi',
 289                 'milf',
 290                 'missionary position',
 291                 'motherfuck',
 292                 'mound of venu',
 293                 'mr hand',
 294                 'muff diver',
 295                 'muffdiv',
 296                 'muffdiving',
 297                 'nambla',
 298                 'nawashi',
 299                 'negro',
 300                 'neonazi',
 301                 'nig nog',
 302                 'nigga',
 303                 'nigger',
 304                 'nimphomania',
 305                 'nipple',
 306                 'nip',
 307                 'not safe for',
 308                 'nsfl',
 309                 'nsfw',
 310                 'nude',
 311                 'nudity',
 312                 'nut sack',
 313                 'nutsack',
 314                 'nympho',
 315                 'nymphomania',
 316                 'octopussy',
 317                 'omorashi',
 318                 'one night stand',
 319                 'orgasm',
 320                 'orgy',
 321                 'paedophil',
 322                 'paedophile',
 323                 'panties',
 324                 'panti',
 325                 'pedobear',
 326                 'pedophil',
 327                 'pedophile',
 328                 'pee',
 329                 'pegging',
 330                 'peni',
 331                 'penis',
 332                 'phone sex',
 333                 'pigfucker',
 334                 'piss pig',
 335                 'piss',
 336                 'pissing',
 337                 'pisspig',
 338                 'playboy',
 339                 'pleasure chest',
 340                 'pole smoker',
 341                 'ponyplay',
 342                 'poof',
 343                 'poop chute',
 344                 'poopchute',
 345                 'porn',
 346                 'pron',
 347                 'pornhub',
 348                 'porno',
 349                 'pornographi',
 350                 'pornography',
 351                 'prince albert',
 352                 'pthc',
 353                 'pube',
 354                 'pussi',
 355                 'pussies',
 356                 'pussy',
 357                 'queaf',
 358                 'queer',
 359                 'raghead',
 360                 'raging boner',
 361                 'rape',
 362                 'raping',
 363                 'rapist',
 364                 'rectum',
 365                 'reverse cowgirl',
 366                 'rimjob',
 367                 'rimming',
 368                 'rosy palm',
 369                 'rusty trombone',
 370                 's & m',
 371                 's&m',
 372                 's+m',
 373                 'sadism',
 374                 'scat',
 375                 'schlong',
 376                 'scissoring',
 377                 'semen',
 378                 'sex',
 379                 'sexi',
 380                 'sexo',
 381                 'sexy',
 382                 'shave beaver',
 383                 'shave pussi',
 384                 'shemale',
 385                 'shibari',
 386                 'shit',
 387                 'shota',
 388                 'shrimping',
 389                 'slanteye',
 390                 'slut',
 391                 'smut',
 392                 'snatch',
 393                 'snm',
 394                 'snowballing',
 395                 'sodomi',
 396                 'sodomize',
 397                 'sodomy',
 398                 'spic',
 399                 'spooge',
 400                 'spread leg',
 401                 'squirting',
 402                 'strap on',
 403                 'strapon',
 404                 'strappado',
 405                 'strip club',
 406                 'style doggy',
 407                 'suck',
 408                 'suicid girl',
 409                 'sultry women',
 410                 'swastika',
 411                 'swinger',
 412                 'taint',
 413                 'tainted love',
 414                 'taste my',
 415                 'tea bagging',
 416                 'threesome',
 417                 'throating',
 418                 'tied up',
 419                 'tight white',
 420                 'tit',
 421                 'titti',
 422                 'titties',
 423                 'titty',
 424                 'tongue in',
 425                 'topless',
 426                 'tosser',
 427                 'towelhead',
 428                 'tranny',
 429                 'tribadism',
 430                 'tub girl',
 431                 'tubgirl',
 432                 'tushy',
 433                 'twat',
 434                 'twink',
 435                 'twinki',
 436                 'twinkie',
 437                 'undress',
 438                 'upskirt',
 439                 'urethra play',
 440                 'urophilia',
 441                 'vag',
 442                 'vagina',
 443                 'venu mound',
 444                 'vibrator',
 445                 'violet blue',
 446                 'violet wand',
 447                 'vorarephilia',
 448                 'voyeur',
 449                 'vulva',
 450                 'wank',
 451                 'wet dream',
 452                 'wetback',
 453                 'white power',
 454                 'whore',
 455                 'women rapping',
 456                 'wrapping men',
 457                 'wrinkled starfish',
 458                 'xx',
 459                 'xxx',
 460                 'yaoi',
 461                 'yellow shower',
 462                 'yiffy',
 463                 'zoophilia',
 464             ]
 465         )
 466         self.stemmer = PorterStemmer()
 467
 468     def _normalize(self, text: str) -> str:
 469         """Normalize text.
 470
 471         >>> _normalize('Tittie5')
 472         'titties'
 473
 474         >>> _normalize('Suck a Dick!')
 475         'suck a dick'
 476
 477         >>> _normalize('fucking a whore')
 478         'fuck a whore'
 479
 480         """
 481         result = text.lower()
 482         result = result.replace("_", " ")
 483         result = result.replace('0', 'o')
 484         result = result.replace('1', 'l')
 485         result = result.replace('4', 'a')
 486         result = result.replace('5', 's')
 487         result = result.replace('3', 'e')
 488         for x in string.punctuation:
 489             result = result.replace(x, "")
 490         chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
 491         return ' '.join(chunks)
 492
 493     @staticmethod
 494     def tokenize(text: str):
 495         for x in nltk.word_tokenize(text):
 496             for y in re.split(r'\W+', x):
 497                 yield y
 498
 499     def contains_bad_word(self, text: str) -> bool:
 500         """Returns True if text contains a bad word (or more than one)
 501         and False if no bad words were detected.
 502
 503         >>> contains_bad_word('fuck you')
 504         True
 505
 506         >>> contains_bad_word('FucK u')
 507         True
 508
 509         >>> contains_bad_word('FuK U')
 510         False
 511
 512         """
 513         words = list(self.tokenize(text))
 514         for word in words:
 515             if self.is_bad_word(word):
 516                 logger.debug('"%s" is profanity', word)
 517                 return True
 518
 519         if len(words) > 1:
 520             for bigram in string_utils.ngrams_presplit(words, 2):
 521                 bigram = ' '.join(bigram)
 522                 if self.is_bad_word(bigram):
 523                     logger.debug('"%s" is profanity', bigram)
 524                     return True
 525
 526         if len(words) > 2:
 527             for trigram in string_utils.ngrams_presplit(words, 3):
 528                 trigram = ' '.join(trigram)
 529                 if self.is_bad_word(trigram):
 530                     logger.debug('"%s" is profanity', trigram)
 531                     return True
 532         return False
 533
 534     def is_bad_word(self, word: str) -> bool:
 535         return word in self.bad_words or self._normalize(word) in self.bad_words
 536
 537     def obscure_bad_words(self, text: str) -> str:
 538         """Obscure bad words that are detected by inserting random punctuation
 539         characters.
 540
 541         """
 542
 543         def obscure(word: str):
 544             out = ''
 545             last = ''
 546             for letter in word:
 547                 if letter.isspace():
 548                     out += letter
 549                 else:
 550                     while True:
 551                         char = random.choice(['#', '%', '!', '@', '&', '*'])
 552                         if last != char:
 553                             last = char
 554                             out += char
 555                             break
 556             return out
 557
 558         words = list(self.tokenize(text))
 559         words.append('')
 560         words.append('')
 561         words.append('')
 562         out = ''
 563
 564         cursor = 0
 565         while cursor < len(words) - 3:
 566             word = words[cursor]
 567             bigram = word + ' ' + words[cursor + 1]
 568             trigram = bigram + ' ' + words[cursor + 2]
 569             if self.is_bad_word(trigram):
 570                 out += obscure(trigram) + ' '
 571                 cursor += 3
 572             elif self.is_bad_word(bigram):
 573                 out += obscure(bigram) + ' '
 574                 cursor += 2
 575             elif self.is_bad_word(word):
 576                 out += obscure(word) + ' '
 577                 cursor += 1
 578             else:
 579                 out += word + ' '
 580                 cursor += 1
 581         return out.strip()
 582
 583
 584 def main() -> None:
 585     import doctest
 586
 587     doctest.testmod()
 588     pf = ProfanityFilter()
 589     phrase = ' '.join(sys.argv[1:])
 590     print(pf.contains_bad_word(phrase))
 591     print(pf.obscure_bad_words(phrase))
 592     sys.exit(0)
 593
 594
 595 if __name__ == '__main__':
 596     main()