3 # © Copyright 2021-2022, Scott Gasch
5 """A helper to identify and optionally obscure some bad words. Not
6 perfect but decent. Uses a fuzzy block list rather than ML."""
15 from nltk.stem import PorterStemmer
17 import decorator_utils
20 logger = logging.getLogger(__name__)
23 @decorator_utils.singleton
24 class ProfanityFilter(object):
25 """A helper to identify and optionally obscure some bad words."""
182 'double penetration',
291 'missionary position',
467 self.stemmer = PorterStemmer()
469 def _normalize(self, text: str) -> str:
472 >>> _normalize('Tittie5')
475 >>> _normalize('Suck a Dick!')
478 >>> _normalize('fucking a whore')
481 >>> _normalize('pu55y')
485 result = text.lower()
486 result = result.replace("_", " ")
487 result = result.replace('0', 'o')
488 result = result.replace('1', 'l')
489 result = result.replace('4', 'a')
490 result = result.replace('5', 's')
491 result = result.replace('3', 'e')
492 for x in string.punctuation:
493 result = result.replace(x, "")
494 chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
495 return ' '.join(chunks)
498 def tokenize(text: str):
499 """Tokenize text into word-like chunks"""
500 for x in nltk.word_tokenize(text):
501 for y in re.split(r'\W+', x):
504 def contains_bad_word(self, text: str) -> bool:
505 """Returns True if text contains a bad word (or more than one)
506 and False if no bad words were detected.
508 >>> contains_bad_word('fuck you')
511 >>> contains_bad_word('FucK u')
514 >>> contains_bad_word('FuK U')
518 words = list(self.tokenize(text))
520 if self.is_bad_word(word):
521 logger.debug('"%s" is profanity', word)
525 for bigram in string_utils.ngrams_presplit(words, 2):
526 bigram = ' '.join(bigram)
527 if self.is_bad_word(bigram):
528 logger.debug('"%s" is profanity', bigram)
532 for trigram in string_utils.ngrams_presplit(words, 3):
533 trigram = ' '.join(trigram)
534 if self.is_bad_word(trigram):
535 logger.debug('"%s" is profanity', trigram)
539 def is_bad_word(self, word: str) -> bool:
540 """True if we think word is a bad word."""
541 return word in self.bad_words or self._normalize(word) in self.bad_words
    def obscure_bad_words(self, text: str) -> str:
        """Obscure bad words that are detected by inserting random punctuation
        characters into them.

        NOTE(review): several lines of this method are elided in this
        excerpt (initialization of ``out``/``cursor``/``word``, most of
        ``obscure``'s body, and the handling of the final words after the
        scan loop).  Comments below describe only the visible code.
        """

        def obscure(word: str):
            # Builds an obscured copy of `word`; a random punctuation
            # character is chosen for substitution.  Only part of this
            # helper is visible here.
            char = random.choice(['#', '%', '!', '@', '&', '*'])

        # Use the same token stream as detection so matches line up.
        words = list(self.tokenize(text))
        # Greedy scan: prefer obscuring a matching 3-word phrase, then a
        # 2-word phrase, then a single word, while walking a cursor
        # through the token list.
        while cursor < len(words) - 3:
            # presumably `word = words[cursor]` occurs here (elided) — verify
            bigram = word + ' ' + words[cursor + 1]
            trigram = bigram + ' ' + words[cursor + 2]
            if self.is_bad_word(trigram):
                out += obscure(trigram) + ' '
            elif self.is_bad_word(bigram):
                out += obscure(bigram) + ' '
            elif self.is_bad_word(word):
                out += obscure(word) + ' '
if __name__ == '__main__':
    # Simple CLI smoke test: treat all command-line arguments as one
    # phrase, report whether it contains profanity, then print an
    # obscured version of it.
    pf = ProfanityFilter()
    phrase = ' '.join(sys.argv[1:])
    print(pf.contains_bad_word(phrase))
    print(pf.obscure_bad_words(phrase))