Used isort to sort imports. Also added isort to the git pre-commit hook.
[python_utils.git] / profanity_filter.py
index 5621cef94489f6b5446a9e786777a8cb93e68be4..e227165ba19a2b258847a9e8b86f9adc87de8fb5 100755 (executable)
@@ -2,6 +2,7 @@
 
 import logging
 import random
+import re
 import string
 import sys
 
@@ -11,475 +12,510 @@ from nltk.stem import PorterStemmer
 import decorator_utils
 import string_utils
 
-
 logger = logging.getLogger(__name__)
 
 
 @decorator_utils.singleton
 class ProfanityFilter(object):
     def __init__(self):
-        self.bad_words = set([
-            'acrotomophilia',
-            'anal',
-            'analingus',
-            'anally',
-            'anilingus',
-            'anus',
-            'arsehol',
-            'arsehole',
-            'ass',
-            'asses',
-            'asshol',
-            'asshole',
-            'assmunch',
-            'auto erot',
-            'auto erotic',
-            'autoerotic',
-            'babeland',
-            'babi batter',
-            'baby batter',
-            'ball gag',
-            'ball gravi',
-            'ball gravy',
-            'ball kick',
-            'ball kicking',
-            'ball lick',
-            'ball licking',
-            'ball sack',
-            'ball suck',
-            'ball sucking',
-            'ball zack',
-            'bangbro',
-            'bangbros',
-            'bare legal',
-            'bareback',
-            'barely legal',
-            'barenak',
-            'barenaked',
-            'bastardo',
-            'bastinado',
-            'bbc',
-            'bbw',
-            'bdsm',
-            'beaver cleaver',
-            'beaver lip',
-            'beaver lips',
-            'bestial',
-            'bestiality',
-            'bi curiou',
-            'bi curious',
-            'big black',
-            'big breasts',
-            'big knocker',
-            'big knockers',
-            'big tit',
-            'big tits',
-            'bimbo',
-            'birdlock',
-            'bitch',
-            'bitches',
-            'black cock',
-            'blond action',
-            'blond on blond',
-            'blonde action',
-            'blow j',
-            'blow job',
-            'blowjob',
-            'blow my',
-            'blow me',
-            'blow ourselv',
-            'blow ourselves',
-            'blow your load',
-            'blue waffl',
-            'blue waffle',
-            'blumpkin',
-            'bollock',
-            'bollocks',
-            'bondag',
-            'bondage',
-            'boner',
-            'boob',
-            'boobs',
-            'booti call',
-            'booty call',
-            'breast',
-            'breasts',
-            'brown shower',
-            'brown showers',
-            'brunett action',
-            'brunette action',
-            'bukkak',
-            'bukkake',
-            'bulldyk',
-            'bulldyke',
-            'bullet vibe',
-            'bullshit',
-            'bung hole',
-            'bunghol',
-            'bunghole',
-            'busti',
-            'busty',
-            'butt',
-            'buttcheek',
-            'buttcheeks',
-            'butthol',
-            'butthole',
-            'camel toe',
-            'camgirl',
-            'camslut',
-            'camwhore',
-            'carpet muncher',
-            'carpetmuncher',
-            'chocol rosebud',
-            'chocolate rosebuds',
-            'circlejerk',
-            'chink',
-            'cleveland steamer',
-            'clit',
-            'clitori',
-            'clitoris',
-            'clover clamp',
-            'clover clamps',
-            'clusterfuck',
-            'cock',
-            'cocks',
-            'coprolagnia',
-            'coprophilia',
-            'cornhol',
-            'cornhole',
-            'cream pie',
-            'creampi',
-            'creampie',
-            'cum',
-            'cumming',
-            'cunnilingu',
-            'cunnilingus',
-            'cunt',
-            'damn',
-            'darki',
-            'darkie',
-            'date rape',
-            'daterap',
-            'daterape',
-            'deep throat',
-            'deepthroat',
-            'dick',
-            'dildo',
-            'dirti pillow',
-            'dirti sanchez',
-            'dirty pillow',
-            'dirty sanchez',
-            'dog style',
-            'doggi style',
-            'doggie style',
-            'doggiestyl',
-            'doggiestyle',
-            'doggystyle',
-            'dolcett',
-            'domination',
-            'dominatrix',
-            'domm',
-            'dommes',
-            'donkey punch',
-            'doubl dick',
-            'doubl dong',
-            'doubl penetr',
-            'double dick',
-            'double dong',
-            'double penetration',
-            'dp action',
-            'dtf',
-            'eat my ass',
-            'ecchi',
-            'ejacul',
-            'erection',
-            'erotic',
-            'erotism',
-            'escort',
-            'ethical slut',
-            'eunuch',
-            'faggot',
-            'fecal',
-            'felch',
-            'fellatio',
-            'feltch',
-            'female squirting',
-            'femdom',
-            'figging',
-            'fingered',
-            'fingering',
-            'fingers',
-            'fisted',
-            'fisting',
-            'fists',
-            'foot fetish',
-            'footjob',
-            'frotting',
-            'fuck button',
-            'fuck',
-            'fucked',
-            'fucker',
-            'fuckhead',
-            'fuckin',
-            'fucking',
-            'fudge packer',
-            'fudgepack',
-            'fudgepacker',
-            'futanari',
-            'g spot',
-            'g-spot',
-            'gang bang',
-            'gay sex',
-            'gee spot',
-            'genital',
-            'giant cock',
-            'girl gone wild',
-            'girl on top',
-            'girl on',
-            'goatcx',
-            'goatse',
-            'goddamn',
-            'gokkun',
-            'golden shower',
-            'goo girl',
-            'goodpoop',
-            'goregasm',
-            'grope',
-            'group sex',
-            'gspot',
-            'guro',
-            'hand job',
-            'handjob',
-            'hard core',
-            'hardcore',
-            'hentai',
-            'homoerotic',
-            'honkey',
-            'hooker',
-            'horni',
-            'horny',
-            'hot chick',
-            'how to kill',
-            'how to murder',
-            'huge fat',
-            'humped',
-            'humping',
-            'humps',
-            'incest',
-            'intercourse',
-            'jack off',
-            'jail bait',
-            'jailbait',
-            'jerk off',
-            'jigaboo',
-            'jiggaboo',
-            'jiggerboo',
-            'jizz',
-            'jugg',
-            'kike',
-            'kinbaku',
-            'kinkster',
-            'kinky',
-            'knobbing',
-            'leather restraint',
-            'lemon party',
-            'lolita',
-            'lovemaking',
-            'make me come',
-            'male squirting',
-            'masturb',
-            'menage a trois',
-            'milf',
-            'missionary position',
-            'motherfuck',
-            'mound of venus',
-            'mr hand',
-            'muff diver',
-            'muffdiv',
-            'muffdiving',
-            'nambla',
-            'nawashi',
-            'negro',
-            'neonazi',
-            'nig nog',
-            'nigga',
-            'nigger',
-            'nimphomania',
-            'nipple',
-            'nip',
-            'not safe for',
-            'nsfl',
-            'nsfw',
-            'nude',
-            'nudes',
-            'nudity',
-            'nut sack',
-            'nutsack',
-            'nympho',
-            'nymphomania',
-            'octopussy',
-            'omorashi',
-            'one night stand',
-            'orgasm',
-            'orgy',
-            'paedophil',
-            'paedophile',
-            'panties',
-            'panty',
-            'pedobear',
-            'pedophil',
-            'pedophile',
-            'pee',
-            'pegging',
-            'peni',
-            'penis',
-            'phone sex',
-            'pigfucker',
-            'piss pig',
-            'piss',
-            'pissing',
-            'pisspig',
-            'playboy',
-            'pleasure chest',
-            'pole smoker',
-            'ponyplay',
-            'poof',
-            'poop chute',
-            'poopchute',
-            'porn',
-            'pornhub',
-            'porno',
-            'pornographi',
-            'pornography',
-            'prince albert',
-            'pthc',
-            'pube',
-            'pussi',
-            'pussies',
-            'pussy',
-            'queaf',
-            'queer',
-            'raghead',
-            'raging boner',
-            'rape',
-            'raping',
-            'rapist',
-            'rectum',
-            'reverse cowgirl',
-            'rimjob',
-            'rimming',
-            'rosy palm',
-            'rusty trombone',
-            's & m',
-            's&m',
-            's+m',
-            'sadism',
-            'scat',
-            'schlong',
-            'scissoring',
-            'semen',
-            'sex',
-            'sexi',
-            'sexo',
-            'sexy',
-            'shaved beaver',
-            'shaved pussy',
-            'shemale',
-            'shibari',
-            'shit',
-            'shota',
-            'shrimping',
-            'slanteye',
-            'slut',
-            'smut',
-            'snatch',
-            'snm',
-            'snowballing',
-            'sodomi',
-            'sodomize',
-            'sodomy',
-            'spic',
-            'spooge',
-            'spread legs',
-            'squirting',
-            'strap on',
-            'strapon',
-            'strappado',
-            'strip club',
-            'style doggy',
-            'suck',
-            'suicide girls',
-            'sultry women',
-            'swastika',
-            'swinger',
-            'taint',
-            'tainted love',
-            'taste my',
-            'tea bagging',
-            'threesome',
-            'throating',
-            'tied up',
-            'tight white',
-            'tit',
-            'tits',
-            'titti',
-            'titties',
-            'titty',
-            'tongue in',
-            'topless',
-            'tosser',
-            'towelhead',
-            'tranny',
-            'tribadism',
-            'tub girl',
-            'tubgirl',
-            'tushy',
-            'twat',
-            'twink',
-            'twinki',
-            'twinkie',
-            'undress',
-            'upskirt',
-            'urethra play',
-            'urophilia',
-            'vag',
-            'vagina',
-            'venus mound',
-            'vibrator',
-            'violet blue',
-            'violet wand',
-            'vorarephilia',
-            'voyeur',
-            'vulva',
-            'wank',
-            'wet dream',
-            'wetback',
-            'white power',
-            'whore',
-            'women rapping',
-            'wrapping men',
-            'wrinkled starfish',
-            'xx',
-            'xxx',
-            'yaoi',
-            'yellow shower',
-            'yiffy',
-            'zoophilia',
-        ])
+        self.bad_words = set(
+            [
+                'acrotomophilia',
+                'anal',
+                'analingus',
+                'anally',
+                'anilingus',
+                'anus',
+                'arsehol',
+                'arsehole',
+                'ass',
+                'asses',
+                'asshol',
+                'asshole',
+                'assmunch',
+                'auto erot',
+                'auto erotic',
+                'autoerotic',
+                'babeland',
+                'babi batter',
+                'baby batter',
+                'ball gag',
+                'ball gravi',
+                'ball gravy',
+                'ball kick',
+                'ball kicking',
+                'ball lick',
+                'ball licking',
+                'ball sack',
+                'ball suck',
+                'ball sucking',
+                'ball zack',
+                'bangbro',
+                'bangbros',
+                'bare legal',
+                'bareback',
+                'barely legal',
+                'barenak',
+                'barenaked',
+                'bastardo',
+                'bastinado',
+                'bbc',
+                'bbw',
+                'bdsm',
+                'beaver cleaver',
+                'beaver lip',
+                'beaver lips',
+                'bestial',
+                'bestiality',
+                'bi curiou',
+                'bi curious',
+                'big black',
+                'big breasts',
+                'big knocker',
+                'big knockers',
+                'big tit',
+                'big tits',
+                'bimbo',
+                'birdlock',
+                'bitch',
+                'bitches',
+                'black cock',
+                'blond action',
+                'blond on blond',
+                'blonde action',
+                'blow j',
+                'blow job',
+                'blowjob',
+                'blow my',
+                'blow me',
+                'blow ourselv',
+                'blow ourselves',
+                'blow your load',
+                'blue waffl',
+                'blue waffle',
+                'blumpkin',
+                'bollock',
+                'bollocks',
+                'bondag',
+                'bondage',
+                'boner',
+                'boob',
+                'boobs',
+                'booti call',
+                'booty call',
+                'breast',
+                'breasts',
+                'brown shower',
+                'brown showers',
+                'brunett action',
+                'brunette action',
+                'bukkak',
+                'bukkake',
+                'bulldyk',
+                'bulldyke',
+                'bullet vibe',
+                'bullshit',
+                'bung hole',
+                'bunghol',
+                'bunghole',
+                'busti',
+                'busty',
+                'butt',
+                'buttcheek',
+                'buttcheeks',
+                'butthol',
+                'butthole',
+                'camel toe',
+                'camgirl',
+                'camslut',
+                'camwhore',
+                'carpet muncher',
+                'carpetmuncher',
+                'chocol rosebud',
+                'chocolate rosebuds',
+                'circlejerk',
+                'chink',
+                'cleveland steamer',
+                'clit',
+                'clitori',
+                'clitoris',
+                'clover clamp',
+                'clover clamps',
+                'clusterfuck',
+                'cock',
+                'cocks',
+                'coprolagnia',
+                'coprophilia',
+                'cornhol',
+                'cornhole',
+                'cream pie',
+                'creampi',
+                'creampie',
+                'cum',
+                'cumming',
+                'cunnilingu',
+                'cunnilingus',
+                'cunt',
+                'damn',
+                'darki',
+                'darkie',
+                'date rape',
+                'daterap',
+                'daterape',
+                'deep throat',
+                'deepthroat',
+                'dick',
+                'dildo',
+                'dirti pillow',
+                'dirti sanchez',
+                'dirty pillow',
+                'dirty sanchez',
+                'dog style',
+                'doggi style',
+                'doggie style',
+                'doggiestyl',
+                'doggiestyle',
+                'doggystyle',
+                'dolcett',
+                'domination',
+                'dominatrix',
+                'domm',
+                'dommes',
+                'donkey punch',
+                'doubl dick',
+                'doubl dong',
+                'doubl penetr',
+                'double dick',
+                'double dong',
+                'double penetration',
+                'dp action',
+                'dtf',
+                'eat my ass',
+                'ecchi',
+                'ejacul',
+                'erection',
+                'erotic',
+                'erotism',
+                'escort',
+                'ethical slut',
+                'eunuch',
+                'faggot',
+                'fecal',
+                'felch',
+                'fellatio',
+                'feltch',
+                'female squirting',
+                'femdom',
+                'figging',
+                'fingered',
+                'fingering',
+                'fingers',
+                'fisted',
+                'fisting',
+                'fists',
+                'foot fetish',
+                'footjob',
+                'frotting',
+                'fuck button',
+                'fuck',
+                'fucked',
+                'fucker',
+                'fuckhead',
+                'fuckin',
+                'fucking',
+                'fudge packer',
+                'fudgepack',
+                'fudgepacker',
+                'futanari',
+                'g spot',
+                'g-spot',
+                'gang bang',
+                'gay sex',
+                'gee spot',
+                'genital',
+                'giant cock',
+                'girl gone wild',
+                'girl on top',
+                'girl on',
+                'goatcx',
+                'goatse',
+                'goddamn',
+                'gokkun',
+                'golden shower',
+                'goo girl',
+                'goodpoop',
+                'goregasm',
+                'grope',
+                'group sex',
+                'gspot',
+                'guro',
+                'hand job',
+                'handjob',
+                'hard core',
+                'hardcore',
+                'hentai',
+                'homoerotic',
+                'honkey',
+                'hooker',
+                'horni',
+                'horny',
+                'hot chick',
+                'how to kill',
+                'how to murder',
+                'huge fat',
+                'humped',
+                'humping',
+                'humps',
+                'incest',
+                'intercourse',
+                'jack off',
+                'jail bait',
+                'jailbait',
+                'jerk off',
+                'jigaboo',
+                'jiggaboo',
+                'jiggerboo',
+                'jizz',
+                'jugg',
+                'kike',
+                'kinbaku',
+                'kinkster',
+                'kinky',
+                'knobbing',
+                'leather restraint',
+                'lemon party',
+                'lolita',
+                'lovemaking',
+                'make me come',
+                'male squirting',
+                'masturb',
+                'menage a trois',
+                'milf',
+                'missionary position',
+                'motherfuck',
+                'mound of venus',
+                'mr hand',
+                'muff diver',
+                'muffdiv',
+                'muffdiving',
+                'nambla',
+                'nawashi',
+                'negro',
+                'neonazi',
+                'nig nog',
+                'nigga',
+                'nigger',
+                'nimphomania',
+                'nipple',
+                'nip',
+                'not safe for',
+                'nsfl',
+                'nsfw',
+                'nude',
+                'nudes',
+                'nudity',
+                'nut sack',
+                'nutsack',
+                'nympho',
+                'nymphomania',
+                'octopussy',
+                'omorashi',
+                'one night stand',
+                'orgasm',
+                'orgy',
+                'paedophil',
+                'paedophile',
+                'panties',
+                'panty',
+                'pedobear',
+                'pedophil',
+                'pedophile',
+                'pee',
+                'pegging',
+                'peni',
+                'penis',
+                'phone sex',
+                'pigfucker',
+                'piss pig',
+                'piss',
+                'pissing',
+                'pisspig',
+                'playboy',
+                'pleasure chest',
+                'pole smoker',
+                'ponyplay',
+                'poof',
+                'poop chute',
+                'poopchute',
+                'porn',
+                'pron',
+                'pornhub',
+                'porno',
+                'pornographi',
+                'pornography',
+                'prince albert',
+                'pthc',
+                'pube',
+                'pussi',
+                'pussies',
+                'pussy',
+                'queaf',
+                'queer',
+                'raghead',
+                'raging boner',
+                'rape',
+                'raping',
+                'rapist',
+                'rectum',
+                'reverse cowgirl',
+                'rimjob',
+                'rimming',
+                'rosy palm',
+                'rusty trombone',
+                's & m',
+                's&m',
+                's+m',
+                'sadism',
+                'scat',
+                'schlong',
+                'scissoring',
+                'semen',
+                'sex',
+                'sexi',
+                'sexo',
+                'sexy',
+                'shaved beaver',
+                'shaved pussy',
+                'shemale',
+                'shibari',
+                'shit',
+                'shota',
+                'shrimping',
+                'slanteye',
+                'slut',
+                'smut',
+                'snatch',
+                'snm',
+                'snowballing',
+                'sodomi',
+                'sodomize',
+                'sodomy',
+                'spic',
+                'spooge',
+                'spread legs',
+                'squirting',
+                'strap on',
+                'strapon',
+                'strappado',
+                'strip club',
+                'style doggy',
+                'suck',
+                'suicide girls',
+                'sultry women',
+                'swastika',
+                'swinger',
+                'taint',
+                'tainted love',
+                'taste my',
+                'tea bagging',
+                'threesome',
+                'throating',
+                'tied up',
+                'tight white',
+                'tit',
+                'tits',
+                'titti',
+                'titties',
+                'titty',
+                'tongue in',
+                'topless',
+                'tosser',
+                'towelhead',
+                'tranny',
+                'tribadism',
+                'tub girl',
+                'tubgirl',
+                'tushy',
+                'twat',
+                'twink',
+                'twinki',
+                'twinkie',
+                'undress',
+                'upskirt',
+                'urethra play',
+                'urophilia',
+                'vag',
+                'vagina',
+                'venus mound',
+                'vibrator',
+                'violet blue',
+                'violet wand',
+                'vorarephilia',
+                'voyeur',
+                'vulva',
+                'wank',
+                'wet dream',
+                'wetback',
+                'white power',
+                'whore',
+                'women rapping',
+                'wrapping men',
+                'wrinkled starfish',
+                'xx',
+                'xxx',
+                'yaoi',
+                'yellow shower',
+                'yiffy',
+                'zoophilia',
+            ]
+        )
         self.stemmer = PorterStemmer()
 
     def _normalize(self, text: str) -> str:
+        """Normalize text.
+
+        >>> _normalize('Tittie5')
+        'titties'
+
+        >>> _normalize('Suck a Dick!')
+        'suck a dick'
+
+        >>> _normalize('fucking a whore')
+        'fuck a whore'
+
+        """
         result = text.lower()
         result = result.replace("_", " ")
+        result = result.replace('0', 'o')
+        result = result.replace('1', 'l')
+        result = result.replace('4', 'a')
+        result = result.replace('5', 's')
+        result = result.replace('3', 'e')
         for x in string.punctuation:
             result = result.replace(x, "")
-        chunks = [
-            self.stemmer.stem(word) for word in nltk.word_tokenize(result)
-        ]
+        chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
         return ' '.join(chunks)
 
+    def tokenize(self, text: str):
+        """Yield the pieces of each nltk token in text, split on runs of non-word characters."""
+        for token in nltk.word_tokenize(text):
+            yield from re.split(r'\W+', token)
+
     def contains_bad_word(self, text: str) -> bool:
-        words = nltk.word_tokenize(text)
+        """Returns True if text contains a bad word (or more than one)
+        and False if no bad words were detected.
+
+        >>> contains_bad_word('fuck you')
+        True
+
+        >>> contains_bad_word('FucK u')
+        True
+
+        >>> contains_bad_word('FuK U')
+        False
+
+        """
+        words = [word for word in self.tokenize(text)]
         for word in words:
             if self.is_bad_word(word):
                 logger.debug(f'"{word}" is profanity')
@@ -501,12 +537,13 @@ class ProfanityFilter(object):
         return False
 
     def is_bad_word(self, word: str) -> bool:
-        return (
-            word in self.bad_words or
-            self._normalize(word) in self.bad_words
-        )
+        return word in self.bad_words or self._normalize(word) in self.bad_words
 
     def obscure_bad_words(self, text: str) -> str:
+        """Obscure bad words that are detected by inserting random punctuation
+        characters.
+
+        """
 
         def obscure(word: str):
             out = ''
@@ -523,7 +560,7 @@ class ProfanityFilter(object):
                             break
             return out
 
-        words = nltk.word_tokenize(text)
+        words = list(self.tokenize(text))  # tokenize() is a generator; the appends below need a list
         words.append('')
         words.append('')
         words.append('')
@@ -550,6 +587,9 @@ class ProfanityFilter(object):
 
 
 def main() -> None:
+    import doctest
+
+    doctest.testmod()
     pf = ProfanityFilter()
     phrase = ' '.join(sys.argv[1:])
     print(pf.contains_bad_word(phrase))