Adds profanity filter, fixes bugs.
[python_utils.git] / profanity_filter.py
diff --git a/profanity_filter.py b/profanity_filter.py
new file mode 100755 (executable)
index 0000000..e1b4743
--- /dev/null
@@ -0,0 +1,556 @@
+#!/usr/bin/env python3
+
+import logging
+import random
+import string
+import sys
+
+import nltk
+from nltk.stem import PorterStemmer
+
+import string_utils
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class ProfanityFilter(object):
+    def __init__(self):
+        """Create the stemmer and the set of blocked words and phrases.
+
+        Entries are lowercase and contain one to three space-separated
+        words.  Many appear both in full and in a truncated spelling (for
+        example 'barenak' next to 'barenaked'); presumably the truncated
+        spellings are Porter-stemmer output so that normalized text matches
+        -- confirm before pruning the list.
+        """
+        self.stemmer = PorterStemmer()
+        self.bad_words = {
+            'acrotomophilia',
+            'anal',
+            'analingus',
+            'anally',
+            'anilingus',
+            'anus',
+            'arsehol',
+            'arsehole',
+            'ass',
+            'asses',
+            'asshol',
+            'asshole',
+            'assmunch',
+            'auto erot',
+            'auto erotic',
+            'autoerotic',
+            'babeland',
+            'babi batter',
+            'baby batter',
+            'ball gag',
+            'ball gravi',
+            'ball gravy',
+            'ball kick',
+            'ball kicking',
+            'ball lick',
+            'ball licking',
+            'ball sack',
+            'ball suck',
+            'ball sucking',
+            'ball zack',
+            'bangbro',
+            'bangbros',
+            'bare legal',
+            'bareback',
+            'barely legal',
+            'barenak',
+            'barenaked',
+            'bastardo',
+            'bastinado',
+            'bbc',
+            'bbw',
+            'bdsm',
+            'beaver cleaver',
+            'beaver lip',
+            'beaver lips',
+            'bestial',
+            'bestiality',
+            'bi curiou',
+            'bi curious',
+            'big black',
+            'big breasts',
+            'big knocker',
+            'big knockers',
+            'big tit',
+            'big tits',
+            'bimbo',
+            'birdlock',
+            'bitch',
+            'bitches',
+            'black cock',
+            'blond action',
+            'blond on blond',
+            'blonde action',
+            'blow j',
+            'blow job',
+            'blow my',
+            'blow me',
+            'blow ourselv',
+            'blow ourselves',
+            'blow your load',
+            'blue waffl',
+            'blue waffle',
+            'blumpkin',
+            'bollock',
+            'bollocks',
+            'bondag',
+            'bondage',
+            'boner',
+            'boob',
+            'boobs',
+            'booti call',
+            'booty call',
+            'breast',
+            'breasts',
+            'brown shower',
+            'brown showers',
+            'brunett action',
+            'brunette action',
+            'bukkak',
+            'bukkake',
+            'bulldyk',
+            'bulldyke',
+            'bullet vibe',
+            'bullshit',
+            'bung hole',
+            'bunghol',
+            'bunghole',
+            'busti',
+            'busty',
+            'butt',
+            'buttcheek',
+            'buttcheeks',
+            'butthol',
+            'butthole',
+            'camel toe',
+            'camgirl',
+            'camslut',
+            'camwhore',
+            'carpet muncher',
+            'carpetmuncher',
+            'chocol rosebud',
+            'chocolate rosebuds',
+            'circlejerk',
+            'chink',
+            'cleveland steamer',
+            'clit',
+            'clitori',
+            'clitoris',
+            'clover clamp',
+            'clover clamps',
+            'clusterfuck',
+            'cock',
+            'cocks',
+            'coprolagnia',
+            'coprophilia',
+            'cornhol',
+            'cornhole',
+            'cream pie',
+            'creampi',
+            'creampie',
+            'cum',
+            'cumming',
+            'cunnilingu',
+            'cunnilingus',
+            'cunt',
+            'damn',
+            'darki',
+            'darkie',
+            'date rape',
+            'daterap',
+            'daterape',
+            'deep throat',
+            'deepthroat',
+            'dick',
+            'dildo',
+            'dirti pillow',
+            'dirti sanchez',
+            'dirty pillow',
+            'dirty sanchez',
+            'dog style',
+            'doggi style',
+            'doggie style',
+            'doggiestyl',
+            'doggiestyle',
+            'doggystyle',
+            'dolcett',
+            'domination',
+            'dominatrix',
+            'domm',
+            'dommes',
+            'donkey punch',
+            'doubl dick',
+            'doubl dong',
+            'doubl penetr',
+            'double dick',
+            'double dong',
+            'double penetration',
+            'dp action',
+            'dtf',
+            'eat my ass',
+            'ecchi',
+            'ejacul',
+            'erection',
+            'erotic',
+            'erotism',
+            'escort',
+            'ethical slut',
+            'eunuch',
+            'faggot',
+            'fecal',
+            'felch',
+            'fellatio',
+            'feltch',
+            'female squirting',
+            'femdom',
+            'figging',
+            'fingered',
+            'fingering',
+            'fingers',
+            'fisted',
+            'fisting',
+            'fists',
+            'foot fetish',
+            'footjob',
+            'frotting',
+            'fuck button',
+            'fuck',
+            'fucked',
+            'fucker',
+            'fuckhead',
+            'fuckin',
+            'fucking',
+            'fudge packer',
+            'fudgepack',
+            'fudgepacker',
+            'futanari',
+            'g spot',
+            'g-spot',
+            'gang bang',
+            'gay sex',
+            'gee spot',
+            'genital',
+            'giant cock',
+            'girl gone wild',
+            'girl on top',
+            'girl on',
+            'goatcx',
+            'goatse',
+            'goddamn',
+            'gokkun',
+            'golden shower',
+            'goo girl',
+            'goodpoop',
+            'goregasm',
+            'grope',
+            'group sex',
+            'gspot',
+            'guro',
+            'hand job',
+            'handjob',
+            'hard core',
+            'hardcore',
+            'hentai',
+            'homoerotic',
+            'honkey',
+            'hooker',
+            'horni',
+            'horny',
+            'hot chick',
+            'how to kill',
+            'how to murder',
+            'huge fat',
+            'humped',
+            'humping',
+            'humps',
+            'incest',
+            'intercourse',
+            'jack off',
+            'jail bait',
+            'jailbait',
+            'jerk off',
+            'jigaboo',
+            'jiggaboo',
+            'jiggerboo',
+            'jizz',
+            'jugg',
+            'kike',
+            'kinbaku',
+            'kinkster',
+            'kinky',
+            'knobbing',
+            'leather restraint',
+            'lemon party',
+            'lolita',
+            'lovemaking',
+            'make me come',
+            'male squirting',
+            'masturb',
+            'menage a trois',
+            'milf',
+            'missionary position',
+            'motherfuck',
+            'mound of venus',
+            'mr hand',
+            'muff diver',
+            'muffdiv',
+            'muffdiving',
+            'nambla',
+            'nawashi',
+            'negro',
+            'neonazi',
+            'nig nog',
+            'nigga',
+            'nigger',
+            'nimphomania',
+            'nipple',
+            'nip',
+            'not safe for',
+            'nsfl',
+            'nsfw',
+            'nude',
+            'nudes',
+            'nudity',
+            'nut sack',
+            'nutsack',
+            'nympho',
+            'nymphomania',
+            'octopussy',
+            'omorashi',
+            'one night stand',
+            'orgasm',
+            'orgy',
+            'paedophil',
+            'paedophile',
+            'panties',
+            'panty',
+            'pedobear',
+            'pedophil',
+            'pedophile',
+            'pee',
+            'pegging',
+            'peni',
+            'penis',
+            'phone sex',
+            'pigfucker',
+            'piss pig',
+            'piss',
+            'pissing',
+            'pisspig',
+            'playboy',
+            'pleasure chest',
+            'pole smoker',
+            'ponyplay',
+            'poof',
+            'poop chute',
+            'poopchute',
+            'porn',
+            'pornhub',
+            'porno',
+            'pornographi',
+            'pornography',
+            'prince albert',
+            'pthc',
+            'pube',
+            'pussi',
+            'pussies',
+            'pussy',
+            'queaf',
+            'queer',
+            'raghead',
+            'raging boner',
+            'rape',
+            'raping',
+            'rapist',
+            'rectum',
+            'reverse cowgirl',
+            'rimjob',
+            'rimming',
+            'rosy palm',
+            'rusty trombone',
+            's & m',
+            's&m',
+            's+m',
+            'sadism',
+            'scat',
+            'schlong',
+            'scissoring',
+            'semen',
+            'sex',
+            'sexi',
+            'sexo',
+            'sexy',
+            'shaved beaver',
+            'shaved pussy',
+            'shemale',
+            'shibari',
+            'shit',
+            'shota',
+            'shrimping',
+            'slanteye',
+            'slut',
+            'smut',
+            'snatch',
+            'snm',
+            'snowballing',
+            'sodomi',
+            'sodomize',
+            'sodomy',
+            'spic',
+            'spooge',
+            'spread legs',
+            'squirting',
+            'strap on',
+            'strapon',
+            'strappado',
+            'strip club',
+            'style doggy',
+            'suck',
+            'suicide girls',
+            'sultry women',
+            'swastika',
+            'swinger',
+            'taint',
+            'tainted love',
+            'taste my',
+            'tea bagging',
+            'threesome',
+            'throating',
+            'tied up',
+            'tight white',
+            'tit',
+            'tits',
+            'titti',
+            'titties',
+            'titty',
+            'tongue in',
+            'topless',
+            'tosser',
+            'towelhead',
+            'tranny',
+            'tribadism',
+            'tub girl',
+            'tubgirl',
+            'tushy',
+            'twat',
+            'twink',
+            'twinki',
+            'twinkie',
+            'undress',
+            'upskirt',
+            'urethra play',
+            'urophilia',
+            'vag',
+            'vagina',
+            'venus mound',
+            'vibrator',
+            'violet blue',
+            'violet wand',
+            'vorarephilia',
+            'voyeur',
+            'vulva',
+            'wank',
+            'wet dream',
+            'wetback',
+            'white power',
+            'whore',
+            'women rapping',
+            'wrapping men',
+            'wrinkled starfish',
+            'xx',
+            'xxx',
+            'yaoi',
+            'yellow shower',
+            'yiffy',
+            'zoophilia',
+        }
+
+    def _normalize(self, text: str) -> str:
+        """Return *text* lowercased, stripped of punctuation, and stemmed.
+
+        Underscores become spaces and every other character in
+        ``string.punctuation`` is removed.  The result is tokenized with
+        ``nltk.word_tokenize``, each token is passed through the stemmer,
+        and the stems are joined with single spaces.
+        """
+        # One translation pass: drop punctuation, but map '_' to a space.
+        table = {ord(mark): None for mark in string.punctuation}
+        table[ord('_')] = ' '
+        cleaned = text.lower().translate(table)
+        tokens = nltk.word_tokenize(cleaned)
+        return ' '.join(self.stemmer.stem(token) for token in tokens)
+
+    def contains_bad_word(self, text: str) -> bool:
+        """Return True if *text* contains a blocked word or phrase.
+
+        The text is tokenized with ``nltk.word_tokenize``.  Single tokens
+        are checked first, then every bigram, then every trigram; the
+        first match is logged at debug level and ends the search.
+
+        Args:
+            text: The text to scan.
+
+        Returns:
+            True when any token, bigram or trigram is profane, else False.
+        """
+        words = nltk.word_tokenize(text)
+        for word in words:
+            if self.is_bad_word(word):
+                logger.debug('"%s" is profanity', word)
+                return True
+
+        # Multi-word phrases: only build n-grams the text is long enough for.
+        for size in (2, 3):
+            if len(words) < size:
+                break
+            for ngram in string_utils.ngrams_presplit(words, size):
+                if self.is_bad_word(ngram):
+                    # Pass the phrase as a logging argument so the matched
+                    # text is interpolated (the old message lacked the
+                    # f-prefix and logged the placeholder literally).
+                    logger.debug('"%s" is profanity', ngram)
+                    return True
+        return False
+
+    def is_bad_word(self, word: str) -> bool:
+        """Return True if *word* is blocked, either verbatim or normalized.
+
+        *word* may be a single token or a space-separated phrase.
+        """
+        # Exact hit first; normalization is only needed when that misses.
+        if word in self.bad_words:
+            return True
+        return self._normalize(word) in self.bad_words
+
+    def obscure_bad_words(self, text: str) -> str:
+        """Return *text* with every profane word or phrase masked.
+
+        The text is tokenized with ``nltk.word_tokenize`` and scanned left
+        to right.  At each position the longest profane n-gram (trigram,
+        then bigram, then single token) is replaced by random symbols;
+        whitespace inside a masked phrase is kept.  Tokens are re-joined
+        with single spaces, so the original spacing is not preserved.
+
+        Note: matching goes through ``is_bad_word``, which also compares a
+        punctuation-stripped form, so a punctuation token next to a
+        profane word can be masked together with it.
+
+        Args:
+            text: The text to clean.
+
+        Returns:
+            The space-joined tokens with profane spans masked.
+        """
+        symbols = ['#', '%', '!', '@', '&', '*']
+
+        def obscure(phrase: str) -> str:
+            """Mask non-space characters without repeating a symbol."""
+            masked = []
+            last = ''
+            for letter in phrase:
+                if letter.isspace():
+                    masked.append(letter)
+                    continue
+                # Redraw until the symbol differs from the previous one.
+                char = random.choice(symbols)
+                while char == last:
+                    char = random.choice(symbols)
+                last = char
+                masked.append(char)
+            return ''.join(masked)
+
+        words = nltk.word_tokenize(text)
+        total = len(words)
+        pieces = []
+        cursor = 0
+        while cursor < total:
+            consumed = 0
+            # Longest match wins.  The window is clipped at the end of the
+            # token list rather than padding it with empty sentinel tokens.
+            for size in (3, 2, 1):
+                if cursor + size > total:
+                    continue
+                candidate = ' '.join(words[cursor:cursor + size])
+                if self.is_bad_word(candidate):
+                    pieces.append(obscure(candidate))
+                    consumed = size
+                    break
+            if not consumed:
+                pieces.append(words[cursor])
+                consumed = 1
+            cursor += consumed
+        return ' '.join(pieces).strip()
+
+
+def main() -> None:
+    """Check the command-line arguments for profanity and print the result.
+
+    All arguments are joined into one phrase.  The first line printed is
+    whether the phrase contains profanity, the second is the phrase with
+    profane spans masked.  The process then exits with status 0.
+    """
+    phrase = ' '.join(sys.argv[1:])
+    profanity_filter = ProfanityFilter()
+    found = profanity_filter.contains_bad_word(phrase)
+    print(found)
+    cleaned = profanity_filter.obscure_bad_words(phrase)
+    print(cleaned)
+    sys.exit(0)
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == '__main__':
+    main()