#!/usr/bin/env python3
"""A helper to identify and optionally obscure some bad words."""

import logging
import random
import re
import decorator_utils
import string_utils
logger = logging.getLogger(__name__)
@decorator_utils.singleton
class ProfanityFilter(object):
    """A helper to identify and optionally obscure some bad words."""

def __init__(self):
self.bad_words = set(
[
'girl gone wild',
'girl on top',
'girl on',
                'give head',
                'giving head',
                'gave head',
'goatcx',
'goatse',
'goddamn',
chunks = [self.stemmer.stem(word) for word in nltk.word_tokenize(result)]
return ' '.join(chunks)
- def tokenize(self, text: str):
+ @staticmethod
+ def tokenize(text: str):
for x in nltk.word_tokenize(text):
- for y in re.split('\W+', x):
+ for y in re.split(r'\W+', x):
yield y
def contains_bad_word(self, text: str) -> bool:
False
"""
- words = [word for word in self.tokenize(text)]
+ words = list(self.tokenize(text))
for word in words:
if self.is_bad_word(word):
- logger.debug(f'"{word}" is profanity')
+ logger.debug('"%s" is profanity', word)
return True
if len(words) > 1:
for bigram in string_utils.ngrams_presplit(words, 2):
bigram = ' '.join(bigram)
if self.is_bad_word(bigram):
- logger.debug(f'"{bigram}" is profanity')
+ logger.debug('"%s" is profanity', bigram)
return True
if len(words) > 2:
for trigram in string_utils.ngrams_presplit(words, 3):
trigram = ' '.join(trigram)
if self.is_bad_word(trigram):
- logger.debug(f'"{trigram}" is profanity')
+ logger.debug('"%s" is profanity', trigram)
return True
return False
break
return out
- words = self.tokenize(text)
+ words = list(self.tokenize(text))
words.append('')
words.append('')
words.append('')