X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=profanity_filter.py;h=3109f166af211d0160aeca81ddf72e526ceaf2d3;hb=12dfb5afcdc42c449364c1207c175de20393a5c1;hp=fe5422179ba9a50c678188e088689184f139a14d;hpb=e8671a716da868332d3ac1f66d4d2f7f8d33fc28;p=python_utils.git

diff --git a/profanity_filter.py b/profanity_filter.py
index fe54221..3109f16 100755
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -2,6 +2,7 @@
 
 import logging
 import random
+import re
 import string
 import sys
 
@@ -470,6 +471,18 @@ class ProfanityFilter(object):
         self.stemmer = PorterStemmer()
 
     def _normalize(self, text: str) -> str:
+        """Normalize text.
+
+        >>> _normalize('Tittie5')
+        'titties'
+
+        >>> _normalize('Suck a Dick!')
+        'suck a dick'
+
+        >>> _normalize('fucking a whore')
+        'fuck a whore'
+
+        """
         result = text.lower()
         result = result.replace("_", " ")
         result = result.replace('0', 'o')
@@ -484,8 +497,26 @@ class ProfanityFilter(object):
         ]
         return ' '.join(chunks)
 
+    def tokenize(self, text: str):
+        for x in nltk.word_tokenize(text):
+            for y in re.split(r'\W+', x):
+                yield y
+
     def contains_bad_word(self, text: str) -> bool:
-        words = nltk.word_tokenize(text)
+        """Returns True if text contains a bad word (or more than one)
+        and False if no bad words were detected.
+
+        >>> contains_bad_word('fuck you')
+        True
+
+        >>> contains_bad_word('FucK u')
+        True
+
+        >>> contains_bad_word('FuK U')
+        False
+
+        """
+        words = [word for word in self.tokenize(text)]
         for word in words:
             if self.is_bad_word(word):
                 logger.debug(f'"{word}" is profanity')
@@ -513,7 +544,10 @@ class ProfanityFilter(object):
         )
 
     def obscure_bad_words(self, text: str) -> str:
+        """Obscure bad words that are detected by inserting random punctuation
+        characters.
+        """
 
         def obscure(word: str):
             out = ''
             last = ''
@@ -529,7 +563,7 @@ class ProfanityFilter(object):
                     break
             return out
 
-        words = nltk.word_tokenize(text)
+        words = [word for word in self.tokenize(text)]
         words.append('')
         words.append('')
         words.append('')
@@ -556,6 +590,8 @@
 
 
 def main() -> None:
+    import doctest
+    doctest.testmod()
     pf = ProfanityFilter()
     phrase = ' '.join(sys.argv[1:])
     print(pf.contains_bad_word(phrase))
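
A minimal usage sketch for the methods this patch adds (not part of the commit itself). It assumes the module is importable as profanity_filter and that nltk, its tokenizer data, and whatever word list ProfanityFilter loads at construction time are available locally.

#!/usr/bin/env python3
# Usage sketch, assuming profanity_filter.py from this patch is on the path
# and nltk (with its tokenizer data) is installed.
from profanity_filter import ProfanityFilter

pf = ProfanityFilter()

# tokenize() is a generator: nltk word tokens, each further split on
# non-word characters via re.split(r'\W+', ...).
print(list(pf.tokenize('FucK u')))

# Expected results per the doctests added in this patch:
print(pf.contains_bad_word('FucK u'))   # True
print(pf.contains_bad_word('FuK U'))    # False

# Per the new docstring, detected bad words are obscured by inserting
# random punctuation characters.
print(pf.obscure_bad_words('fuck you'))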