projects
/
python_utils.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
36fe954
)
Make profanity filter catch foo/bar where foo and/or bar are bad
author
Scott
<
[email protected]
>
Mon, 24 Jan 2022 00:13:44 +0000
(16:13 -0800)
committer
Scott
<
[email protected]
>
Mon, 24 Jan 2022 00:13:44 +0000
(16:13 -0800)
words.
profanity_filter.py
patch
|
blob
|
history
diff --git a/profanity_filter.py b/profanity_filter.py
index db014e1704742c7cab01bc6e7ca1f6ca7f874de5..3109f166af211d0160aeca81ddf72e526ceaf2d3 100755
(executable)
--- a/profanity_filter.py
+++ b/profanity_filter.py
@@ -2,6 +2,7 @@
import logging
import random
import logging
import random
+import re
import string
import sys
import string
import sys
@@ -496,6 +497,11 @@ class ProfanityFilter(object):
]
return ' '.join(chunks)
]
return ' '.join(chunks)
+ def tokenize(self, text: str):
+ for x in nltk.word_tokenize(text):
+ for y in re.split('\W+', x):
+ yield y
+
def contains_bad_word(self, text: str) -> bool:
"""Returns True if text contains a bad word (or more than one)
and False if no bad words were detected.
def contains_bad_word(self, text: str) -> bool:
"""Returns True if text contains a bad word (or more than one)
and False if no bad words were detected.
@@ -510,7 +516,7 @@ class ProfanityFilter(object):
False
"""
False
"""
- words = nltk.word_tokenize(text)
+ words = [word for word in self.tokenize(text)]
for word in words:
if self.is_bad_word(word):
logger.debug(f'"{word}" is profanity')
for word in words:
if self.is_bad_word(word):
logger.debug(f'"{word}" is profanity')
@@ -557,7 +563,7 @@ class ProfanityFilter(object):
break
return out
break
return out
- words = nltk.word_tokenize(text)
+ words = self.tokenize(text)
words.append('')
words.append('')
words.append('')
words.append('')
words.append('')
words.append('')