Changes ;)
[kiosk.git] / bellevue_reporter_rss_renderer.py
index 104147dfc170facc6008f4ecef4e5c3ad98be125..b8fd27bafed6e9fe97b4562d6fb044615e69c7e7 100644 (file)
@@ -3,6 +3,7 @@
 import re
 from typing import List, Dict
 import xml
+import xml.etree.ElementTree as ET
 
 import generic_news_rss_renderer as gnrss
 
@@ -23,7 +24,7 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer):
         self.debug = True
 
     def debug_prefix(self) -> str:
-        return "bellevue_reporter(%s)" % (self.page_title)
+        return f"bellevue_reporter({self.page_title})"
 
     def get_headlines_page_prefix(self) -> str:
         return "bellevue-reporter"
@@ -34,7 +35,7 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer):
     def should_use_https(self) -> bool:
         return True
 
-    def munge_description(self, description: str) -> str:
+    def munge_description(self, description: str, item: ET.Element) -> str:
         description = re.sub("<[^>]+>", "", description)
         description = re.sub(
             "Bellevue\s+Reporter\s+Bellevue\s+Reporter", "", description
@@ -56,31 +57,52 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer):
     def looks_like_review(title: str, description: str) -> bool:
         return "review" in title or "Review" in title
 
+    @staticmethod
+    def looks_like_spam(title: str, description: str) -> bool:
+        """Return True if the title or description carries a known spam marker.
+
+        A None description (item without a <description>) is treated as empty."""
+        description = description or ''
+        return (
+            'marketplace' in description  # also covers 'national-marketplace'
+            or 'THC' in title
+            or re.search('[Ww]eed|[Cc]annabis|[Ll]ose [Ww]eight', title) is not None
+            or re.search('[Cc]annabis|[Ll]ose [Ww]eight', description) is not None
+        )
+
     def item_is_interesting_for_headlines(
         self, title: str, description: str, item: xml.etree.ElementTree.Element
     ) -> bool:
+        unfiltered_description = item.findtext("description")
         if self.is_item_older_than_n_days(item, 10):
-            self.debug_print("%s: is too old!" % title)
+            self.debug_print(f'{title}: is too old!')
+            return False
+        if bellevue_reporter_rss_renderer.looks_like_spam(title, unfiltered_description):
+            self.debug_print(f'{title}: looks like spam')
             return False
         if bellevue_reporter_rss_renderer.looks_like_football(title, description):
-            self.debug_print("%s: looks like it's about football." % title)
+            self.debug_print(f'{title}: looks like it\'s about football.')
             return False
         if bellevue_reporter_rss_renderer.looks_like_review(title, description):
-            self.debug_print("%s: looks like bullshit." % title)
+            self.debug_print(f'{title}: looks like a review.')
             return False
         return True
 
     def item_is_interesting_for_article(
         self, title: str, description: str, item: xml.etree.ElementTree.Element
     ) -> bool:
+        unfiltered_description = item.findtext("description")
         if self.is_item_older_than_n_days(item, 10):
-            self.debug_print("%s: is too old!" % title)
+            self.debug_print(f'{title}: is too old!')
+            return False
+        if bellevue_reporter_rss_renderer.looks_like_spam(title, unfiltered_description):
+            self.debug_print(f'{title}: looks like spam')
             return False
         if bellevue_reporter_rss_renderer.looks_like_football(title, description):
-            self.debug_print("%s: looks like it's about football." % title)
+            self.debug_print(f'{title}: looks like it\'s about football.')
             return False
         if bellevue_reporter_rss_renderer.looks_like_review(title, description):
-            self.debug_print("%s: looks like bullshit." % title)
+            self.debug_print(f'{title}: looks like a review.')
             return False
         return True