X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=bellevue_reporter_rss_renderer.py;h=b8fd27bafed6e9fe97b4562d6fb044615e69c7e7;hb=5c39d86ebc075ccb7be98b1dfab8040b72ff9134;hp=104147dfc170facc6008f4ecef4e5c3ad98be125;hpb=0bee476055b15d8de59c74b61e5c56f214b83941;p=kiosk.git diff --git a/bellevue_reporter_rss_renderer.py b/bellevue_reporter_rss_renderer.py index 104147d..b8fd27b 100644 --- a/bellevue_reporter_rss_renderer.py +++ b/bellevue_reporter_rss_renderer.py @@ -3,6 +3,7 @@ import re from typing import List, Dict import xml +import xml.etree.ElementTree as ET import generic_news_rss_renderer as gnrss @@ -23,7 +24,7 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer): self.debug = True def debug_prefix(self) -> str: - return "bellevue_reporter(%s)" % (self.page_title) + return f"bellevue_reporter({self.page_title})" def get_headlines_page_prefix(self) -> str: return "bellevue-reporter" @@ -34,7 +35,7 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer): def should_use_https(self) -> bool: return True - def munge_description(self, description: str) -> str: + def munge_description(self, description: str, item: ET.Element) -> str: description = re.sub("<[^>]+>", "", description) description = re.sub( "Bellevue\s+Reporter\s+Bellevue\s+Reporter", "", description @@ -56,31 +57,52 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer): def looks_like_review(title: str, description: str) -> bool: return "review" in title or "Review" in title + @staticmethod + def looks_like_spam(title: str, description: str) -> bool: + return ( + 'marketplace' in description + or 'national-marketplace' in description + or re.search('[Ww]eed', title) is not None + or re.search('[Cc]annabis', title) is not None + or re.search('[Cc]annabis', description) is not None + or 'THC' in title + or re.search('[Ll]ose [Ww]eight', title) is not None + or re.search('[Ll]ose [Ww]eight', description) is not None + ) + def item_is_interesting_for_headlines( self, title: str, description: str, item: xml.etree.ElementTree.Element ) -> bool: + unfiltered_description = item.findtext("description") if self.is_item_older_than_n_days(item, 10): - self.debug_print("%s: is too old!" % title) + self.debug_print(f'{title}: is too old!') + return False + if bellevue_reporter_rss_renderer.looks_like_spam(title, unfiltered_description): + self.debug_print(f'{title}: looks like spam') return False if bellevue_reporter_rss_renderer.looks_like_football(title, description): - self.debug_print("%s: looks like it's about football." % title) + self.debug_print(f'{title}: looks like it\'s about football.') return False if bellevue_reporter_rss_renderer.looks_like_review(title, description): - self.debug_print("%s: looks like bullshit." % title) + self.debug_print(f'{title}: looks like a review.') return False return True def item_is_interesting_for_article( self, title: str, description: str, item: xml.etree.ElementTree.Element ) -> bool: + unfiltered_description = item.findtext("description") if self.is_item_older_than_n_days(item, 10): - self.debug_print("%s: is too old!" % title) + self.debug_print(f'{title}: is too old!') + return False + if bellevue_reporter_rss_renderer.looks_like_spam(title, unfiltered_description): + self.debug_print(f'{title}: looks like spam') return False if bellevue_reporter_rss_renderer.looks_like_football(title, description): - self.debug_print("%s: looks like it's about football." % title) + self.debug_print(f'{title}: looks like it\'s about football.') return False if bellevue_reporter_rss_renderer.looks_like_review(title, description): - self.debug_print("%s: looks like bullshit." % title) + self.debug_print(f'{title}: looks like a review.') return False return True