From: Scott Gasch Date: Tue, 7 Jul 2020 22:46:10 +0000 (-0700) Subject: Cleanup and improve the RSS stuff. X-Git-Url: https://wannabe.guru.org/gitweb/?a=commitdiff_plain;h=08440780a64ab1226bb4447b49ce422edfd1500e;p=kiosk.git Cleanup and improve the RSS stuff. --- diff --git a/bellevue_reporter_rss_renderer.py b/bellevue_reporter_rss_renderer.py index f630aee..c94bbc0 100644 --- a/bellevue_reporter_rss_renderer.py +++ b/bellevue_reporter_rss_renderer.py @@ -30,10 +30,10 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer): return description def item_is_interesting_for_headlines(self, title, description, item): - return True + return not self.is_item_older_than_n_days(item, 10) def item_is_interesting_for_article(self, title, description, item): - return True + return not self.is_item_older_than_n_days(item, 10) # Test #x = bellevue_reporter_rss_renderer( diff --git a/cnn_rss_renderer.py b/cnn_rss_renderer.py index a93b491..0d8a0bd 100644 --- a/cnn_rss_renderer.py +++ b/cnn_rss_renderer.py @@ -20,17 +20,30 @@ class cnn_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer): return "cnn-details-%s" % (self.page_title) def munge_description(self, description): - description = re.sub('[Rr]ead full story for latest details.', '', description) + description = re.sub('[Rr]ead full story for latest details.', + '', + description) description = re.sub('<[^>]+>', '', description) return description + def find_image(self, item): + image = item.findtext('media:thumbnail') + if image is not None: + image_url = image.get('url') + return image_url + return None + def should_use_https(self): return False def item_is_interesting_for_headlines(self, title, description, item): + if self.is_item_older_than_n_days(item, 7): + return False return re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None def item_is_interesting_for_article(self, title, description, item): + if self.is_item_older_than_n_days(item, 7): + return False return (re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None and len(description) >= 65) diff --git a/generic_news_rss_renderer.py b/generic_news_rss_renderer.py index ec7a7a5..21f9afe 100644 --- a/generic_news_rss_renderer.py +++ b/generic_news_rss_renderer.py @@ -1,3 +1,5 @@ +import datetime +from dateutil.parser import parse import file_writer import grab_bag import renderer @@ -10,7 +12,8 @@ import xml.etree.ElementTree as ET class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): def __init__(self, name_to_timeout_dict, feed_site, feed_uris, page_title): - super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict, False) + super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict, + False) self.debug = 1 self.feed_site = feed_site self.feed_uris = feed_uris @@ -50,14 +53,31 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): def find_link(self, item): return item.findtext('link') + def munge_link(self, link): + return link + def find_image(self, item): return item.findtext('image') + def munge_image(self, image): + return image + def item_is_interesting_for_headlines(self, title, description, item): - pass + return True + + def is_item_older_than_n_days(self, item, n): + pubdate = item.findtext('pubDate') + if pubdate is not None: + pubdate = parse(pubdate) + tzinfo = pubdate.tzinfo + now = datetime.datetime.now(tzinfo) + delta = (now - pubdate).total_seconds() / (60 * 60 * 24) + if (delta > n): + return True + return False def item_is_interesting_for_article(self, title, description, item): - pass + return True def periodic_render(self, key): if key == "Fetch News": @@ -132,8 +152,12 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): description = item.findtext('description') if description is not None: description = self.munge_description(description) + image = self.find_image(item) + if image is not None: + image = self.munge_image(image) link = item.findtext('link') - image = item.findtext('image') + if link is not None: + link = self.munge_link(link) if (title is None or not self.item_is_interesting_for_headlines(title, @@ -155,11 +179,18 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): font-size:34pt; -webkit-column-break-inside:avoid;">""" if image is not None: - blurb += u'' % image - blurb += u'

%s' % title + blurb += u'' + + if link is None: + blurb += u'

%s' % title + else: + blurb += u'

%s' % (link, title) if (description is not None and - self.item_is_interesting_for_article(title, description, item)): + self.item_is_interesting_for_article(title, + description, + item)): longblurb = blurb longblurb += u"
" longblurb += description diff --git a/mynorthwest_rss_renderer.py b/mynorthwest_rss_renderer.py index 38bcd28..fd7a6a7 100644 --- a/mynorthwest_rss_renderer.py +++ b/mynorthwest_rss_renderer.py @@ -18,14 +18,21 @@ class mynorthwest_rss_renderer(generic_news_rss_renderer.generic_news_rss_render def get_details_page_prefix(self): return "mynorthwest-details-%s" % (self.page_title) + def find_image(self, item): + image = item.findtext('media:content') + if image is not None: + image_url = image.get('url') + return image_url + return None + def should_use_https(self): return True def item_is_interesting_for_headlines(self, title, description, item): - return True + return not self.is_item_older_than_n_days(item, 10) def item_is_interesting_for_article(self, title, description, item): - return True + return not self.is_item_older_than_n_days(item, 10) # Test #x = mynorthwest_rss_renderer( diff --git a/seattletimes_rss_renderer.py b/seattletimes_rss_renderer.py index c8d12ce..4d02008 100644 --- a/seattletimes_rss_renderer.py +++ b/seattletimes_rss_renderer.py @@ -26,8 +26,6 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer): feed_site, feed_uris, page_title) - self.oldest = datetime.datetime.now() - datetime.timedelta(14) - self.debug_print("oldest story we'll keep: %s" % self.oldest) def debug_prefix(self): return "seattletimes" @@ -45,6 +43,8 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer): if item.tag != "item": self.debug_print("Item.tag isn't item?!") return False + if self.is_item_older_than_n_days(item, 14): + return False details = {} for detail in item.getchildren(): @@ -79,6 +79,8 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer): return True def item_is_interesting_for_article(self, title, description, item): + if self.is_item_older_than_n_days(item, 14): + return False return len(description) >= 65 #x = seattletimes_rss_renderer({"Test", 123}, diff --git a/wsj_rss_renderer.py b/wsj_rss_renderer.py index 8e2b0cc..aa56fdd 100644 --- a/wsj_rss_renderer.py +++ b/wsj_rss_renderer.py @@ -18,14 +18,25 @@ class wsj_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer): def get_details_page_prefix(self): return "wsj-details-%s" % (self.page_title) + def find_image(self, item): + image = item.findtext('image') + if image is not None: + url = image.get('url') + return url + return None + def should_use_https(self): return True def item_is_interesting_for_headlines(self, title, description, item): + if self.is_item_older_than_n_days(item, 7): + return False return ("WSJ.com" not in title and "WSJ.com" not in description) def item_is_interesting_for_article(self, title, description, item): + if self.is_item_older_than_n_days(item, 7): + return False return ("WSJ.com" not in title and "WSJ.com" not in description)