From 0e2601682b4629eb425f2d20a3c4ca3d592d3cbe Mon Sep 17 00:00:00 2001 From: Scott Gasch Date: Tue, 7 Jul 2020 17:39:16 -0700 Subject: [PATCH] Add a Google News RSS-based renderer. Minor improvements to all RSS renderers. --- bellevue_reporter_rss_renderer.py | 10 ++++-- cnn_rss_renderer.py | 4 ++- google_news_rss_renderer.py | 57 +++++++++++++++++++++++++++++++ mynorthwest_rss_renderer.py | 10 ++++-- renderer.py | 4 +-- renderer_catalog.py | 16 ++++----- seattletimes_rss_renderer.py | 13 ++----- wsj_rss_renderer.py | 2 ++ 8 files changed, 89 insertions(+), 27 deletions(-) create mode 100644 google_news_rss_renderer.py diff --git a/bellevue_reporter_rss_renderer.py b/bellevue_reporter_rss_renderer.py index c94bbc0..b71a34b 100644 --- a/bellevue_reporter_rss_renderer.py +++ b/bellevue_reporter_rss_renderer.py @@ -30,10 +30,16 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer): return description def item_is_interesting_for_headlines(self, title, description, item): - return not self.is_item_older_than_n_days(item, 10) + if self.is_item_older_than_n_days(item, 10): + self.debug_print("%s: is too old!" % title) + return False + return True def item_is_interesting_for_article(self, title, description, item): - return not self.is_item_older_than_n_days(item, 10) + if self.is_item_older_than_n_days(item, 10): + self.debug_print("%s: is too old!" % title) + return False + return True # Test #x = bellevue_reporter_rss_renderer( diff --git a/cnn_rss_renderer.py b/cnn_rss_renderer.py index 0d8a0bd..413b58a 100644 --- a/cnn_rss_renderer.py +++ b/cnn_rss_renderer.py @@ -37,12 +37,14 @@ class cnn_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer): return False def item_is_interesting_for_headlines(self, title, description, item): - if self.is_item_older_than_n_days(item, 7): + if self.is_item_older_than_n_days(item, 14): + self.debug_print("%s: is too old!" % title) return False return re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None def item_is_interesting_for_article(self, title, description, item): if self.is_item_older_than_n_days(item, 7): + self.debug_print("%s: is too old!" % title) return False return (re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None and len(description) >= 65) diff --git a/google_news_rss_renderer.py b/google_news_rss_renderer.py new file mode 100644 index 0000000..334f10d --- /dev/null +++ b/google_news_rss_renderer.py @@ -0,0 +1,57 @@ +from bs4 import BeautifulSoup +import generic_news_rss_renderer +import re + +class google_news_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer): + def __init__(self, name_to_timeout_dict, feed_site, feed_uris, page_title): + super(google_news_rss_renderer, self).__init__( + name_to_timeout_dict, + feed_site, + feed_uris, + page_title) + self.debug = 1 + + def debug_prefix(self): + return "google-news" + + def get_headlines_page_prefix(self): + return "google-news" + + def get_details_page_prefix(self): + return "google-news-details" + + def find_description(self, item): + descr = item.findtext('description') + source = item.findtext('source') + if source is not None: + descr = descr + " (%s)" % source + return descr + + def munge_description(self, description): + soup = BeautifulSoup(description) + for a in soup.findAll('a'): + del a['href'] + return str(soup) + + def find_image(self, item): + return None + + def should_use_https(self): + return True + + def item_is_interesting_for_headlines(self, title, description, item): + return not self.is_item_older_than_n_days(item, 2) + + def item_is_interesting_for_article(self, title, description, item): + return not self.is_item_older_than_n_days(item, 2) + +# Test +#x = google_news_rss_renderer( +# {"Fetch News" : 1, +# "Shuffle News" : 1}, +# "news.google.com", +# [ "/rss?hl=en-US&gl=US&ceid=US:en" ], +# "Test" ) +#if x.fetch_news() == 0: +# print("Error fetching news, no items fetched.") +#x.shuffle_news() diff --git a/mynorthwest_rss_renderer.py b/mynorthwest_rss_renderer.py index fd7a6a7..cfd3cf2 100644 --- a/mynorthwest_rss_renderer.py +++ b/mynorthwest_rss_renderer.py @@ -29,10 +29,16 @@ class mynorthwest_rss_renderer(generic_news_rss_renderer.generic_news_rss_render return True def item_is_interesting_for_headlines(self, title, description, item): - return not self.is_item_older_than_n_days(item, 10) + if self.is_item_older_than_n_days(item, 10): + self.debug_print("%s: is too old!" % title) + return False + return True def item_is_interesting_for_article(self, title, description, item): - return not self.is_item_older_than_n_days(item, 10) + if self.is_item_older_than_n_days(item, 10): + self.debug_print("%s: is too old!" % title) + return False + return True # Test #x = mynorthwest_rss_renderer( diff --git a/renderer.py b/renderer.py index bfd6a90..e5fbbaa 100644 --- a/renderer.py +++ b/renderer.py @@ -39,8 +39,8 @@ class abstaining_renderer(renderer): tries_per_key[key] = 0 if tries_per_key[key] >= 3: - print('renderer: Too many failures/retries for "%s.%s", ' + - ', giving up for now' % (self.get_name(), key)) + print('renderer: Too many failures for "%s.%s", giving up' % ( + self.get_name(), key)) keys_to_skip.add(key) else: msg = 'renderer: executing "%s.%s"' % (self.get_name(), key) diff --git a/renderer_catalog.py b/renderer_catalog.py index 3c2fb1e..738b4d1 100644 --- a/renderer_catalog.py +++ b/renderer_catalog.py @@ -3,6 +3,7 @@ import constants import cnn_rss_renderer import gdata_oauth import gcal_renderer +import google_news_rss_renderer import gkeep_renderer import health_renderer import local_photos_mirror_renderer @@ -58,15 +59,6 @@ __registry = [ "mynorthwest.com", [ "/feed/" ], "MyNorthwest News" ), - cnn_rss_renderer.cnn_rss_renderer( - {"Fetch News" : (hours * 1), - "Shuffle News" : (always)}, - "rss.cnn.com", - [ "/rss/money_latest.rss", - "/rss/money_mostpopular.rss", - "/rss/money_news_economy.rss", - "/rss/money_news_companies.rss" ], - "CNNMoney" ), cnn_rss_renderer.cnn_rss_renderer( {"Fetch News" : (hours * 1), "Shuffle News" : (always)}, @@ -95,6 +87,12 @@ __registry = [ [ "/rss/RSSMarketsMain.xml", "/rss/WSJcomUSBusiness.xml"], "WSJBusiness" ), + google_news_rss_renderer.google_news_rss_renderer( + {"Fetch News" : (minutes * 30), + "Shuffle News" : (always)}, + "news.google.com", + [ "/rss?hl=en-US&gl=US&ceid=US:en" ], + "Google News" ), health_renderer.periodic_health_renderer( {"Update Perioidic Job Health" : (seconds * 45)}), stock_renderer.stock_quote_renderer( diff --git a/seattletimes_rss_renderer.py b/seattletimes_rss_renderer.py index 4d02008..fae27b1 100644 --- a/seattletimes_rss_renderer.py +++ b/seattletimes_rss_renderer.py @@ -44,6 +44,7 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer): self.debug_print("Item.tag isn't item?!") return False if self.is_item_older_than_n_days(item, 14): + self.debug_print("%s: is too old!" % title) return False details = {} @@ -65,21 +66,11 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer): interesting = True if not interesting: return False - - if 'enclosure' in details: - if 'pubDate' in details: - x = details['pubDate'] - x = x.rsplit(' ', 1)[0] - # Fri, 13 Nov 2015 10:07:00 - dt = datetime.datetime.strptime(x, '%a, %d %b %Y %H:%M:%S') - if dt < self.oldest: - self.debug_print("%s is too old." % ( - details["pubDate"])) - return False return True def item_is_interesting_for_article(self, title, description, item): if self.is_item_older_than_n_days(item, 14): + self.debug_print("%s: is too old!" % title) return False return len(description) >= 65 diff --git a/wsj_rss_renderer.py b/wsj_rss_renderer.py index aa56fdd..a8ccf29 100644 --- a/wsj_rss_renderer.py +++ b/wsj_rss_renderer.py @@ -30,12 +30,14 @@ class wsj_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer): def item_is_interesting_for_headlines(self, title, description, item): if self.is_item_older_than_n_days(item, 7): + self.debug_print("%s: is too old!" % title) return False return ("WSJ.com" not in title and "WSJ.com" not in description) def item_is_interesting_for_article(self, title, description, item): if self.is_item_older_than_n_days(item, 7): + self.debug_print("%s: is too old!" % title) return False return ("WSJ.com" not in title and "WSJ.com" not in description) -- 2.45.2