Add a Google News RSS-based renderer. Minor improvements to all
author Scott Gasch <[email protected]>
Wed, 8 Jul 2020 00:39:16 +0000 (17:39 -0700)
committer Scott Gasch <[email protected]>
Wed, 8 Jul 2020 00:39:16 +0000 (17:39 -0700)
RSS renderers.

bellevue_reporter_rss_renderer.py
cnn_rss_renderer.py
google_news_rss_renderer.py [new file with mode: 0644]
mynorthwest_rss_renderer.py
renderer.py
renderer_catalog.py
seattletimes_rss_renderer.py
wsj_rss_renderer.py

index c94bbc009daa5ff40c8edc31eddbb621275e5f0d..b71a34bafcdc60d80957da7ed2c01e9d8084cca8 100644 (file)
@@ -30,10 +30,16 @@ class bellevue_reporter_rss_renderer(gnrss.generic_news_rss_renderer):
         return description
 
     def item_is_interesting_for_headlines(self, title, description, item):
-        return not self.is_item_older_than_n_days(item, 10)
+        if self.is_item_older_than_n_days(item, 10):
+            self.debug_print("%s: is too old!" % title)
+            return False
+        return True
 
     def item_is_interesting_for_article(self, title, description, item):
-        return not self.is_item_older_than_n_days(item, 10)
+        if self.is_item_older_than_n_days(item, 10):
+            self.debug_print("%s: is too old!" % title)
+            return False
+        return True
 
 # Test
 #x = bellevue_reporter_rss_renderer(
index 0d8a0bd9b2d5af00e5d346e96c89b4b4814231b5..413b58a81da19129a75ce60547a2d996e6013b39 100644 (file)
@@ -37,12 +37,14 @@ class cnn_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer):
         return False
 
     def item_is_interesting_for_headlines(self, title, description, item):
-        if self.is_item_older_than_n_days(item, 7):
+        if self.is_item_older_than_n_days(item, 14):
+            self.debug_print("%s: is too old!" % title)
             return False
         return re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None
 
     def item_is_interesting_for_article(self, title, description, item):
         if self.is_item_older_than_n_days(item, 7):
+            self.debug_print("%s: is too old!" % title)
             return False
         return (re.search(r'[Cc][Nn][Nn][A-Za-z]*\.com', title) is None and
                 len(description) >= 65)
diff --git a/google_news_rss_renderer.py b/google_news_rss_renderer.py
new file mode 100644 (file)
index 0000000..334f10d
--- /dev/null
@@ -0,0 +1,57 @@
+from bs4 import BeautifulSoup
+import generic_news_rss_renderer
+import re
+
+class google_news_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer):
+    def __init__(self, name_to_timeout_dict, feed_site, feed_uris, page_title):
+        super(google_news_rss_renderer, self).__init__(
+            name_to_timeout_dict,
+            feed_site,
+            feed_uris,
+            page_title)
+        self.debug = 1
+
+    def debug_prefix(self):
+        return "google-news"
+
+    def get_headlines_page_prefix(self):
+        return "google-news"
+
+    def get_details_page_prefix(self):
+        return "google-news-details"
+
+    def find_description(self, item):
+        descr = item.findtext('description')
+        source = item.findtext('source')
+        if source is not None:
+            descr = descr + " (%s)" % source
+        return descr
+
+    def munge_description(self, description):
+        soup = BeautifulSoup(description)
+        for a in soup.findAll('a'):
+            del a['href']
+        return str(soup)
+
+    def find_image(self, item):
+        return None
+
+    def should_use_https(self):
+        return True
+
+    def item_is_interesting_for_headlines(self, title, description, item):
+        return not self.is_item_older_than_n_days(item, 2)
+
+    def item_is_interesting_for_article(self, title, description, item):
+        return not self.is_item_older_than_n_days(item, 2)
+
+# Test
+#x = google_news_rss_renderer(
+#    {"Fetch News" : 1,
+#     "Shuffle News" : 1},
+#    "news.google.com",
+#    [ "/rss?hl=en-US&gl=US&ceid=US:en" ],
+#    "Test" )
+#if x.fetch_news() == 0:
+#    print("Error fetching news, no items fetched.")
+#x.shuffle_news()
index fd7a6a795fd348bb2a4f9e43e30605cc1eaaa924..cfd3cf254e9bbb4968fa53988d40b6375890d6ea 100644 (file)
@@ -29,10 +29,16 @@ class mynorthwest_rss_renderer(generic_news_rss_renderer.generic_news_rss_render
         return True
 
     def item_is_interesting_for_headlines(self, title, description, item):
-        return not self.is_item_older_than_n_days(item, 10)
+        if self.is_item_older_than_n_days(item, 10):
+            self.debug_print("%s: is too old!" % title)
+            return False
+        return True
 
     def item_is_interesting_for_article(self, title, description, item):
-        return not self.is_item_older_than_n_days(item, 10)
+        if self.is_item_older_than_n_days(item, 10):
+            self.debug_print("%s: is too old!" % title)
+            return False
+        return True
 
 # Test
 #x = mynorthwest_rss_renderer(
index bfd6a9012267e202f0004c68ee543290b3765c97..e5fbbaa2d3cd59f278baa4bc369fbfa3705bc520 100644 (file)
@@ -39,8 +39,8 @@ class abstaining_renderer(renderer):
                 tries_per_key[key] = 0
 
             if tries_per_key[key] >= 3:
-                print('renderer: Too many failures/retries for "%s.%s", ' +
-                      ', giving up for now' % (self.get_name(), key))
+                print('renderer: Too many failures for "%s.%s", giving up' % (
+                    self.get_name(), key))
                 keys_to_skip.add(key)
             else:
                 msg = 'renderer: executing "%s.%s"' % (self.get_name(), key)
index 3c2fb1e2666c7380a7aa73bed20fe4be577e3fc3..738b4d15d77c6153ccc44e652f4472803e496f9e 100644 (file)
@@ -3,6 +3,7 @@ import constants
 import cnn_rss_renderer
 import gdata_oauth
 import gcal_renderer
+import google_news_rss_renderer
 import gkeep_renderer
 import health_renderer
 import local_photos_mirror_renderer
@@ -58,15 +59,6 @@ __registry = [
                      "mynorthwest.com",
                      [ "/feed/" ],
                      "MyNorthwest News" ),
-                 cnn_rss_renderer.cnn_rss_renderer(
-                     {"Fetch News" : (hours * 1),
-                      "Shuffle News" : (always)},
-                     "rss.cnn.com",
-                     [ "/rss/money_latest.rss",
-                       "/rss/money_mostpopular.rss",
-                       "/rss/money_news_economy.rss",
-                       "/rss/money_news_companies.rss" ],
-                     "CNNMoney" ),
                  cnn_rss_renderer.cnn_rss_renderer(
                      {"Fetch News" : (hours * 1),
                       "Shuffle News" : (always)},
@@ -95,6 +87,12 @@ __registry = [
                      [ "/rss/RSSMarketsMain.xml",
                        "/rss/WSJcomUSBusiness.xml"],
                      "WSJBusiness" ),
+                 google_news_rss_renderer.google_news_rss_renderer(
+                     {"Fetch News" : (minutes * 30),
+                      "Shuffle News" : (always)},
+                      "news.google.com",
+                      [ "/rss?hl=en-US&gl=US&ceid=US:en" ],
+                     "Google News" ),
                  health_renderer.periodic_health_renderer(
                      {"Update Perioidic Job Health" : (seconds * 45)}),
                  stock_renderer.stock_quote_renderer(
index 4d02008b7a2aac075b70d4ea28c2a28d0e5c9ae6..fae27b1260bfb17ac0f9988dec27b5aec915a946 100644 (file)
@@ -44,6 +44,7 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer):
             self.debug_print("Item.tag isn't item?!")
             return False
         if self.is_item_older_than_n_days(item, 14):
+            self.debug_print("%s: is too old!" % title)
             return False
 
         details = {}
@@ -65,21 +66,11 @@ class seattletimes_rss_renderer(gnrss.generic_news_rss_renderer):
                 interesting = True
         if not interesting:
             return False
-
-        if 'enclosure' in details:
-            if 'pubDate' in details:
-                x = details['pubDate']
-                x = x.rsplit(' ', 1)[0]
-                # Fri, 13 Nov 2015 10:07:00
-                dt = datetime.datetime.strptime(x, '%a, %d %b %Y %H:%M:%S')
-                if dt < self.oldest:
-                    self.debug_print("%s is too old." % (
-                        details["pubDate"]))
-                    return False
         return True
 
     def item_is_interesting_for_article(self, title, description, item):
         if self.is_item_older_than_n_days(item, 14):
+            self.debug_print("%s: is too old!" % title)
             return False
         return len(description) >= 65
 
index aa56fddad51abc4e68a2961e23db93a5d7e1f3b9..a8ccf29c871e54dd4c09a915e0796e539710ed4e 100644 (file)
@@ -30,12 +30,14 @@ class wsj_rss_renderer(generic_news_rss_renderer.generic_news_rss_renderer):
 
     def item_is_interesting_for_headlines(self, title, description, item):
         if self.is_item_older_than_n_days(item, 7):
+            self.debug_print("%s: is too old!" % title)
             return False
         return ("WSJ.com" not in title and
                 "WSJ.com" not in description)
 
     def item_is_interesting_for_article(self, title, description, item):
         if self.is_item_older_than_n_days(item, 7):
+            self.debug_print("%s: is too old!" % title)
             return False
         return ("WSJ.com" not in title and
                 "WSJ.com" not in description)