More changes related to running on the new kiosk.house.

[kiosk.git] / generic_news_rss_renderer.py
diff --git a/generic_news_rss_renderer.py b/generic_news_rss_renderer.py

index 34c48210c9ce4b3069710e27db9d6ecf783a0113..149f8acb3aa9f163d195d42deec8e82b442da34f 100644 (file)
--- a/generic_news_rss_renderer.py
+++ b/generic_news_rss_renderer.py
@@ -6,6 +6,8 @@ from dateutil.parser import parse
  import http.client
  import random
  import re
+import sys
+import traceback
  from typing import Dict, List, Optional, Union
  import xml.etree.ElementTree as ET
  
@@ -31,7 +33,7 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
          self.page_title = page_title
          self.news = grab_bag.grab_bag()
          self.details = grab_bag.grab_bag()
-        self.filter = profanity_filter.profanity_filter()
+        self.filter = profanity_filter.ProfanityFilter()
  
      @abstractmethod
      def debug_prefix(self) -> str:
@@ -61,13 +63,17 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
      def find_title(self, item: ET.Element) -> Optional[str]:
          return item.findtext("title")
  
-    def munge_title(self, title: str) -> str:
+    def munge_title(self, title: str, item: ET.Element) -> str:
          return title
  
      def find_description(self, item: ET.Element) -> Optional[str]:
          return item.findtext("description")
  
-    def munge_description(self, description: str) -> str:
+    def munge_description(
+            self,
+            description: str,
+            item: ET.Element
+    ) -> str:
          description = re.sub("<[^>]+>", "", description)
          return description
  
@@ -94,6 +100,12 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
      ) -> bool:
          return True
  
+    def do_headlines(self) -> bool:
+        return True
+
+    def do_details(self) -> bool:
+        return True
+
      def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
          pubdate = self.find_pubdate(item)
          if pubdate is None:
@@ -118,73 +130,75 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
              raise Exception
  
      def shuffle_news(self) -> bool:
-        headlines = page_builder.page_builder()
-        headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
-        headlines.set_title("%s" % self.page_title)
-        subset = self.news.subset(4)
-        if subset is None:
-            self.debug_print("Not enough messages to choose from.")
-            return False
-        for msg in subset:
-            headlines.add_item(msg)
-        headlines.set_custom_html(
-            """
-<STYLE>
-a:link {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-a:visited {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-a:active {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-</STYLE>"""
-        )
-        _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
-        with file_writer.file_writer(_) as f:
-            headlines.render_html(f)
-
-        details = page_builder.page_builder()
-        details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
-        details.set_custom_html(
-            """
-<STYLE>
-a:link {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-a:visited {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-a:active {
-  color: black;
-  text-decoration: none;
-  font-weight: bold;
-}
-</STYLE>"""
-        )
-        details.set_title(f"{self.page_title}")
-        subset = self.details.subset(1)
-        if subset is None:
-            self.debug_print("Not enough details to choose from.")
-            return False
-        for msg in subset:
-            blurb = msg
-            blurb += "</TD>"
-            details.add_item(blurb)
-        _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
-        with file_writer.file_writer(_) as g:
-            details.render_html(g)
+        if self.do_headlines():
+            headlines = page_builder.page_builder()
+            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
+            headlines.set_title("%s" % self.page_title)
+            subset = self.news.subset(4)
+            if subset is None:
+                self.debug_print("Not enough messages to choose from.")
+                return False
+            for msg in subset:
+                headlines.add_item(msg)
+            headlines.set_custom_html(
+                """
+    <STYLE>
+    a:link {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    a:visited {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    a:active {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    </STYLE>"""
+            )
+            _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
+            with file_writer.file_writer(_) as f:
+                headlines.render_html(f)
+
+        if self.do_details():
+            details = page_builder.page_builder()
+            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
+            details.set_custom_html(
+                """
+    <STYLE>
+    a:link {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    a:visited {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    a:active {
+      color: black;
+      text-decoration: none;
+      font-weight: bold;
+    }
+    </STYLE>"""
+            )
+            details.set_title(f"{self.page_title}")
+            subset = self.details.subset(1)
+            if subset is None:
+                self.debug_print("Not enough details to choose from.")
+                return False
+            for msg in subset:
+                blurb = msg
+                blurb += "</TD>"
+                details.add_item(blurb)
+            _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
+            with file_writer.file_writer(_) as g:
+                details.render_html(g)
          return True
  
      def fetch_news(self) -> bool:
@@ -197,10 +211,10 @@ a:active {
          for uri in self.feed_uris:
              if self.should_use_https():
                  self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
-                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
+                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
              else:
                  self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
-                self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
+                self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
              assert(self.conn is not None)
              self.conn.request(
                  "GET",
@@ -208,14 +222,17 @@ a:active {
                  None,
                  {
                      "Accept": "*/*",
-                    "Cache-control": "max-age=59",
-                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+#                    "Cache-control": "max-age=50",
+#                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
                  },
              )
              try:
                  response = self.conn.getresponse()
-            except:
-                print("Exception in generic RSS renderer HTTP connection")
+            except Exception as e:
+                traceback.print_exc(file=sys.stdout)
+                print(
+                    f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
+                )
                  return False
  
              if response.status != 200:
@@ -227,13 +244,14 @@ a:active {
  
              rss = ET.fromstring(response.read())
              channel = rss[0]
+            title_filter = set()
              for item in channel.getchildren():
                  title = self.find_title(item)
-                if title is not None:
-                    title = self.munge_title(title)
                  description = item.findtext("description")
+                if title is not None:
+                    title = self.munge_title(title, item)
                  if description is not None:
-                    description = self.munge_description(description)
+                    description = self.munge_description(description, item)
                  else:
                      description = ""
                  image = self.find_image(item)
@@ -250,15 +268,19 @@ a:active {
                      continue
  
                  if self.should_profanity_filter() and (
-                    self.filter.contains_bad_words(title)
-                    or self.filter.contains_bad_words(description)
+                    self.filter.contains_bad_word(title)
+                    or self.filter.contains_bad_word(description)
                  ):
                      self.debug_print(f'Found bad words in item "{title}"')
                      continue
  
+                if title in title_filter:
+                    self.debug_print(f'Already saw title {title}, skipping.')
+                title_filter.add(title)
+
                  blurb = """<DIV style="padding:8px;
-                                 font-size:34pt;
-                                 -webkit-column-break-inside:avoid;">"""
+                                font-size:34pt;
+                                -webkit-column-break-inside:avoid;">"""
                  if image is not None:
                      blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                      blurb += 'style="padding:8px;">'
@@ -274,9 +296,7 @@ a:active {
                      ts = parse(pubdate)
                      blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'
  
-                if description is not None and self.item_is_interesting_for_article(
-                    title, description, item
-                ):
+                if self.item_is_interesting_for_article(title, description, item):
                      longblurb = blurb
                      longblurb += "<BR>"
                      longblurb += description