Fix gkeep renderer's f-strings.
[kiosk.git] / generic_news_rss_renderer.py
#!/usr/bin/env python3

from abc import abstractmethod
import datetime
from dateutil.parser import parse
import http.client
import random
import re
from typing import Dict, List, Optional
import xml.etree.ElementTree as ET

import file_writer
import grab_bag
import renderer
import page_builder
import profanity_filter


class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
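    """Base class for kiosk renderers that fetch and display RSS news.

    Pulls one or more RSS feed URIs from a single feed site, filters and
    munges each item, and renders the results into headline pages and a
    details page.  Subclasses implement the abstract hooks below to set
    page prefixes, choose HTTP vs. HTTPS, and tune filtering.
    """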
    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        super().__init__(name_to_timeout_dict, False)
        self.debug = True
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()
        self.details = grab_bag.grab_bag()
        self.filter = profanity_filter.profanity_filter()

    @abstractmethod
    def debug_prefix(self) -> str:
        pass

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        pass

    def get_headlines_page_priority(self) -> str:
        return "4"

    def get_details_page_priority(self) -> str:
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        pass

    def should_profanity_filter(self) -> bool:
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        return item.findtext("title")

    def munge_title(self, title: str) -> str:
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        return item.findtext("description")

    def munge_description(self, description: str) -> str:
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item: ET.Element) -> Optional[str]:
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        pubdatetime = parse(pubdate)
        tzinfo = pubdatetime.tzinfo
        now = datetime.datetime.now(tzinfo)
        delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return delta > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        return True

    def periodic_render(self, key: str) -> bool:
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            raise Exception(f"Unexpected operation: {key}")

    def shuffle_news(self) -> bool:
        headlines = page_builder.page_builder()
        headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
        headlines.set_title(self.page_title)
        subset = self.news.subset(4)
        if subset is None:
            self.debug_print("Not enough messages to choose from.")
            return False
        for msg in subset:
            headlines.add_item(msg)
        headlines.set_custom_html(
            """
<STYLE>
a:link {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:visited {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:active {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
</STYLE>"""
        )
        filename = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
        with file_writer.file_writer(filename) as f:
            headlines.render_html(f)

        details = page_builder.page_builder()
        details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
        details.set_custom_html(
            """
<STYLE>
a:link {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:visited {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:active {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
</STYLE>"""
        )
        details.set_title(self.page_title)
        subset = self.details.subset(1)
        if subset is None:
            self.debug_print("Not enough details to choose from.")
            return False
        for msg in subset:
            blurb = msg
            blurb += "</TD>"
            details.add_item(blurb)
        filename = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
        with file_writer.file_writer(filename) as g:
            details.render_html(g)
        return True

    def fetch_news(self) -> bool:
        count = 0
        self.news.clear()
        self.details.clear()

        for uri in self.feed_uris:
            if self.should_use_https():
                self.debug_print(f"Fetching: https://{self.feed_site}{uri}")
                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
            else:
                self.debug_print(f"Fetching: http://{self.feed_site}{uri}")
                self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
            self.conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=59",
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
                },
            )
            try:
                response = self.conn.getresponse()
            except Exception:
                print("Exception in generic RSS renderer HTTP connection")
                return False

            if response.status != 200:
                print(
                    f"{self.page_title}: RSS fetch_news error, response: {response.status}"
                )
                self.debug_print(response.read())
                return False

            rss = ET.fromstring(response.read())
            channel = rss[0]
            for item in channel:
                title = self.find_title(item)
                if title is not None:
                    title = self.munge_title(title)
                description = self.find_description(item)
                if description is not None:
                    description = self.munge_description(description)
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)

                if title is None or not self.item_is_interesting_for_headlines(
                    title, description, item
                ):
                    self.debug_print(f'Item "{title}" is not interesting')
                    continue

                if self.should_profanity_filter() and (
                    self.filter.contains_bad_words(title)
                    or self.filter.contains_bad_words(description)
                ):
                    self.debug_print(f'Found bad words in item "{title}"')
                    continue

                blurb = """<DIV style="padding:8px;
                                 font-size:34pt;
                                 -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                    blurb += 'style="padding:8px;">'

                if link is None:
                    blurb += f"<P><B>{title}</B>"
                else:
                    blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

                if description is not None and self.item_is_interesting_for_article(
                    title, description, item
                ):
                    longblurb = blurb
                    longblurb += "<BR>"
                    longblurb += description
                    longblurb += "</DIV>"
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                blurb += "</DIV>"
                self.news.add(blurb)
                count += 1
        return count > 0
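

# A minimal sketch (not part of the original module) of how a concrete
# subclass might fill in the abstract hooks above.  The class name, feed
# host, URI, refresh periods, and page prefixes below are hypothetical
# examples, not values taken from the real kiosk configuration.
#
# class example_news_rss_renderer(generic_news_rss_renderer):
#     def __init__(self):
#         super().__init__(
#             {"Fetch News": 3600, "Shuffle News": 600},  # key -> period (assumed seconds)
#             "rss.example.com",
#             ["/feed/topstories.xml"],
#             "Example News",
#         )
#
#     def debug_prefix(self) -> str:
#         return "example_news"
#
#     def get_headlines_page_prefix(self) -> str:
#         return "example-news"
#
#     def get_details_page_prefix(self) -> str:
#         return "example-news-details"
#
#     def should_use_https(self) -> bool:
#         return True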