generic_news_rss_renderer.py

   1 #!/usr/bin/env python3
   2
   3 from abc import abstractmethod
   4 import datetime
   5 from dateutil.parser import parse
   6 import http.client
   7 import random
   8 import re
   9 import sys
  10 import traceback
  11 from typing import Dict, List, Optional, Union
  12 import xml.etree.ElementTree as ET
  13
  14 import file_writer
  15 import grab_bag
  16 import renderer
  17 import page_builder
  18 import profanity_filter
  19
  20
  21 class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
  22     def __init__(
  23         self,
  24         name_to_timeout_dict: Dict[str, int],
  25         feed_site: str,
  26         feed_uris: List[str],
  27         page_title: str,
  28     ):
  29         super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict, False)
  30         self.debug = True
  31         self.feed_site = feed_site
  32         self.feed_uris = feed_uris
  33         self.page_title = page_title
  34         self.news = grab_bag.grab_bag()
  35         self.details = grab_bag.grab_bag()
  36         self.filter = profanity_filter.ProfanityFilter()
  37
  38     @abstractmethod
  39     def debug_prefix(self) -> str:
  40         pass
  41
  42     @abstractmethod
  43     def get_headlines_page_prefix(self) -> str:
  44         pass
  45
  46     @abstractmethod
  47     def get_details_page_prefix(self) -> str:
  48         pass
  49
  50     def get_headlines_page_priority(self) -> str:
  51         return "4"
  52
  53     def get_details_page_priority(self) -> str:
  54         return "6"
  55
  56     @abstractmethod
  57     def should_use_https(self) -> bool:
  58         pass
  59
  60     def should_profanity_filter(self) -> bool:
  61         return False
  62
  63     def find_title(self, item: ET.Element) -> Optional[str]:
  64         return item.findtext("title")
  65
  66     def munge_title(self, title: str, item: ET.Element) -> str:
  67         return title
  68
  69     def find_description(self, item: ET.Element) -> Optional[str]:
  70         return item.findtext("description")
  71
  72     def munge_description(
  73             self,
  74             description: str,
  75             item: ET.Element
  76     ) -> str:
  77         description = re.sub("<[^>]+>", "", description)
  78         return description
  79
  80     def find_link(self, item: ET.Element) -> Optional[str]:
  81         return item.findtext("link")
  82
  83     def munge_link(self, link: str) -> str:
  84         return link
  85
  86     def find_image(self, item: ET.Element) -> Optional[str]:
  87         return item.findtext("image")
  88
  89     def munge_image(self, image: str) -> str:
  90         return image
  91
  92     def find_pubdate(self, item: ET.Element) -> Optional[str]:
  93         return item.findtext("pubDate")
  94
  95     def munge_pubdate(self, pubdate: str) -> str:
  96         return pubdate
  97
  98     def item_is_interesting_for_headlines(
  99         self, title: str, description: str, item: ET.Element
 100     ) -> bool:
 101         return True
 102
 103     def do_headlines(self) -> bool:
 104         return True
 105
 106     def do_details(self) -> bool:
 107         return True
 108
 109     def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
 110         pubdate = self.find_pubdate(item)
 111         if pubdate is None:
 112             return False
 113         pubdatetime = parse(pubdate)
 114         tzinfo = pubdatetime.tzinfo
 115         now = datetime.datetime.now(tzinfo)
 116         delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
 117         return delta > n
 118
 119     def item_is_interesting_for_article(
 120         self, title: str, description: str, item: ET.Element
 121     ) -> bool:
 122         return True
 123
 124     def periodic_render(self, key: str) -> bool:
 125         if key == "Fetch News":
 126             return self.fetch_news()
 127         elif key == "Shuffle News":
 128             return self.shuffle_news()
 129         else:
 130             raise Exception
 131
 132     def shuffle_news(self) -> bool:
 133         if self.do_headlines():
 134             headlines = page_builder.page_builder()
 135             headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
 136             headlines.set_title("%s" % self.page_title)
 137             subset = self.news.subset(4)
 138             if subset is None:
 139                 self.debug_print("Not enough messages to choose from.")
 140                 return False
 141             for msg in subset:
 142                 headlines.add_item(msg)
 143             headlines.set_custom_html(
 144                 """
 145     <STYLE>
 146     a:link {
 147       color: black;
 148       text-decoration: none;
 149       font-weight: bold;
 150     }
 151     a:visited {
 152       color: black;
 153       text-decoration: none;
 154       font-weight: bold;
 155     }
 156     a:active {
 157       color: black;
 158       text-decoration: none;
 159       font-weight: bold;
 160     }
 161     </STYLE>"""
 162             )
 163             _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
 164             with file_writer.file_writer(_) as f:
 165                 headlines.render_html(f)
 166
 167         if self.do_details():
 168             details = page_builder.page_builder()
 169             details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
 170             details.set_custom_html(
 171                 """
 172     <STYLE>
 173     a:link {
 174       color: black;
 175       text-decoration: none;
 176       font-weight: bold;
 177     }
 178     a:visited {
 179       color: black;
 180       text-decoration: none;
 181       font-weight: bold;
 182     }
 183     a:active {
 184       color: black;
 185       text-decoration: none;
 186       font-weight: bold;
 187     }
 188     </STYLE>"""
 189             )
 190             details.set_title(f"{self.page_title}")
 191             subset = self.details.subset(1)
 192             if subset is None:
 193                 self.debug_print("Not enough details to choose from.")
 194                 return False
 195             for msg in subset:
 196                 blurb = msg
 197                 blurb += "</TD>"
 198                 details.add_item(blurb)
 199             _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
 200             with file_writer.file_writer(_) as g:
 201                 details.render_html(g)
 202         return True
 203
 204     def fetch_news(self) -> bool:
 205         count = 0
 206         self.news.clear()
 207         self.details.clear()
 208         self.conn: Optional[Union[http.client.HTTPConnection,
 209                                   http.client.HTTPSConnection]] = None
 210
 211         for uri in self.feed_uris:
 212             if self.should_use_https():
 213                 self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
 214                 self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
 215             else:
 216                 self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
 217                 self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
 218             assert(self.conn is not None)
 219             self.conn.request(
 220                 "GET",
 221                 uri,
 222                 None,
 223                 {
 224                     "Accept": "*/*",
 225 #                    "Cache-control": "max-age=50",
 226 #                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
 227                 },
 228             )
 229             try:
 230                 response = self.conn.getresponse()
 231             except Exception as e:
 232                 traceback.print_exc(file=sys.stdout)
 233                 print(
 234                     f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
 235                 )
 236                 return False
 237
 238             if response.status != 200:
 239                 print(
 240                     f"{self.page_title}: RSS fetch_news error, response: {response.status}"
 241                 )
 242                 self.debug_print(str(response.read()))
 243                 return False
 244
 245             rss = ET.fromstring(response.read())
 246             channel = rss[0]
 247             title_filter = set()
 248             for item in channel.getchildren():
 249                 title = self.find_title(item)
 250                 description = item.findtext("description")
 251                 if title is not None:
 252                     title = self.munge_title(title, item)
 253                 if description is not None:
 254                     description = self.munge_description(description, item)
 255                 else:
 256                     description = ""
 257                 image = self.find_image(item)
 258                 if image is not None:
 259                     image = self.munge_image(image)
 260                 link = item.findtext("link")
 261                 if link is not None:
 262                     link = self.munge_link(link)
 263
 264                 if title is None or not self.item_is_interesting_for_headlines(
 265                     title, description, item
 266                 ):
 267                     self.debug_print(f'Item "{title}" is not interesting')
 268                     continue
 269
 270                 if self.should_profanity_filter() and (
 271                     self.filter.contains_bad_word(title)
 272                     or self.filter.contains_bad_word(description)
 273                 ):
 274                     self.debug_print(f'Found bad words in item "{title}"')
 275                     continue
 276
 277                 if title in title_filter:
 278                     self.debug_print(f'Already saw title {title}, skipping.')
 279                 title_filter.add(title)
 280
 281                 blurb = """<DIV style="padding:8px;
 282                                 font-size:34pt;
 283                                 -webkit-column-break-inside:avoid;">"""
 284                 if image is not None:
 285                     blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
 286                     blurb += 'style="padding:8px;">'
 287
 288                 if link is None:
 289                     blurb += f"<P><B>{title}</B>"
 290                 else:
 291                     blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
 292
 293                 pubdate = self.find_pubdate(item)
 294                 if pubdate is not None:
 295                     pubdate = self.munge_pubdate(pubdate)
 296                     ts = parse(pubdate)
 297                     blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'
 298
 299                 if self.item_is_interesting_for_article(title, description, item):
 300                     longblurb = blurb
 301                     longblurb += "<BR>"
 302                     longblurb += description
 303                     longblurb += "</DIV>"
 304                     longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
 305                     self.details.add(longblurb)
 306                 blurb += "</DIV>"
 307                 self.news.add(blurb)
 308                 count += 1
 309         return count > 0