generic_news_rss_renderer.py

   1 #!/usr/bin/env python3
   2
   3 from abc import abstractmethod
   4 import datetime
   5 from dateutil.parser import parse
   6 import http.client
   7 import random
   8 import re
   9 from typing import Dict, List, Optional, Union
  10 import xml.etree.ElementTree as ET
  11
  12 import file_writer
  13 import grab_bag
  14 import renderer
  15 import page_builder
  16 import profanity_filter
  17
  18
  19 class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
  20     def __init__(
  21         self,
  22         name_to_timeout_dict: Dict[str, int],
  23         feed_site: str,
  24         feed_uris: List[str],
  25         page_title: str,
  26     ):
  27         super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict, False)
  28         self.debug = True
  29         self.feed_site = feed_site
  30         self.feed_uris = feed_uris
  31         self.page_title = page_title
  32         self.news = grab_bag.grab_bag()
  33         self.details = grab_bag.grab_bag()
  34         self.filter = profanity_filter.ProfanityFilter()
  35
  36     @abstractmethod
  37     def debug_prefix(self) -> str:
  38         pass
  39
  40     @abstractmethod
  41     def get_headlines_page_prefix(self) -> str:
  42         pass
  43
  44     @abstractmethod
  45     def get_details_page_prefix(self) -> str:
  46         pass
  47
  48     def get_headlines_page_priority(self) -> str:
  49         return "4"
  50
  51     def get_details_page_priority(self) -> str:
  52         return "6"
  53
  54     @abstractmethod
  55     def should_use_https(self) -> bool:
  56         pass
  57
  58     def should_profanity_filter(self) -> bool:
  59         return False
  60
  61     def find_title(self, item: ET.Element) -> Optional[str]:
  62         return item.findtext("title")
  63
  64     def munge_title(self, title: str, item: ET.Element) -> str:
  65         return title
  66
  67     def find_description(self, item: ET.Element) -> Optional[str]:
  68         return item.findtext("description")
  69
  70     def munge_description(
  71             self,
  72             description: str,
  73             item: ET.Element
  74     ) -> str:
  75         description = re.sub("<[^>]+>", "", description)
  76         return description
  77
  78     def find_link(self, item: ET.Element) -> Optional[str]:
  79         return item.findtext("link")
  80
  81     def munge_link(self, link: str) -> str:
  82         return link
  83
  84     def find_image(self, item: ET.Element) -> Optional[str]:
  85         return item.findtext("image")
  86
  87     def munge_image(self, image: str) -> str:
  88         return image
  89
  90     def find_pubdate(self, item: ET.Element) -> Optional[str]:
  91         return item.findtext("pubDate")
  92
  93     def munge_pubdate(self, pubdate: str) -> str:
  94         return pubdate
  95
  96     def item_is_interesting_for_headlines(
  97         self, title: str, description: str, item: ET.Element
  98     ) -> bool:
  99         return True
 100
 101     def do_headlines(self) -> bool:
 102         return True
 103
 104     def do_details(self) -> bool:
 105         return True
 106
 107     def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
 108         pubdate = self.find_pubdate(item)
 109         if pubdate is None:
 110             return False
 111         pubdatetime = parse(pubdate)
 112         tzinfo = pubdatetime.tzinfo
 113         now = datetime.datetime.now(tzinfo)
 114         delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
 115         return delta > n
 116
 117     def item_is_interesting_for_article(
 118         self, title: str, description: str, item: ET.Element
 119     ) -> bool:
 120         return True
 121
 122     def periodic_render(self, key: str) -> bool:
 123         if key == "Fetch News":
 124             return self.fetch_news()
 125         elif key == "Shuffle News":
 126             return self.shuffle_news()
 127         else:
 128             raise Exception
 129
 130     def shuffle_news(self) -> bool:
 131         if self.do_headlines():
 132             headlines = page_builder.page_builder()
 133             headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
 134             headlines.set_title("%s" % self.page_title)
 135             subset = self.news.subset(4)
 136             if subset is None:
 137                 self.debug_print("Not enough messages to choose from.")
 138                 return False
 139             for msg in subset:
 140                 headlines.add_item(msg)
 141             headlines.set_custom_html(
 142                 """
 143     <STYLE>
 144     a:link {
 145       color: black;
 146       text-decoration: none;
 147       font-weight: bold;
 148     }
 149     a:visited {
 150       color: black;
 151       text-decoration: none;
 152       font-weight: bold;
 153     }
 154     a:active {
 155       color: black;
 156       text-decoration: none;
 157       font-weight: bold;
 158     }
 159     </STYLE>"""
 160             )
 161             _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
 162             with file_writer.file_writer(_) as f:
 163                 headlines.render_html(f)
 164
 165         if self.do_details():
 166             details = page_builder.page_builder()
 167             details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
 168             details.set_custom_html(
 169                 """
 170     <STYLE>
 171     a:link {
 172       color: black;
 173       text-decoration: none;
 174       font-weight: bold;
 175     }
 176     a:visited {
 177       color: black;
 178       text-decoration: none;
 179       font-weight: bold;
 180     }
 181     a:active {
 182       color: black;
 183       text-decoration: none;
 184       font-weight: bold;
 185     }
 186     </STYLE>"""
 187             )
 188             details.set_title(f"{self.page_title}")
 189             subset = self.details.subset(1)
 190             if subset is None:
 191                 self.debug_print("Not enough details to choose from.")
 192                 return False
 193             for msg in subset:
 194                 blurb = msg
 195                 blurb += "</TD>"
 196                 details.add_item(blurb)
 197             _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
 198             with file_writer.file_writer(_) as g:
 199                 details.render_html(g)
 200         return True
 201
 202     def fetch_news(self) -> bool:
 203         count = 0
 204         self.news.clear()
 205         self.details.clear()
 206         self.conn: Optional[Union[http.client.HTTPConnection,
 207                                   http.client.HTTPSConnection]] = None
 208
 209         for uri in self.feed_uris:
 210             if self.should_use_https():
 211                 self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
 212                 self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
 213             else:
 214                 self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
 215                 self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
 216             assert(self.conn is not None)
 217             self.conn.request(
 218                 "GET",
 219                 uri,
 220                 None,
 221                 {
 222                     "Accept": "*/*",
 223                     "Cache-control": "max-age=59",
 224                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
 225                 },
 226             )
 227             try:
 228                 response = self.conn.getresponse()
 229             except Exception as e:
 230                 traceback.print_exc(file=sys.stdout)
 231                 print(
 232                     f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
 233                 )
 234                 return False
 235
 236             if response.status != 200:
 237                 print(
 238                     f"{self.page_title}: RSS fetch_news error, response: {response.status}"
 239                 )
 240                 self.debug_print(str(response.read()))
 241                 return False
 242
 243             rss = ET.fromstring(response.read())
 244             channel = rss[0]
 245             title_filter = set()
 246             for item in channel.getchildren():
 247                 title = self.find_title(item)
 248                 description = item.findtext("description")
 249                 if title is not None:
 250                     title = self.munge_title(title, item)
 251                 if description is not None:
 252                     description = self.munge_description(description, item)
 253                 else:
 254                     description = ""
 255                 image = self.find_image(item)
 256                 if image is not None:
 257                     image = self.munge_image(image)
 258                 link = item.findtext("link")
 259                 if link is not None:
 260                     link = self.munge_link(link)
 261
 262                 if title is None or not self.item_is_interesting_for_headlines(
 263                     title, description, item
 264                 ):
 265                     self.debug_print(f'Item "{title}" is not interesting')
 266                     continue
 267
 268                 if self.should_profanity_filter() and (
 269                     self.filter.contains_bad_word(title)
 270                     or self.filter.contains_bad_word(description)
 271                 ):
 272                     self.debug_print(f'Found bad words in item "{title}"')
 273                     continue
 274
 275                 if title in title_filter:
 276                     self.debug_print(f'Already saw title {title}, skipping.')
 277                 title_filter.add(title)
 278
 279                 blurb = """<DIV style="padding:8px;
 280                                 font-size:34pt;
 281                                 -webkit-column-break-inside:avoid;">"""
 282                 if image is not None:
 283                     blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
 284                     blurb += 'style="padding:8px;">'
 285
 286                 if link is None:
 287                     blurb += f"<P><B>{title}</B>"
 288                 else:
 289                     blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
 290
 291                 pubdate = self.find_pubdate(item)
 292                 if pubdate is not None:
 293                     pubdate = self.munge_pubdate(pubdate)
 294                     ts = parse(pubdate)
 295                     blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'
 296
 297                 if self.item_is_interesting_for_article(title, description, item):
 298                     longblurb = blurb
 299                     longblurb += "<BR>"
 300                     longblurb += description
 301                     longblurb += "</DIV>"
 302                     longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
 303                     self.details.add(longblurb)
 304                 blurb += "</DIV>"
 305                 self.news.add(blurb)
 306                 count += 1
 307         return count > 0