Somewhat large overhaul to move the kiosk towards using normal python
[kiosk.git] / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
3 from abc import abstractmethod
4 import datetime
5 from dateutil.parser import parse
6 import http.client
7 import logging
8 import re
9 from typing import Dict, List, Optional, Union
10 import xml.etree.ElementTree as ET
11
12 import file_writer
13 import grab_bag
14 import renderer
15 import page_builder
16 import profanity_filter
17
18
# Use the module name (not the file path) so this logger participates in the
# normal logging hierarchy and per-module logging configuration.
logger = logging.getLogger(__name__)
20
21
class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Abstract renderer that fetches RSS feeds from one site and renders
    two kiosk pages: a four-item headlines page and a one-item article
    details page.

    Subclasses must implement get_headlines_page_prefix,
    get_details_page_prefix and should_use_https, and may override the
    find_* / munge_* / item_is_interesting_* hooks to customize how feed
    items are parsed and filtered.
    """

    # Inline stylesheet shared by both generated pages: links render as
    # plain bold black text (no underline / color change on a kiosk).
    _LINK_STYLE = """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Args:
        name_to_timeout_dict: operation name -> refresh period, passed
            through to the base renderer (drives periodic_render keys).
        feed_site: hostname of the RSS server, e.g. "www.example.com".
        feed_uris: list of URI paths on feed_site to fetch.
        page_title: title rendered on both generated pages.
        """
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()      # headline blurbs (HTML snippets)
        self.details = grab_bag.grab_bag()   # longer article blurbs
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Filename prefix for the generated headlines page."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Filename prefix for the generated article details page."""
        pass

    def get_headlines_page_priority(self) -> str:
        """Kiosk display priority embedded in the headlines filename."""
        return "4"

    def get_details_page_priority(self) -> str:
        """Kiosk display priority embedded in the details filename."""
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """True if the feed site should be fetched over HTTPS."""
        pass

    def should_profanity_filter(self) -> bool:
        """True if items containing profanity should be dropped."""
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        """Extract the raw title from a feed <item>; override to customize."""
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        """Hook to rewrite a title before display; default is identity."""
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        """Extract the raw description from a feed <item>."""
        return item.findtext("description")

    def munge_description(
            self,
            description: str,
            item: ET.Element
    ) -> str:
        """Default munge: strip embedded HTML tags from the description."""
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item: ET.Element) -> Optional[str]:
        """Extract the article link from a feed <item>."""
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        """Hook to rewrite an article link; default is identity."""
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        """Extract an image URL from a feed <item>, if any."""
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        """Hook to rewrite an image URL; default is identity."""
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        """Extract the publication date string from a feed <item>."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        """Hook to rewrite a pubdate string before parsing; default identity."""
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Filter hook: False drops the item from the headlines page."""
        return True

    def do_headlines(self) -> bool:
        """False disables rendering of the headlines page."""
        return True

    def do_details(self) -> bool:
        """False disables rendering of the details page."""
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        """True if the item's pubDate is more than n days in the past.

        Items without a pubDate are treated as not-old (False).  The
        comparison is done in the pubdate's own timezone so naive and
        aware datetimes are never mixed.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        pubdatetime = parse(pubdate)
        tzinfo = pubdatetime.tzinfo
        now = datetime.datetime.now(tzinfo)
        delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return delta > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Filter hook: False drops the item from the details page."""
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic operation by key (from name_to_timeout_dict).

        Raises:
            ValueError: if key is not a recognized operation.
        """
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            # ValueError subclasses Exception, so existing catchers still work.
            raise ValueError(f'Unexpected operation key: {key}')

    def shuffle_news(self) -> bool:
        """Re-render the headlines and details pages from cached items.

        Returns False when there aren't enough cached items to build a
        page; True on success.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                logger.warning('Not enough messages to select from in shuffle_news?!')
                return False
            for msg in subset:
                headlines.add_item(msg)
            headlines.set_custom_html(self._LINK_STYLE)
            filename = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
            with file_writer.file_writer(filename) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(self._LINK_STYLE)
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                logger.warning('Not enough details to choose from in do_details')
                return False
            for msg in subset:
                details.add_item(msg + "</TD>")
            filename = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
            with file_writer.file_writer(filename) as g:
                details.render_html(g)
        return True

    def _fetch_rss(self, uri: str) -> Optional[ET.Element]:
        """Fetch one feed URI and return the parsed XML root, or None on
        any HTTP failure (which is logged).

        The HTTP(S) connection is always closed before returning (the
        previous code leaked one connection per feed URI).
        """
        if self.should_use_https():
            url = f'https://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
        else:
            url = f'http://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
        try:
            self.conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=50",
                },
            )
            try:
                response = self.conn.getresponse()
            except Exception as e:
                logger.exception(e)
                logger.error(
                    f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
                )
                return None
            if response.status != 200:
                logger.error(
                    f'Unexpected status {response.status} while fetching {url}; giving up.'
                )
                return None
            raw = response.read()
            logger.info(f'Status 200: got {len(raw)} bytes back from {url}')
            return ET.fromstring(raw)
        finally:
            self.conn.close()

    def fetch_news(self) -> bool:
        """Fetch all feeds and repopulate self.news / self.details with
        rendered HTML blurbs.

        Returns True if at least one interesting item was collected;
        False on the first hard HTTP failure.
        """
        count = 0
        self.news.clear()
        self.details.clear()
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None

        # Dedupe by title across ALL feed URIs (the set used to be rebuilt
        # per-feed, letting the same headline through twice).
        title_filter = set()
        for uri in self.feed_uris:
            rss = self._fetch_rss(uri)
            if rss is None:
                return False
            channel = rss[0]
            for item in list(channel):
                title = self.find_title(item)
                if title is None:
                    logger.info('Skipping RSS feed item with no title.')
                    continue
                title = self.munge_title(title, item)
                logger.debug(f'Considering RSS item {title}...')

                # Use the overridable hooks here (this used to call
                # item.findtext directly, silently ignoring subclass
                # find_description / find_link overrides).
                description = self.find_description(item)
                if description is not None:
                    description = self.munge_description(description, item)
                else:
                    description = ""
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)

                if not self.item_is_interesting_for_headlines(
                        title, description, item
                ):
                    logger.info(f'Skipping {title} because it\'s not interesting.')
                    continue

                if self.should_profanity_filter() and (
                    self.filter.contains_bad_word(title)
                    or self.filter.contains_bad_word(description)
                ):
                    logger.info(f'Skipping {title} because it contains profanity.')
                    continue

                if title in title_filter:
                    logger.info(f'Skipping {title} because we already saw an item with the same title.')
                    continue
                title_filter.add(title)

                blurb = """<DIV style="padding:8px;
                                font-size:34pt;
                                -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                    blurb += 'style="padding:8px;">'

                if link is None:
                    blurb += f"<P><B>{title}</B>"
                else:
                    blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    logger.debug(f'Raw pubdate={pubdate}')
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    logger.debug(f'Translated pubdate into: {ts}')
                    blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

                if self.item_is_interesting_for_article(title, description, item):
                    logger.info(f'Item {title} is also interesting as an article details page; creating...')
                    # Details page reuses the headline blurb with the full
                    # description appended and a larger font.
                    longblurb = blurb
                    longblurb += "<BR>"
                    longblurb += description
                    longblurb += "</DIV>"
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                else:
                    logger.info(f'Item {title} isn\'t interesting for article details page; skipped.')
                blurb += "</DIV>"
                self.news.add(blurb)
                count += 1
                logger.debug(f'Added {count} items so far...')
        return count > 0