generic_news_rss_renderer.py

   1 #!/usr/bin/env python3
   2
   3 from abc import abstractmethod
   4 import datetime
   5 from dateutil.parser import parse
   6 import http.client
   7 import logging
   8 import re
   9 from typing import Dict, List, Optional, Union
  10 import xml.etree.ElementTree as ET
  11
  12 from scottutilz import profanity_filter
  13
  14 import file_writer
  15 import grab_bag
  16 import renderer
  17 import page_builder
  18
  19
  20 logger = logging.getLogger(__name__)
  21
  22
  23 class generic_news_rss_renderer(renderer.abstaining_renderer):
  24     def __init__(
  25         self,
  26         name_to_timeout_dict: Dict[str, int],
  27         feed_site: str,
  28         feed_uris: List[str],
  29         page_title: str,
  30     ):
  31         super().__init__(name_to_timeout_dict)
  32         self.feed_site = feed_site
  33         self.feed_uris = feed_uris
  34         self.page_title = page_title
  35         self.news = grab_bag.grab_bag()
  36         self.details = grab_bag.grab_bag()
  37         self.filter = profanity_filter.ProfanityFilter()
  38
  39     @abstractmethod
  40     def get_headlines_page_prefix(self) -> str:
  41         pass
  42
  43     @abstractmethod
  44     def get_details_page_prefix(self) -> str:
  45         pass
  46
  47     def get_headlines_page_priority(self) -> str:
  48         return "4"
  49
  50     def get_details_page_priority(self) -> str:
  51         return "6"
  52
  53     @abstractmethod
  54     def should_use_https(self) -> bool:
  55         pass
  56
  57     def should_profanity_filter(self) -> bool:
  58         return False
  59
  60     def find_title(self, item: ET.Element) -> Optional[str]:
  61         return item.findtext("title")
  62
  63     def munge_title(self, title: str, item: ET.Element) -> str:
  64         return title
  65
  66     def find_description(self, item: ET.Element) -> Optional[str]:
  67         return item.findtext("description")
  68
  69     def munge_description(self, description: str, item: ET.Element) -> str:
  70         description = re.sub("<[^>]+>", "", description)
  71         return description
  72
  73     def find_link(self, item: ET.Element) -> Optional[str]:
  74         return item.findtext("link")
  75
  76     def munge_link(self, link: str) -> str:
  77         return link
  78
  79     def find_image(self, item: ET.Element) -> Optional[str]:
  80         return item.findtext("image")
  81
  82     def munge_image(self, image: str) -> str:
  83         return image
  84
  85     def find_pubdate(self, item: ET.Element) -> Optional[str]:
  86         return item.findtext("pubDate")
  87
  88     def munge_pubdate(self, pubdate: str) -> str:
  89         return pubdate
  90
  91     def item_is_interesting_for_headlines(
  92         self, title: str, description: str, item: ET.Element
  93     ) -> bool:
  94         return True
  95
  96     def do_headlines(self) -> bool:
  97         return True
  98
  99     def do_details(self) -> bool:
 100         return True
 101
 102     def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
 103         pubdate = self.find_pubdate(item)
 104         if pubdate is None:
 105             return False
 106         pubdatetime = parse(pubdate)
 107         tzinfo = pubdatetime.tzinfo
 108         now = datetime.datetime.now(tzinfo)
 109         delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
 110         return delta > n
 111
 112     def item_is_interesting_for_article(
 113         self, title: str, description: str, item: ET.Element
 114     ) -> bool:
 115         return True
 116
 117     def periodic_render(self, key: str) -> bool:
 118         if key == "Fetch News":
 119             return self.fetch_news()
 120         elif key == "Shuffle News":
 121             return self.shuffle_news()
 122         else:
 123             raise Exception
 124
 125     def shuffle_news(self) -> bool:
 126         if self.do_headlines():
 127             headlines = page_builder.page_builder()
 128             headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
 129             headlines.set_title("%s" % self.page_title)
 130             subset = self.news.subset(4)
 131             if subset is None:
 132                 logger.warning("Not enough messages to select from in shuffle_news?!")
 133                 return False
 134             for msg in subset:
 135                 headlines.add_item(msg)
 136             headlines.set_custom_html(
 137                 """
 138     <STYLE>
 139     a:link {
 140       color: black;
 141       text-decoration: none;
 142       font-weight: bold;
 143     }
 144     a:visited {
 145       color: black;
 146       text-decoration: none;
 147       font-weight: bold;
 148     }
 149     a:active {
 150       color: black;
 151       text-decoration: none;
 152       font-weight: bold;
 153     }
 154     </STYLE>"""
 155             )
 156             _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
 157             with file_writer.file_writer(_) as f:
 158                 headlines.render_html(f)
 159
 160         if self.do_details():
 161             details = page_builder.page_builder()
 162             details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
 163             details.set_custom_html(
 164                 """
 165     <STYLE>
 166     a:link {
 167       color: black;
 168       text-decoration: none;
 169       font-weight: bold;
 170     }
 171     a:visited {
 172       color: black;
 173       text-decoration: none;
 174       font-weight: bold;
 175     }
 176     a:active {
 177       color: black;
 178       text-decoration: none;
 179       font-weight: bold;
 180     }
 181     </STYLE>"""
 182             )
 183             details.set_title(self.page_title)
 184             subset = self.details.subset(1)
 185             if subset is None:
 186                 logger.warning("Not enough details to choose from in do_details")
 187                 logger.debug("Not enough details to choose from.")
 188                 return False
 189             for msg in subset:
 190                 blurb = msg
 191                 blurb += "</TD>"
 192                 details.add_item(blurb)
 193             _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
 194             with file_writer.file_writer(_) as g:
 195                 details.render_html(g)
 196         return True
 197
 198     def fetch_news(self) -> bool:
 199         count = 0
 200         self.news.clear()
 201         self.details.clear()
 202         self.conn: Optional[
 203             Union[http.client.HTTPConnection, http.client.HTTPSConnection]
 204         ] = None
 205
 206         for uri in self.feed_uris:
 207             url = None
 208             if self.should_use_https():
 209                 url = f"https://{self.feed_site}{uri}"
 210                 logger.info(f"Fetching: {url}")
 211                 self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
 212             else:
 213                 url = f"http://{self.feed_site}{uri}"
 214                 logger.info(f"Fetching: {url}")
 215                 self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
 216             assert self.conn is not None
 217             assert url is not None
 218             self.conn.request(
 219                 "GET",
 220                 uri,
 221                 None,
 222                 {
 223                     "Accept": "*/*",
 224                     "Cache-control": "max-age=50",
 225                 },
 226             )
 227             try:
 228                 response = self.conn.getresponse()
 229             except Exception:
 230                 logger.exception(
 231                     f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
 232                 )
 233                 return False
 234
 235             if response.status != 200:
 236                 logger.error(
 237                     f"Unexpected status {response.status} while fetching {url}; giving up."
 238                 )
 239                 return False
 240
 241             raw = response.read()
 242             logger.info(f"Status 200: got {len(raw)} bytes back from {url}")
 243             rss = ET.fromstring(raw)
 244             channel = rss[0]
 245             title_filter = set()
 246             for item in list(channel):
 247                 title = self.find_title(item)
 248                 description = item.findtext("description")
 249                 if title is not None:
 250                     title = self.munge_title(title, item)
 251                 else:
 252                     logger.info("Skipping RSS feed item with no title.")
 253                     continue
 254                 logger.debug(f"Considering RSS item {title}...")
 255                 if description is not None:
 256                     description = self.munge_description(description, item)
 257                 else:
 258                     description = ""
 259                 image = self.find_image(item)
 260                 if image is not None:
 261                     image = self.munge_image(image)
 262                 link = item.findtext("link")
 263                 if link is not None:
 264                     link = self.munge_link(link)
 265                 if not self.item_is_interesting_for_headlines(title, description, item):
 266                     logger.info(f"Skipping {title} because it's not interesting.")
 267                     continue
 268
 269                 if self.should_profanity_filter() and (
 270                     self.filter.contains_bad_word(title)
 271                     or self.filter.contains_bad_word(description)
 272                 ):
 273                     logger.info(f"Skipping {title} because it contains profanity.")
 274                     continue
 275
 276                 if title in title_filter:
 277                     logger.info(
 278                         f"Skipping {title} because we already saw an item with the same title."
 279                     )
 280                     continue
 281                 title_filter.add(title)
 282
 283                 blurb = """<DIV style="padding:8px;
 284                                 font-size:34pt;
 285                                 -webkit-column-break-inside:avoid;">"""
 286                 if image is not None:
 287                     blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
 288                     blurb += 'style="padding:8px;">'
 289
 290                 if link is None:
 291                     blurb += f"<P><B>{title}</B>"
 292                 else:
 293                     blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
 294
 295                 pubdate = self.find_pubdate(item)
 296                 if pubdate is not None:
 297                     logger.debug(f"Raw pubdate={pubdate}")
 298                     pubdate = self.munge_pubdate(pubdate)
 299                     ts = parse(pubdate)
 300                     logger.debug(f"Translated pubdate into: {ts}")
 301                     blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'
 302
 303                 if self.item_is_interesting_for_article(title, description, item):
 304                     logger.info(
 305                         f"Item {title} is also interesting as an article details page; creating..."
 306                     )
 307                     longblurb = blurb
 308                     longblurb += "<BR>"
 309                     longblurb += description
 310                     longblurb += "</DIV>"
 311                     longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
 312                     self.details.add(longblurb)
 313                 else:
 314                     logger.info(
 315                         f"Item {title} isn't interesting for article details page; skipped."
 316                     )
 317                 blurb += "</DIV>"
 318                 self.news.add(blurb)
 319                 count += 1
 320                 logger.debug(f"Added {count} items so far...")
 321         return count > 0