#!/usr/bin/env python3

"""Abstract base renderer that polls one or more RSS feeds from a single
site and produces two kinds of kiosk pages: a shuffled "headlines" page
(four short blurbs) and a "details" page (one longer article blurb).

Subclasses override the abstract page-prefix methods and, optionally, the
``find_*`` / ``munge_*`` hooks to adapt to a particular feed's quirks.

NOTE(review): this file reached review with every newline collapsed and all
HTML markup stripped out of its string literals (several f-strings were left
unterminated).  The markup below is a plausible reconstruction — the
``font-size:34pt`` fragment is certainly original (the ``.replace`` call in
``fetch_news`` depends on it) — but it should be confirmed against version
control before shipping.
"""

from abc import abstractmethod
import datetime
import http.client
import logging
import re
from typing import Dict, List, Optional, Union
import xml.etree.ElementTree as ET

from dateutil.parser import parse
from scottutilz import profanity_filter

import file_writer
import grab_bag
import page_builder
import renderer

logger = logging.getLogger(__name__)


class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Fetch RSS items from ``feed_site`` and render headline/details pages."""

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site        # hostname the feeds live on
        self.feed_uris = feed_uris        # URI paths of the feeds on that host
        self.page_title = page_title      # title shown on rendered pages
        self.news = grab_bag.grab_bag()       # short headline blurbs
        self.details = grab_bag.grab_bag()    # longer article blurbs
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Filename prefix for the generated headlines page."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Filename prefix for the generated details page."""
        pass

    def get_headlines_page_priority(self) -> str:
        return "4"

    def get_details_page_priority(self) -> str:
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """Whether to fetch the feeds over HTTPS rather than HTTP."""
        pass

    def should_profanity_filter(self) -> bool:
        """Whether items containing profanity should be dropped."""
        return True

    # --- Per-field extraction hooks; override to adapt to odd feeds. ---

    def find_title(self, item: ET.Element) -> Optional[str]:
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        return item.findtext("description")

    def munge_description(self, description: str, item: ET.Element) -> str:
        # Default munging strips any embedded HTML tags.
        return re.sub("<[^>]+>", "", description)

    def find_link(self, item: ET.Element) -> Optional[str]:
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        return pubdate

    # --- Item selection hooks. ---

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        return True

    def do_headlines(self) -> bool:
        return True

    def do_details(self) -> bool:
        return True

    def is_item_older_than_n_days(
        self, item: ET.Element, n: int, default: bool = False
    ) -> bool:
        """Return True iff *item*'s pubDate is more than *n* days in the past.

        Returns *default* when the item carries no pubDate at all.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return default
        pubdatetime = parse(pubdate)
        # Compare in the item's own timezone so naive/aware datetimes match.
        now = datetime.datetime.now(pubdatetime.tzinfo)
        age_days = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return age_days > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic operation by its timeout-dict key."""
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            # Was a bare ``raise Exception``; include the offending key.
            raise Exception(f"Unexpected periodic_render key: {key}")

    def shuffle_news(self) -> bool:
        """Pick random subsets of fetched items and (re)write both pages.

        Returns False when there aren't enough fetched items to choose from.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                logger.warning("Not enough messages to select from in shuffle_news?!")
                return False
            for msg in subset:
                headlines.add_item(msg)
            # NOTE(review): the custom <STYLE> markup that belongs here was
            # lost when the file was mangled; restore from version control.
            headlines.set_custom_html("")
            filename = (
                f"{self.get_headlines_page_prefix()}_"
                f"{self.get_headlines_page_priority()}_25900.html"
            )
            with file_writer.file_writer(filename) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            # NOTE(review): custom markup lost here too; restore from VCS.
            details.set_custom_html("")
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                logger.warning("Not enough details to choose from in do_details")
                logger.debug("Not enough details to choose from.")
                return False
            for msg in subset:
                # NOTE(review): a trailing closing tag appended to each blurb
                # was stripped from the original; confirm what it was.
                details.add_item(msg)
            filename = (
                f"{self.get_details_page_prefix()}_"
                f"{self.get_details_page_priority()}_86400.html"
            )
            with file_writer.file_writer(filename) as g:
                details.render_html(g)
        return True

    def fetch_news(self) -> bool:
        """Fetch every configured feed URI and repopulate news/details bags.

        Returns True iff at least one interesting item was collected; returns
        False early on any HTTP failure.
        """
        count = 0
        self.news.clear()
        self.details.clear()
        self.conn: Optional[
            Union[http.client.HTTPConnection, http.client.HTTPSConnection]
        ] = None

        # Hoisted out of the per-URI loop so duplicate titles are filtered
        # across feeds, matching the "already saw an item with the same
        # title" log message below.
        title_filter: set = set()

        for uri in self.feed_uris:
            if self.should_use_https():
                url = f"https://{self.feed_site}{uri}"
                logger.info(f"Fetching: {url}")
                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
            else:
                url = f"http://{self.feed_site}{uri}"
                logger.info(f"Fetching: {url}")
                self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
            assert self.conn is not None
            assert url is not None
            self.conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=50",
                },
            )
            try:
                response = self.conn.getresponse()
            except Exception:
                logger.exception(
                    f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
                )
                return False
            if response.status != 200:
                logger.error(
                    f"Unexpected status {response.status} while fetching {url}: {response.reason}; giving up."
                )
                # Was a leftover debug print(); keep the detail, but log it.
                logger.debug("Response headers: %s", response.headers)
                return False
            raw = response.read()
            # Close each connection once its response is consumed; the
            # original leaked one connection per feed URI.
            self.conn.close()
            logger.info(f"Status 200: got {len(raw)} bytes back from {url}")

            rss = ET.fromstring(raw)
            channel = rss[0]
            for item in list(channel):
                title = self.find_title(item)
                if title is None:
                    logger.info("Skipping RSS feed item with no title.")
                    continue
                title = self.munge_title(title, item)
                logger.debug(f"Considering RSS item {title}...")

                # Use the overridable hook (the original bypassed it with a
                # raw item.findtext("description") call).
                description = self.find_description(item)
                if description is not None:
                    description = self.munge_description(description, item)
                else:
                    description = ""
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)

                if not self.item_is_interesting_for_headlines(title, description, item):
                    logger.info(f"Skipping {title} because it's not interesting.")
                    continue
                if self.should_profanity_filter() and (
                    self.filter.contains_bad_word(title)
                    or self.filter.contains_bad_word(description)
                ):
                    logger.info(f"Skipping {title} because it contains profanity.")
                    continue
                if title in title_filter:
                    logger.info(
                        f"Skipping {title} because we already saw an item with the same title."
                    )
                    continue
                title_filter.add(title)

                # NOTE(review): the blurb markup below is reconstructed; only
                # the "font-size:34pt" fragment is provably original.
                blurb = """<DIV style="padding:8px;
     font-size:34pt;
     -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 style="padding:8px;">'
                    blurb += f"<B>{title}</B>"
                else:
                    blurb += f"<B>{title}</B>"

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    logger.debug(f"Raw pubdate={pubdate}")
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    logger.debug(f"Translated pubdate into: {ts}")
                    blurb += f'<BR><FONT SIZE=-2> {ts.strftime("%b %d")}</FONT>'

                if self.item_is_interesting_for_article(title, description, item):
                    logger.info(
                        f"Item {title} is also interesting as an article details page; creating..."
                    )
                    longblurb = blurb
                    longblurb += "<BR>"
                    longblurb += description
                    longblurb += "</DIV>"
                    # Details pages render one item, so bump the font.
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                else:
                    logger.info(
                        f"Item {title} isn't interesting for article details page; skipped."
                    )
                blurb += "</DIV>"
                self.news.add(blurb)
                count += 1
                logger.debug(f"Added {count} items so far...")
        return count > 0