X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=generic_news_rss_renderer.py;h=f1261cb59e171cfa2a8348487839cb7edc2dd124;hb=5ea88ab72e175e2d4f57ae8645ca6f825549a7a9;hp=e73db4e7f983db3321432a06c74166236eb71149;hpb=c06bfef53f70551e7920bc4facce27f47b89e2ba;p=kiosk.git diff --git a/generic_news_rss_renderer.py b/generic_news_rss_renderer.py index e73db4e..f1261cb 100644 --- a/generic_news_rss_renderer.py +++ b/generic_news_rss_renderer.py @@ -4,19 +4,23 @@ from abc import abstractmethod import datetime from dateutil.parser import parse import http.client -import random +import logging import re -from typing import Dict, List +from typing import Dict, List, Optional, Union import xml.etree.ElementTree as ET +from scottutilz import profanity_filter + import file_writer import grab_bag import renderer import page_builder -import profanity_filter -class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): +logger = logging.getLogger(__name__) + + +class generic_news_rss_renderer(renderer.abstaining_renderer): def __init__( self, name_to_timeout_dict: Dict[str, int], @@ -24,18 +28,13 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): feed_uris: List[str], page_title: str, ): - super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict, False) - self.debug = True + super().__init__(name_to_timeout_dict) self.feed_site = feed_site self.feed_uris = feed_uris self.page_title = page_title self.news = grab_bag.grab_bag() self.details = grab_bag.grab_bag() - self.filter = profanity_filter.profanity_filter() - - @abstractmethod - def debug_prefix(self) -> str: - pass + self.filter = profanity_filter.ProfanityFilter() @abstractmethod def get_headlines_page_prefix(self) -> str: @@ -56,34 +55,34 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): pass def should_profanity_filter(self) -> bool: - return False + return True - def find_title(self, item: ET.Element) -> str: + def find_title(self, item: ET.Element) -> Optional[str]: return item.findtext("title") - def munge_title(self, title: str) -> str: + def munge_title(self, title: str, item: ET.Element) -> str: return title - def find_description(self, item: ET.Element) -> str: + def find_description(self, item: ET.Element) -> Optional[str]: return item.findtext("description") - def munge_description(self, description: str) -> str: + def munge_description(self, description: str, item: ET.Element) -> str: description = re.sub("<[^>]+>", "", description) return description - def find_link(self, item: ET.Element) -> str: + def find_link(self, item: ET.Element) -> Optional[str]: return item.findtext("link") def munge_link(self, link: str) -> str: return link - def find_image(self, item: ET.Element) -> str: + def find_image(self, item: ET.Element) -> Optional[str]: return item.findtext("image") def munge_image(self, image: str) -> str: return image - def find_pubdate(self, item: ET.Element) -> str: + def find_pubdate(self, item: ET.Element) -> Optional[str]: return item.findtext("pubDate") def munge_pubdate(self, pubdate: str) -> str: @@ -94,14 +93,22 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): ) -> bool: return True - def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool: + def do_headlines(self) -> bool: + return True + + def do_details(self) -> bool: + return True + + def is_item_older_than_n_days( + self, item: ET.Element, n: int, default: bool = False + ) -> bool: pubdate = self.find_pubdate(item) if pubdate is None: - return False - pubdate = parse(pubdate) - tzinfo = pubdate.tzinfo + return default + pubdatetime = parse(pubdate) + tzinfo = pubdatetime.tzinfo now = datetime.datetime.now(tzinfo) - delta = (now - pubdate).total_seconds() / (60 * 60 * 24) + delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24) return delta > n def item_is_interesting_for_article( @@ -115,145 +122,171 @@ class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer): elif key == "Shuffle News": return self.shuffle_news() else: - raise error("Unexpected operation") + raise Exception def shuffle_news(self) -> bool: - headlines = page_builder.page_builder() - headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS) - headlines.set_title("%s" % self.page_title) - subset = self.news.subset(4) - if subset is None: - self.debug_print("Not enough messages to choose from.") - return False - for msg in subset: - headlines.add_item(msg) - headlines.set_custom_html( - """ -""" - ) - _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html" - with file_writer.file_writer(_) as f: - headlines.render_html(f) - - details = page_builder.page_builder() - details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM) - details.set_custom_html( - """ -""" - ) - details.set_title(f"{self.page_title}") - subset = self.details.subset(1) - if subset is None: - self.debug_print("Not enough details to choose from.") - return False - for msg in subset: - blurb = msg - blurb += "" - details.add_item(blurb) - _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html" - with file_writer.file_writer(_) as g: - details.render_html(g) + if self.do_headlines(): + headlines = page_builder.page_builder() + headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS) + headlines.set_title("%s" % self.page_title) + subset = self.news.subset(4) + if subset is None: + logger.warning("Not enough messages to select from in shuffle_news?!") + return False + for msg in subset: + headlines.add_item(msg) + headlines.set_custom_html( + """ + """ + ) + _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html" + with file_writer.file_writer(_) as f: + headlines.render_html(f) + + if self.do_details(): + details = page_builder.page_builder() + details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM) + details.set_custom_html( + """ + """ + ) + details.set_title(self.page_title) + subset = self.details.subset(1) + if subset is None: + logger.warning("Not enough details to choose from in do_details") + logger.debug("Not enough details to choose from.") + return False + for msg in subset: + blurb = msg + blurb += "" + details.add_item(blurb) + _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html" + with file_writer.file_writer(_) as g: + details.render_html(g) return True def fetch_news(self) -> bool: count = 0 self.news.clear() self.details.clear() + self.conn: Optional[ + Union[http.client.HTTPConnection, http.client.HTTPSConnection] + ] = None for uri in self.feed_uris: + url = None if self.should_use_https(): - self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri)) - self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20) + url = f"https://{self.feed_site}{uri}" + logger.info(f"Fetching: {url}") + self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10) else: - self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri)) - self.conn = http.client.HTTPConnection(self.feed_site, timeout=20) + url = f"http://{self.feed_site}{uri}" + logger.info(f"Fetching: {url}") + self.conn = http.client.HTTPConnection(self.feed_site, timeout=10) + assert self.conn is not None + assert url is not None self.conn.request( "GET", uri, None, { "Accept": "*/*", - "Cache-control": "max-age=59", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36", + "Cache-control": "max-age=50", }, ) try: response = self.conn.getresponse() - except: - print("Exception in generic RSS renderer HTTP connection") + except Exception: + logger.exception( + f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up." + ) return False if response.status != 200: - print( - f"{self.page_title}: RSS fetch_news error, response: {response.status}" + logger.error( + f"Unexpected status {response.status} while fetching {url}: {response.reason}; giving up." ) - self.debug_print(response.read()) + print(dir(response)) + print(response.headers) return False - rss = ET.fromstring(response.read()) + raw = response.read() + logger.info(f"Status 200: got {len(raw)} bytes back from {url}") + rss = ET.fromstring(raw) channel = rss[0] - for item in channel.getchildren(): + title_filter = set() + for item in list(channel): title = self.find_title(item) - if title is not None: - title = self.munge_title(title) description = item.findtext("description") + if title is not None: + title = self.munge_title(title, item) + else: + logger.info("Skipping RSS feed item with no title.") + continue + logger.debug(f"Considering RSS item {title}...") if description is not None: - description = self.munge_description(description) + description = self.munge_description(description, item) + else: + description = "" image = self.find_image(item) if image is not None: image = self.munge_image(image) link = item.findtext("link") if link is not None: link = self.munge_link(link) - - if title is None or not self.item_is_interesting_for_headlines( - title, description, item - ): - self.debug_print(f'Item "{title}" is not interesting') + if not self.item_is_interesting_for_headlines(title, description, item): + logger.info(f"Skipping {title} because it's not interesting.") continue if self.should_profanity_filter() and ( - self.filter.contains_bad_words(title) - or self.filter.contains_bad_words(description) + self.filter.contains_bad_word(title) + or self.filter.contains_bad_word(description) ): - self.debug_print(f'Found bad words in item "{title}"') + logger.info(f"Skipping {title} because it contains profanity.") continue + if title in title_filter: + logger.info( + f"Skipping {title} because we already saw an item with the same title." + ) + continue + title_filter.add(title) + blurb = """
""" + font-size:34pt; + -webkit-column-break-inside:avoid;">""" if image is not None: blurb += f'{ts.strftime("%b %d")}' - if description is not None and self.item_is_interesting_for_article( - title, description, item - ): + if self.item_is_interesting_for_article(title, description, item): + logger.info( + f"Item {title} is also interesting as an article details page; creating..." + ) longblurb = blurb longblurb += "
" longblurb += description longblurb += "
" longblurb = longblurb.replace("font-size:34pt", "font-size:44pt") self.details.add(longblurb) + else: + logger.info( + f"Item {title} isn't interesting for article details page; skipped." + ) blurb += "" self.news.add(blurb) count += 1 + logger.debug(f"Added {count} items so far...") return count > 0