+#!/usr/bin/env python3
+
+from abc import abstractmethod
import datetime
from dateutil.parser import parse
+import http.client
+import logging
+import re
+from typing import Dict, List, Optional, Union
+import xml.etree.ElementTree as ET
+
+from scottutilz import profanity_filter
+
import file_writer
import grab_bag
import renderer
-import http.client
import page_builder
-import profanity_filter
-import random
-import re
-import xml.etree.ElementTree as ET
# Module-level logger.  Fix: use __name__ rather than __file__ so the logger
# is named by dotted module path (the logging convention) instead of a
# filesystem path that varies by install location.
logger = logging.getLogger(__name__)


class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Base renderer that fetches RSS feeds and renders headline/detail pages.

    Subclasses customize behavior via the get_*/should_*/find_*/munge_* hooks.
    """

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Args:
            name_to_timeout_dict: maps periodic operation name -> period.
            feed_site: hostname of the RSS feed server.
            feed_uris: list of URI paths to fetch from feed_site.
            page_title: title shown on the rendered pages.
        """
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        # Random grab bags of rendered HTML blurbs: short ones for the
        # headlines page, long ones for the details page.
        self.news = grab_bag.grab_bag()
        self.details = grab_bag.grab_bag()
        self.filter = profanity_filter.ProfanityFilter()
@abstractmethod
def get_headlines_page_prefix(self) -> str:
    """Filename prefix for the headlines page; subclasses must supply."""
    pass


@abstractmethod
def get_details_page_prefix(self) -> str:
    """Filename prefix for the details page; subclasses must supply."""
    pass


def get_headlines_page_priority(self) -> str:
    """Relative display priority of the headlines page (as a string)."""
    return "4"


def get_details_page_priority(self) -> str:
    """Relative display priority of the details page (as a string)."""
    return "6"


@abstractmethod
def should_use_https(self) -> bool:
    """Whether to fetch the feed over HTTPS; subclasses must decide."""
    pass


def should_profanity_filter(self) -> bool:
    """Whether feed text should be profanity filtered; off by default."""
    return False
def find_title(self, item: ET.Element) -> Optional[str]:
    """Return the <title> text of an RSS item, or None if absent."""
    return item.findtext("title")


def munge_title(self, title: str, item: ET.Element) -> str:
    """Hook for subclasses to rewrite a title; identity by default."""
    return title


def find_description(self, item: ET.Element) -> Optional[str]:
    """Return the <description> text of an RSS item, or None if absent."""
    return item.findtext("description")


def munge_description(self, description: str, item: ET.Element) -> str:
    """Default description cleanup: strip embedded HTML tags."""
    return re.sub(r"<[^>]+>", "", description)


def find_link(self, item: ET.Element) -> Optional[str]:
    """Return the <link> text of an RSS item, or None if absent."""
    return item.findtext("link")


def munge_link(self, link: str) -> str:
    """Hook for subclasses to rewrite a link; identity by default."""
    return link


def find_image(self, item: ET.Element) -> Optional[str]:
    """Return the <image> text of an RSS item, or None if absent."""
    return item.findtext("image")


def munge_image(self, image: str) -> str:
    """Hook for subclasses to rewrite an image URL; identity by default."""
    return image


def find_pubdate(self, item: ET.Element) -> Optional[str]:
    """Return the <pubDate> text of an RSS item, or None if absent."""
    return item.findtext("pubDate")


def munge_pubdate(self, pubdate: str) -> str:
    """Hook for subclasses to rewrite a pubdate; identity by default."""
    return pubdate
def item_is_interesting_for_headlines(
    self, title: str, description: str, item: ET.Element
) -> bool:
    """Hook: include this item on the headlines page?  Defaults to yes."""
    return True


def do_headlines(self) -> bool:
    """Hook: does this renderer produce a headlines page?  Defaults to yes."""
    return True


def do_details(self) -> bool:
    """Hook: does this renderer produce a details page?  Defaults to yes."""
    return True
def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
    """True iff the item's pubDate is more than n days before now.

    Items without a pubDate are treated as not-old (returns False).
    """
    pubdate = self.find_pubdate(item)
    if pubdate is None:
        return False
    when = parse(pubdate)
    # Compare in the pubdate's own timezone so naive/aware stays consistent.
    age_days = (datetime.datetime.now(when.tzinfo) - when).total_seconds() / (60 * 60 * 24)
    return age_days > n


def item_is_interesting_for_article(
    self, title: str, description: str, item: ET.Element
) -> bool:
    """Hook: give this item a details/article page?  Defaults to yes."""
    return True
def periodic_render(self, key: str) -> bool:
    """Dispatch one unit of periodic work by operation name.

    Args:
        key: "Fetch News" to refetch and reparse the feeds, or
             "Shuffle News" to re-render the pages from cached items.

    Returns:
        The dispatched operation's success flag.

    Raises:
        Exception: if key names an unknown operation.
    """
    if key == "Fetch News":
        return self.fetch_news()
    if key == "Shuffle News":
        return self.shuffle_news()
    # Fix: the bare `raise Exception` carried no diagnostic at all; restore
    # the "unexpected operation" context (and the offending key) so the
    # failure is actionable.
    raise Exception(f"Unexpected operation: {key}")
def shuffle_news(self) -> bool:
    """Re-render the headlines and details pages from the grab bags.

    Picks a random subset of previously fetched blurbs for each enabled
    page and writes the page HTML via file_writer.

    Returns:
        True on success; False if an enabled page lacks enough material.
    """
    # Fix: this identical <STYLE> blob was duplicated verbatim for both
    # pages; define it once.  Links render black, bold, undecorated.
    link_style = """
            <STYLE>
            a:link {
              color: black;
              text-decoration: none;
              font-weight: bold;
            }
            a:visited {
              color: black;
              text-decoration: none;
              font-weight: bold;
            }
            a:active {
              color: black;
              text-decoration: none;
              font-weight: bold;
            }
            </STYLE>"""

    if self.do_headlines():
        headlines = page_builder.page_builder()
        headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
        headlines.set_title(self.page_title)
        subset = self.news.subset(4)
        if subset is None:
            logger.warning('Not enough messages to select from in shuffle_news?!')
            return False
        for msg in subset:
            headlines.add_item(msg)
        headlines.set_custom_html(link_style)
        # Fix: the filename was bound to the throwaway name `_`.
        headlines_file = (
            f"{self.get_headlines_page_prefix()}_"
            f"{self.get_headlines_page_priority()}_25900.html"
        )
        with file_writer.file_writer(headlines_file) as f:
            headlines.render_html(f)

    if self.do_details():
        details = page_builder.page_builder()
        details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
        details.set_custom_html(link_style)
        details.set_title(self.page_title)
        subset = self.details.subset(1)
        if subset is None:
            # Fix: this condition was logged twice (warning + debug with the
            # same message); log it once.
            logger.warning('Not enough details to choose from in do_details')
            return False
        for msg in subset:
            details.add_item(msg + "</TD>")
        details_file = (
            f"{self.get_details_page_prefix()}_"
            f"{self.get_details_page_priority()}_86400.html"
        )
        with file_writer.file_writer(details_file) as g:
            details.render_html(g)
    return True
def fetch_news(self) -> bool:
    """Fetch and parse every configured feed URI, repopulating the grab bags.

    For each RSS item that survives the interestingness / profanity /
    duplicate-title filters, a short HTML blurb is added to self.news and
    (when interesting as an article) a long blurb to self.details.

    Returns:
        True iff at least one item was collected.  Returns False early on
        any HTTP failure or non-200 response.
    """
    count = 0
    self.news.clear()
    self.details.clear()
    self.conn: Optional[Union[http.client.HTTPConnection,
                              http.client.HTTPSConnection]] = None
    for uri in self.feed_uris:
        if self.should_use_https():
            url = f'https://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
        else:
            url = f'http://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
        try:
            try:
                # Fix: request() was previously outside the try, so a socket
                # error during send escaped the handler meant for it.
                self.conn.request(
                    "GET",
                    uri,
                    None,
                    {
                        "Accept": "*/*",
                        "Cache-control": "max-age=50",
                    },
                )
                response = self.conn.getresponse()
            except Exception as e:
                logger.exception(e)
                logger.error(
                    f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
                )
                return False
            if response.status != 200:
                logger.error(
                    f'Unexpected status {response.status} while fetching {url}; giving up.'
                )
                return False
            raw = response.read()
        finally:
            # Fix: connections were never closed -- one leaked per feed URI,
            # and the early-return error paths leaked as well.
            self.conn.close()
        logger.info(f'Status 200: got {len(raw)} bytes back from {url}')
        rss = ET.fromstring(raw)
        channel = rss[0]
        # Dedupe by title within this feed.
        # NOTE(review): the set resets per URI, so the same headline from two
        # different feed URIs is not deduped -- confirm that's intended.
        titles_seen = set()
        for item in channel:
            title = self.find_title(item)
            if title is None:
                logger.info('Skipping RSS feed item with no title.')
                continue
            title = self.munge_title(title, item)
            logger.debug(f'Considering RSS item {title}...')
            description = item.findtext("description")
            if description is not None:
                description = self.munge_description(description, item)
            else:
                description = ""
            image = self.find_image(item)
            if image is not None:
                image = self.munge_image(image)
            link = item.findtext("link")
            if link is not None:
                link = self.munge_link(link)
            if not self.item_is_interesting_for_headlines(title, description, item):
                logger.info(f'Skipping {title} because it\'s not interesting.')
                continue
            if self.should_profanity_filter() and (
                self.filter.contains_bad_word(title)
                or self.filter.contains_bad_word(description)
            ):
                logger.info(f'Skipping {title} because it contains profanity.')
                continue
            if title in titles_seen:
                logger.info(f'Skipping {title} because we already saw an item with the same title.')
                continue
            titles_seen.add(title)

            # Short blurb for the headlines page.
            blurb = """<DIV style="padding:8px;
                    font-size:34pt;
                    -webkit-column-break-inside:avoid;">"""
            if image is not None:
                blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                blurb += 'style="padding:8px;">'
            if link is None:
                blurb += f"<P><B>{title}</B>"
            else:
                blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
            pubdate = self.find_pubdate(item)
            if pubdate is not None:
                logger.debug(f'Raw pubdate={pubdate}')
                pubdate = self.munge_pubdate(pubdate)
                ts = parse(pubdate)
                logger.debug(f'Translated pubdate into: {ts}')
                blurb += f' <FONT COLOR=#cccccc>{ts.strftime("%b %d")}</FONT>'
            if self.item_is_interesting_for_article(title, description, item):
                logger.info(f'Item {title} is also interesting as an article details page; creating...')
                # Long blurb for the details page: same header plus the
                # description, in a larger font.
                longblurb = blurb + "<BR>" + description + "</DIV>"
                longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                self.details.add(longblurb)
            else:
                logger.info(f'Item {title} isn\'t interesting for article details page; skipped.')
            blurb += "</DIV>"
            self.news.add(blurb)
            count += 1
            logger.debug(f'Added {count} items so far...')
    return count > 0