import http.client
import random
import re
+import sys
+import traceback
from typing import Dict, List, Optional, Union
import xml.etree.ElementTree as ET
self.page_title = page_title
self.news = grab_bag.grab_bag()
self.details = grab_bag.grab_bag()
- self.filter = profanity_filter.profanity_filter()
+ self.filter = profanity_filter.ProfanityFilter()
@abstractmethod
def debug_prefix(self) -> str:
def find_title(self, item: ET.Element) -> Optional[str]:
    """Return the text of the ``title`` child of *item*.

    Args:
        item: The feed element to inspect.

    Returns:
        The child's text, or None when *item* has no ``title`` child.
    """
    return item.findtext("title")
def munge_title(self, title: str, item: ET.Element) -> str:
    """Return the title text to use for *item*.

    This implementation returns *title* unchanged and does not look at
    *item*.  NOTE(review): presumably an override point for subclasses that
    need the element to rewrite the title -- confirm against the subclasses.

    Args:
        title: The title text found for the item.
        item: The feed element the title came from.

    Returns:
        The (here unmodified) title.
    """
    return title
def find_description(self, item: ET.Element) -> Optional[str]:
    """Return the text of the ``description`` child of *item*.

    Args:
        item: The feed element to inspect.

    Returns:
        The child's text, or None when *item* has no ``description`` child.
    """
    return item.findtext("description")
def munge_description(
    self,
    description: str,
    item: ET.Element,
) -> str:
    """Return *description* with markup tags removed.

    Every run of text matching ``<[^>]+>`` (an opening angle bracket, one or
    more non-``>`` characters, and a closing angle bracket) is deleted.  The
    *item* argument is not used by this implementation.

    Args:
        description: The raw description text.
        item: The feed element the description came from.

    Returns:
        The description with all such tag-like spans stripped.
    """
    return re.sub(r"<[^>]+>", "", description)
) -> bool:
return True
def do_headlines(self) -> bool:
    """Return whether the headlines page should be rendered.

    ``shuffle_news`` consults this before building the headlines page.  This
    implementation always returns True.
    """
    return True
+
def do_details(self) -> bool:
    """Return whether the details page should be rendered.

    ``shuffle_news`` consults this before building the details page.  This
    implementation always returns True.
    """
    return True
+
def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
pubdate = self.find_pubdate(item)
if pubdate is None:
raise Exception
def shuffle_news(self) -> bool:
    """Write the headlines page and the details page from the collected items.

    When ``do_headlines()`` is true, four entries are drawn from ``self.news``
    and rendered with the four-item layout.  When ``do_details()`` is true,
    one entry is drawn from ``self.details``, a closing ``</TD>`` is appended
    to it, and it is rendered with the one-item layout.  Each page is written
    through ``file_writer.file_writer`` to a name built from the page prefix
    and priority getters.

    Returns:
        False as soon as a grab bag cannot supply the requested number of
        entries (``subset`` returned None) -- in which case nothing further
        is rendered -- and True otherwise.
    """
    # Both pages use the same link styling, so the <STYLE> block is built once
    # instead of being spelled out twice.  Whitespace inside the stylesheet is
    # not significant to the browser.
    link_rules = "".join(
        f"a:{state} {{\n"
        "    color: black;\n"
        "    text-decoration: none;\n"
        "    font-weight: bold;\n"
        "}\n"
        for state in ("link", "visited", "active")
    )
    link_style = f"\n<STYLE>\n{link_rules}</STYLE>"

    if self.do_headlines():
        headlines = page_builder.page_builder()
        headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
        headlines.set_title(f"{self.page_title}")
        subset = self.news.subset(4)
        if subset is None:
            self.debug_print("Not enough messages to choose from.")
            return False
        for msg in subset:
            headlines.add_item(msg)
        headlines.set_custom_html(link_style)
        filename = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
        with file_writer.file_writer(filename) as f:
            headlines.render_html(f)

    if self.do_details():
        details = page_builder.page_builder()
        details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
        details.set_custom_html(link_style)
        details.set_title(f"{self.page_title}")
        subset = self.details.subset(1)
        if subset is None:
            self.debug_print("Not enough details to choose from.")
            return False
        for msg in subset:
            # Each stored detail blurb is missing its closing cell tag.
            details.add_item(msg + "</TD>")
        filename = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
        with file_writer.file_writer(filename) as g:
            details.render_html(g)
    return True
def fetch_news(self) -> bool:
for uri in self.feed_uris:
if self.should_use_https():
self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
- self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
+ self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
else:
self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
- self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
+ self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
assert(self.conn is not None)
self.conn.request(
"GET",
None,
{
"Accept": "*/*",
- "Cache-control": "max-age=59",
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+# "Cache-control": "max-age=50",
+# "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
},
)
try:
response = self.conn.getresponse()
- except:
- print("Exception in generic RSS renderer HTTP connection")
+ except Exception as e:
+ traceback.print_exc(file=sys.stdout)
+ print(
+ f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
+ )
return False
if response.status != 200:
rss = ET.fromstring(response.read())
channel = rss[0]
+ title_filter = set()
for item in channel.getchildren():
title = self.find_title(item)
- if title is not None:
- title = self.munge_title(title)
description = item.findtext("description")
+ if title is not None:
+ title = self.munge_title(title, item)
if description is not None:
- description = self.munge_description(description)
+ description = self.munge_description(description, item)
else:
description = ""
image = self.find_image(item)
continue
if self.should_profanity_filter() and (
- self.filter.contains_bad_words(title)
- or self.filter.contains_bad_words(description)
+ self.filter.contains_bad_word(title)
+ or self.filter.contains_bad_word(description)
):
self.debug_print(f'Found bad words in item "{title}"')
continue
+ if title in title_filter:
+ self.debug_print(f'Already saw title {title}, skipping.')
+ title_filter.add(title)
+
blurb = """<DIV style="padding:8px;
- font-size:34pt;
- -webkit-column-break-inside:avoid;">"""
+ font-size:34pt;
+ -webkit-column-break-inside:avoid;">"""
if image is not None:
blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
blurb += 'style="padding:8px;">'
ts = parse(pubdate)
blurb += f' <FONT COLOR=#cccccc>{ts.strftime("%b %d")}</FONT>'
- if description is not None and self.item_is_interesting_for_article(
- title, description, item
- ):
+ if self.item_is_interesting_for_article(title, description, item):
longblurb = blurb
longblurb += "<BR>"
longblurb += description