3 from abc import abstractmethod
5 from dateutil.parser import parse
9 from typing import Dict, List, Optional, Union
10 import xml.etree.ElementTree as ET
16 import profanity_filter
# Module logger.  Use __name__ (the standard convention) rather than
# __file__: __file__ produces a path-based logger name that varies with the
# working directory and doesn't participate in the logging name hierarchy.
logger = logging.getLogger(__name__)
class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Base renderer for RSS-backed news pages.

    Fetches XML from feed_site at each URI in feed_uris (see fetch_news),
    screens and munges items via overridable hooks, and renders headline
    and detail HTML pages (see shuffle_news).
    """

        # (the def __init__ line and other parameters are elided in this view)
        name_to_timeout_dict: Dict[str, int],  # render keys -> timeouts; forwarded to the base class
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site        # host handed to http.client connections in fetch_news
        self.feed_uris = feed_uris        # URI paths fetched from feed_site
        self.page_title = page_title      # title shown on the generated pages
        self.news = grab_bag.grab_bag()     # headline blurbs collected by fetch_news
        self.details = grab_bag.grab_bag()  # long-form article blurbs
        self.filter = profanity_filter.ProfanityFilter()  # consulted when should_profanity_filter() is True
    # ------------------------------------------------------------------
    # Subclass hooks.  Bodies/decorators are not visible in this view;
    # the file imports abstractmethod, so these are presumably abstract.
    # ------------------------------------------------------------------

    # Filename prefix of the generated headlines page; combined with
    # get_headlines_page_priority() by shuffle_news.
    def get_headlines_page_prefix(self) -> str:

    # Filename prefix of the generated details page; combined with
    # get_details_page_priority() by shuffle_news.
    def get_details_page_prefix(self) -> str:

    # Priority component of the headlines page filename.
    def get_headlines_page_priority(self) -> str:

    # Priority component of the details page filename.
    def get_details_page_priority(self) -> str:

    # True to fetch feeds over HTTPS, False for plain HTTP (see fetch_news).
    def should_use_https(self) -> bool:

    # True to drop items whose title/description contains profanity.
    def should_profanity_filter(self) -> bool:
59 def find_title(self, item: ET.Element) -> Optional[str]:
60 return item.findtext("title")
    # Hook: normalize/transform a raw item title before use
    # (body not visible in this view).
    def munge_title(self, title: str, item: ET.Element) -> str:
65 def find_description(self, item: ET.Element) -> Optional[str]:
66 return item.findtext("description")
    def munge_description(
        # (remainder of the signature is elided in this view)
        # Strip every markup tag from the description, leaving plain text.
        description = re.sub("<[^>]+>", "", description)
76 def find_link(self, item: ET.Element) -> Optional[str]:
77 return item.findtext("link")
    # Hook: normalize/transform an item's link URL before rendering
    # (body not visible in this view).
    def munge_link(self, link: str) -> str:
82 def find_image(self, item: ET.Element) -> Optional[str]:
83 return item.findtext("image")
    # Hook: normalize/transform an item's image URL before rendering
    # (body not visible in this view).
    def munge_image(self, image: str) -> str:
88 def find_pubdate(self, item: ET.Element) -> Optional[str]:
89 return item.findtext("pubDate")
    # Hook: normalize/transform an item's pubDate string
    # (body not visible in this view).
    def munge_pubdate(self, pubdate: str) -> str:
    # Hook: should this item appear on the headlines page?  Consulted by
    # fetch_news after munging.  (Rest of signature/body not visible here.)
    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    # Toggles checked by shuffle_news before building each page
    # (bodies not visible in this view).
    def do_headlines(self) -> bool:

    def do_details(self) -> bool:
    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        # Age test based on the item's pubDate.  The final comparison/return
        # is not visible in this view; the name implies "age > n days".
        pubdate = self.find_pubdate(item)
        # (None-pubdate guard elided in this view)
        pubdatetime = parse(pubdate)
        # Take "now" in the pubdate's own timezone so both datetimes are
        # aware (or both naive) and the subtraction below is legal.
        tzinfo = pubdatetime.tzinfo
        now = datetime.datetime.now(tzinfo)
        delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)  # age in fractional days
    # Hook: should this item also get a long-form details page?  Consulted
    # by fetch_news.  (Rest of signature/body not visible in this view.)
    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    def periodic_render(self, key: str) -> bool:
        """Dispatch periodic work by key: fetch the feeds or reshuffle pages."""
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        # (handling of unrecognized keys elided in this view)
    def shuffle_news(self) -> bool:
        """Pick random items from the grab bags and (re)write the headline
        and details HTML pages.

        The output filenames embed the subclass prefix, priority, and a
        trailing number (25900 / 86400 -- presumably an expiry/refresh
        value in seconds; confirm against file_writer's consumers).
        Several guard/loop lines and the custom-HTML string bodies are
        elided in this view.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title("%s" % self.page_title)
            # Up to four random headlines for the four-item layout.
            subset = self.news.subset(4)
            # (empty-subset guard elided in this view)
            logger.warning('Not enough messages to select from in shuffle_news?!')
            # (per-message loop elided in this view)
            headlines.add_item(msg)
            headlines.set_custom_html(
                text-decoration: none;
                text-decoration: none;
                text-decoration: none;
            _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
            with file_writer.file_writer(_) as f:
                headlines.render_html(f)
        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(
                text-decoration: none;
                text-decoration: none;
                text-decoration: none;
            details.set_title(self.page_title)
            # One random article for the single-item detail page.
            subset = self.details.subset(1)
            # (empty-subset guard elided in this view)
            logger.warning('Not enough details to choose from in do_details')
            logger.debug("Not enough details to choose from.")
            details.add_item(blurb)
            _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
            with file_writer.file_writer(_) as g:
                details.render_html(g)
    def fetch_news(self) -> bool:
        """Fetch and parse every configured feed URI, collecting interesting
        items into self.news (headline blurbs) and self.details (long-form
        article blurbs).

        Several lines (the try/else keywords, the conn.request call, the
        channel lookup, and some guards) are elided in this view; comments
        below mark where.
        """
        # One connection object reused per URI; typed for both schemes.
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None
        for uri in self.feed_uris:
            if self.should_use_https():
                url = f'https://{self.feed_site}{uri}'
                logger.info(f'Fetching: {url}')
                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
            # (else branch keyword elided in this view)
                url = f'http://{self.feed_site}{uri}'
                logger.info(f'Fetching: {url}')
                self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
            assert self.conn is not None
            assert url is not None
            # Header below belongs to an elided self.conn.request(...) call.
                "Cache-control": "max-age=50",
            # (try: keyword elided in this view)
            response = self.conn.getresponse()
            except Exception as e:
                # Message below belongs to an elided logger call.
                f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
            if response.status != 200:
                # Message below belongs to an elided logger call.
                f'Unexpected status {response.status} while fetching {url}; giving up.'
            raw = response.read()
            logger.info(f'Status 200: got {len(raw)} bytes back from {url}')
            rss = ET.fromstring(raw)
            # (channel is derived from rss in an elided line)
            for item in list(channel):
                title = self.find_title(item)
                # NOTE(review): reads the description directly instead of
                # via the find_description() hook -- confirm intentional.
                description = item.findtext("description")
                if title is not None:
                    title = self.munge_title(title, item)
                # (no-title branch elided in this view)
                logger.info('Skipping RSS feed item with no title.')
                logger.debug(f'Considering RSS item {title}...')
                if description is not None:
                    description = self.munge_description(description, item)
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                # NOTE(review): bypasses the find_link() hook -- confirm.
                link = item.findtext("link")
                # (link None-guard elided in this view)
                link = self.munge_link(link)
                if not self.item_is_interesting_for_headlines(
                    title, description, item
                    logger.info(f'Skipping {title} because it\'s not interesting.')
                if self.should_profanity_filter() and (
                    self.filter.contains_bad_word(title)
                    or self.filter.contains_bad_word(description)
                    logger.info(f'Skipping {title} because it contains profanity.')
                # De-dup: drop items whose exact title was already seen.
                if title in title_filter:
                    logger.info(f'Skipping {title} because we already saw an item with the same title.')
                title_filter.add(title)
                blurb = """<DIV style="padding:8px;
                -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                    blurb += 'style="padding:8px;">'
                # Title rendered plain or as a link (branch keywords elided).
                blurb += f"<P><B>{title}</B>"
                blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    logger.debug(f'Raw pubdate={pubdate}')
                    pubdate = self.munge_pubdate(pubdate)
                    # (ts is derived from pubdate in an elided line)
                    logger.debug(f'Translated pubdate into: {ts}')
                    blurb += f' <FONT COLOR=#cccccc>{ts.strftime("%b %d")}</FONT>'
                if self.item_is_interesting_for_article(title, description, item):
                    logger.info(f'Item {title} is also interesting as an article details page; creating...')
                    # (longblurb is initialized in an elided line)
                    longblurb += description
                    longblurb += "</DIV>"
                    # Scale the font up for the one-item detail layout.
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                # (else branch keyword elided in this view)
                logger.info(f'Item {title} isn\'t interesting for article details page; skipped.')
                logger.debug(f'Added {count} items so far...')