Somewhat large overhaul to move the kiosk towards using normal python
[kiosk.git] / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
3 from abc import abstractmethod
4 import datetime
5 from dateutil.parser import parse
6 import http.client
7 import logging
8 import re
9 from typing import Dict, List, Optional, Union
10 import xml.etree.ElementTree as ET
11
12 import file_writer
13 import grab_bag
14 import renderer
15 import page_builder
16 import profanity_filter
17
18
# Use the module name (not the file path) so this logger participates in the
# normal logging hierarchy and per-module logging configuration.
logger = logging.getLogger(__name__)
20
21
class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Abstract renderer that fetches RSS feeds from one site and renders
    two kiosk pages: a four-item headlines page and a one-item article
    details page.

    Subclasses must implement get_headlines_page_prefix,
    get_details_page_prefix and should_use_https, and may override the
    find_* / munge_* / item_is_interesting_* hooks to customize how feed
    items are parsed and filtered.
    """

    # Inline stylesheet shared by both generated pages: links render as
    # plain bold black text (no underline / color change on a kiosk).
    _LINK_STYLE = """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Args:
        name_to_timeout_dict: operation name -> refresh period, passed
            through to the base renderer (drives periodic_render keys).
        feed_site: hostname of the RSS server, e.g. "www.example.com".
        feed_uris: list of URI paths on feed_site to fetch.
        page_title: title rendered on both generated pages.
        """
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()      # headline blurbs (HTML snippets)
        self.details = grab_bag.grab_bag()   # longer article blurbs
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Filename prefix for the generated headlines page."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Filename prefix for the generated article details page."""
        pass

    def get_headlines_page_priority(self) -> str:
        """Kiosk display priority embedded in the headlines filename."""
        return "4"

    def get_details_page_priority(self) -> str:
        """Kiosk display priority embedded in the details filename."""
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """True if the feed site should be fetched over HTTPS."""
        pass

    def should_profanity_filter(self) -> bool:
        """True if items containing profanity should be dropped."""
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        """Extract the raw title from a feed <item>; override to customize."""
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        """Hook to rewrite a title before display; default is identity."""
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        """Extract the raw description from a feed <item>."""
        return item.findtext("description")

    def munge_description(
            self,
            description: str,
            item: ET.Element
    ) -> str:
        """Default munge: strip embedded HTML tags from the description."""
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item: ET.Element) -> Optional[str]:
        """Extract the article link from a feed <item>."""
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        """Hook to rewrite an article link; default is identity."""
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        """Extract an image URL from a feed <item>, if any."""
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        """Hook to rewrite an image URL; default is identity."""
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        """Extract the publication date string from a feed <item>."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        """Hook to rewrite a pubdate string before parsing; default identity."""
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Filter hook: False drops the item from the headlines page."""
        return True

    def do_headlines(self) -> bool:
        """False disables rendering of the headlines page."""
        return True

    def do_details(self) -> bool:
        """False disables rendering of the details page."""
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        """True if the item's pubDate is more than n days in the past.

        Items without a pubDate are treated as not-old (False).  The
        comparison is done in the pubdate's own timezone so naive and
        aware datetimes are never mixed.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        pubdatetime = parse(pubdate)
        tzinfo = pubdatetime.tzinfo
        now = datetime.datetime.now(tzinfo)
        delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return delta > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Filter hook: False drops the item from the details page."""
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic operation by key (from name_to_timeout_dict).

        Raises:
            ValueError: if key is not a recognized operation.
        """
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            # ValueError subclasses Exception, so existing catchers still work.
            raise ValueError(f'Unexpected operation key: {key}')

    def shuffle_news(self) -> bool:
        """Re-render the headlines and details pages from cached items.

        Returns False when there aren't enough cached items to build a
        page; True on success.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                logger.warning('Not enough messages to select from in shuffle_news?!')
                return False
            for msg in subset:
                headlines.add_item(msg)
            headlines.set_custom_html(self._LINK_STYLE)
            filename = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
            with file_writer.file_writer(filename) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(self._LINK_STYLE)
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                logger.warning('Not enough details to choose from in do_details')
                return False
            for msg in subset:
                details.add_item(msg + "</TD>")
            filename = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
            with file_writer.file_writer(filename) as g:
                details.render_html(g)
        return True

    def _fetch_rss(self, uri: str) -> Optional[ET.Element]:
        """Fetch one feed URI and return the parsed XML root, or None on
        any HTTP failure (which is logged).

        The HTTP(S) connection is always closed before returning (the
        previous code leaked one connection per feed URI).
        """
        if self.should_use_https():
            url = f'https://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
        else:
            url = f'http://{self.feed_site}{uri}'
            logger.info(f'Fetching: {url}')
            self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
        try:
            self.conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=50",
                },
            )
            try:
                response = self.conn.getresponse()
            except Exception as e:
                logger.exception(e)
                logger.error(
                    f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
                )
                return None
            if response.status != 200:
                logger.error(
                    f'Unexpected status {response.status} while fetching {url}; giving up.'
                )
                return None
            raw = response.read()
            logger.info(f'Status 200: got {len(raw)} bytes back from {url}')
            return ET.fromstring(raw)
        finally:
            self.conn.close()

    def fetch_news(self) -> bool:
        """Fetch all feeds and repopulate self.news / self.details with
        rendered HTML blurbs.

        Returns True if at least one interesting item was collected;
        False on the first hard HTTP failure.
        """
        count = 0
        self.news.clear()
        self.details.clear()
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None

        # Dedupe by title across ALL feed URIs (the set used to be rebuilt
        # per-feed, letting the same headline through twice).
        title_filter = set()
        for uri in self.feed_uris:
            rss = self._fetch_rss(uri)
            if rss is None:
                return False
            channel = rss[0]
            for item in list(channel):
                title = self.find_title(item)
                if title is None:
                    logger.info('Skipping RSS feed item with no title.')
                    continue
                title = self.munge_title(title, item)
                logger.debug(f'Considering RSS item {title}...')

                # Use the overridable hooks here (this used to call
                # item.findtext directly, silently ignoring subclass
                # find_description / find_link overrides).
                description = self.find_description(item)
                if description is not None:
                    description = self.munge_description(description, item)
                else:
                    description = ""
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)

                if not self.item_is_interesting_for_headlines(
                        title, description, item
                ):
                    logger.info(f'Skipping {title} because it\'s not interesting.')
                    continue

                if self.should_profanity_filter() and (
                    self.filter.contains_bad_word(title)
                    or self.filter.contains_bad_word(description)
                ):
                    logger.info(f'Skipping {title} because it contains profanity.')
                    continue

                if title in title_filter:
                    logger.info(f'Skipping {title} because we already saw an item with the same title.')
                    continue
                title_filter.add(title)

                blurb = """<DIV style="padding:8px;
                                font-size:34pt;
                                -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                    blurb += 'style="padding:8px;">'

                if link is None:
                    blurb += f"<P><B>{title}</B>"
                else:
                    blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    logger.debug(f'Raw pubdate={pubdate}')
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    logger.debug(f'Translated pubdate into: {ts}')
                    blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

                if self.item_is_interesting_for_article(title, description, item):
                    logger.info(f'Item {title} is also interesting as an article details page; creating...')
                    # Details page reuses the headline blurb with the full
                    # description appended and a larger font.
                    longblurb = blurb
                    longblurb += "<BR>"
                    longblurb += description
                    longblurb += "</DIV>"
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                else:
                    logger.info(f'Item {title} isn\'t interesting for article details page; skipped.')
                blurb += "</DIV>"
                self.news.add(blurb)
                count += 1
                logger.debug(f'Added {count} items so far...')
        return count > 0