Merge branch 'master' of ssh://wwwjail.house/usr/local/git/base/kiosk
[kiosk.git] / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
3 from abc import abstractmethod
4 import datetime
5 from dateutil.parser import parse
6 import http.client
7 import logging
8 import re
9 from typing import Dict, List, Optional, Union
10 import xml.etree.ElementTree as ET
11
12 from scottutilz import profanity_filter
13
14 import file_writer
15 import grab_bag
16 import renderer
17 import page_builder
18
19
# Module-level logger, named after this module per the stdlib logging convention.
logger = logging.getLogger(__name__)
21
22
class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Base renderer that turns an RSS feed into kiosk headline/details pages.

    Subclasses supply the feed-specific pieces (page prefixes, https choice,
    and optional find_*/munge_* overrides); this class handles fetching,
    filtering, and HTML blurb generation.
    """

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Create a renderer for one RSS source.

        Args:
            name_to_timeout_dict: periodic operation name -> timeout,
                passed through to the base renderer.
            feed_site: hostname serving the RSS feed(s).
            feed_uris: URI paths on that host to fetch.
            page_title: human-readable title used on rendered pages.
        """
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        # Pools of rendered HTML blurbs to sample from at render time:
        # short headline blurbs and longer article-details blurbs.
        self.news = grab_bag.grab_bag()
        self.details = grab_bag.grab_bag()
        # Used to drop items containing profanity when the subclass opts in.
        self.filter = profanity_filter.ProfanityFilter()
38
    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Return the output-filename prefix for the headlines page."""
        pass
42
    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Return the output-filename prefix for the article details page."""
        pass
46
47     def get_headlines_page_priority(self) -> str:
48         return "4"
49
50     def get_details_page_priority(self) -> str:
51         return "6"
52
    @abstractmethod
    def should_use_https(self) -> bool:
        """Return True to fetch the feed over HTTPS, False for plain HTTP."""
        pass
56
57     def should_profanity_filter(self) -> bool:
58         return False
59
60     def find_title(self, item: ET.Element) -> Optional[str]:
61         return item.findtext("title")
62
63     def munge_title(self, title: str, item: ET.Element) -> str:
64         return title
65
66     def find_description(self, item: ET.Element) -> Optional[str]:
67         return item.findtext("description")
68
69     def munge_description(self, description: str, item: ET.Element) -> str:
70         description = re.sub("<[^>]+>", "", description)
71         return description
72
73     def find_link(self, item: ET.Element) -> Optional[str]:
74         return item.findtext("link")
75
76     def munge_link(self, link: str) -> str:
77         return link
78
79     def find_image(self, item: ET.Element) -> Optional[str]:
80         return item.findtext("image")
81
82     def munge_image(self, image: str) -> str:
83         return image
84
85     def find_pubdate(self, item: ET.Element) -> Optional[str]:
86         return item.findtext("pubDate")
87
88     def munge_pubdate(self, pubdate: str) -> str:
89         return pubdate
90
91     def item_is_interesting_for_headlines(
92         self, title: str, description: str, item: ET.Element
93     ) -> bool:
94         return True
95
96     def do_headlines(self) -> bool:
97         return True
98
99     def do_details(self) -> bool:
100         return True
101
102     def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
103         pubdate = self.find_pubdate(item)
104         if pubdate is None:
105             return False
106         pubdatetime = parse(pubdate)
107         tzinfo = pubdatetime.tzinfo
108         now = datetime.datetime.now(tzinfo)
109         delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
110         return delta > n
111
112     def item_is_interesting_for_article(
113         self, title: str, description: str, item: ET.Element
114     ) -> bool:
115         return True
116
117     def periodic_render(self, key: str) -> bool:
118         if key == "Fetch News":
119             return self.fetch_news()
120         elif key == "Shuffle News":
121             return self.shuffle_news()
122         else:
123             raise Exception
124
125     def shuffle_news(self) -> bool:
126         if self.do_headlines():
127             headlines = page_builder.page_builder()
128             headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
129             headlines.set_title("%s" % self.page_title)
130             subset = self.news.subset(4)
131             if subset is None:
132                 logger.warning("Not enough messages to select from in shuffle_news?!")
133                 return False
134             for msg in subset:
135                 headlines.add_item(msg)
136             headlines.set_custom_html(
137                 """
138     <STYLE>
139     a:link {
140       color: black;
141       text-decoration: none;
142       font-weight: bold;
143     }
144     a:visited {
145       color: black;
146       text-decoration: none;
147       font-weight: bold;
148     }
149     a:active {
150       color: black;
151       text-decoration: none;
152       font-weight: bold;
153     }
154     </STYLE>"""
155             )
156             _ = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
157             with file_writer.file_writer(_) as f:
158                 headlines.render_html(f)
159
160         if self.do_details():
161             details = page_builder.page_builder()
162             details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
163             details.set_custom_html(
164                 """
165     <STYLE>
166     a:link {
167       color: black;
168       text-decoration: none;
169       font-weight: bold;
170     }
171     a:visited {
172       color: black;
173       text-decoration: none;
174       font-weight: bold;
175     }
176     a:active {
177       color: black;
178       text-decoration: none;
179       font-weight: bold;
180     }
181     </STYLE>"""
182             )
183             details.set_title(self.page_title)
184             subset = self.details.subset(1)
185             if subset is None:
186                 logger.warning("Not enough details to choose from in do_details")
187                 logger.debug("Not enough details to choose from.")
188                 return False
189             for msg in subset:
190                 blurb = msg
191                 blurb += "</TD>"
192                 details.add_item(blurb)
193             _ = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
194             with file_writer.file_writer(_) as g:
195                 details.render_html(g)
196         return True
197
198     def fetch_news(self) -> bool:
199         count = 0
200         self.news.clear()
201         self.details.clear()
202         self.conn: Optional[
203             Union[http.client.HTTPConnection, http.client.HTTPSConnection]
204         ] = None
205
206         for uri in self.feed_uris:
207             url = None
208             if self.should_use_https():
209                 url = f"https://{self.feed_site}{uri}"
210                 logger.info(f"Fetching: {url}")
211                 self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
212             else:
213                 url = f"http://{self.feed_site}{uri}"
214                 logger.info(f"Fetching: {url}")
215                 self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
216             assert self.conn is not None
217             assert url is not None
218             self.conn.request(
219                 "GET",
220                 uri,
221                 None,
222                 {
223                     "Accept": "*/*",
224                     "Cache-control": "max-age=50",
225                 },
226             )
227             try:
228                 response = self.conn.getresponse()
229             except Exception:
230                 logger.exception(
231                     f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
232                 )
233                 return False
234
235             if response.status != 200:
236                 logger.error(
237                     f"Unexpected status {response.status} while fetching {url}; giving up."
238                 )
239                 return False
240
241             raw = response.read()
242             logger.info(f"Status 200: got {len(raw)} bytes back from {url}")
243             rss = ET.fromstring(raw)
244             channel = rss[0]
245             title_filter = set()
246             for item in list(channel):
247                 title = self.find_title(item)
248                 description = item.findtext("description")
249                 if title is not None:
250                     title = self.munge_title(title, item)
251                 else:
252                     logger.info("Skipping RSS feed item with no title.")
253                     continue
254                 logger.debug(f"Considering RSS item {title}...")
255                 if description is not None:
256                     description = self.munge_description(description, item)
257                 else:
258                     description = ""
259                 image = self.find_image(item)
260                 if image is not None:
261                     image = self.munge_image(image)
262                 link = item.findtext("link")
263                 if link is not None:
264                     link = self.munge_link(link)
265                 if not self.item_is_interesting_for_headlines(title, description, item):
266                     logger.info(f"Skipping {title} because it's not interesting.")
267                     continue
268
269                 if self.should_profanity_filter() and (
270                     self.filter.contains_bad_word(title)
271                     or self.filter.contains_bad_word(description)
272                 ):
273                     logger.info(f"Skipping {title} because it contains profanity.")
274                     continue
275
276                 if title in title_filter:
277                     logger.info(
278                         f"Skipping {title} because we already saw an item with the same title."
279                     )
280                     continue
281                 title_filter.add(title)
282
283                 blurb = """<DIV style="padding:8px;
284                                 font-size:34pt;
285                                 -webkit-column-break-inside:avoid;">"""
286                 if image is not None:
287                     blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
288                     blurb += 'style="padding:8px;">'
289
290                 if link is None:
291                     blurb += f"<P><B>{title}</B>"
292                 else:
293                     blurb += f'<P><B><A HREF="{link}">{title}</A></B>'
294
295                 pubdate = self.find_pubdate(item)
296                 if pubdate is not None:
297                     logger.debug(f"Raw pubdate={pubdate}")
298                     pubdate = self.munge_pubdate(pubdate)
299                     ts = parse(pubdate)
300                     logger.debug(f"Translated pubdate into: {ts}")
301                     blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'
302
303                 if self.item_is_interesting_for_article(title, description, item):
304                     logger.info(
305                         f"Item {title} is also interesting as an article details page; creating..."
306                     )
307                     longblurb = blurb
308                     longblurb += "<BR>"
309                     longblurb += description
310                     longblurb += "</DIV>"
311                     longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
312                     self.details.add(longblurb)
313                 else:
314                     logger.info(
315                         f"Item {title} isn't interesting for article details page; skipped."
316                     )
317                 blurb += "</DIV>"
318                 self.news.add(blurb)
319                 count += 1
320                 logger.debug(f"Added {count} items so far...")
321         return count > 0