More changes related to running on new kiosk.house.
[kiosk.git] / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
3 from abc import abstractmethod
4 import datetime
5 from dateutil.parser import parse
6 import http.client
7 import random
8 import re
9 import sys
10 import traceback
11 from typing import Dict, List, Optional, Union
12 import xml.etree.ElementTree as ET
13
14 import file_writer
15 import grab_bag
16 import renderer
17 import page_builder
18 import profanity_filter
19
20
class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
    """Base renderer that turns RSS feeds into headline and detail pages.

    Subclasses supply the feed-specific pieces (file name prefixes, whether
    to use HTTPS, a debug prefix) and may override the ``find_*`` /
    ``munge_*`` / ``item_is_interesting_*`` hooks to customize how each
    feed item is extracted, rewritten and filtered.

    Two periodic keys are understood by :meth:`periodic_render`:
    ``"Fetch News"`` downloads and parses the feeds into two grab bags
    (headline blurbs and longer detail blurbs) and ``"Shuffle News"``
    writes pages built from a subset drawn from those bags.
    """

    # Link styling injected verbatim into both the headlines and details pages.
    _LINK_STYLE_HTML = """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""

    # Opening tag of every blurb.  The "font-size:34pt" text is load-bearing:
    # detail blurbs are derived by replacing it with a larger size.
    _BLURB_OPEN_HTML = """<DIV style="padding:8px;
                                font-size:34pt;
                                -webkit-column-break-inside:avoid;">"""

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Store the feed configuration and create empty item stores.

        Args:
            name_to_timeout_dict: passed through to the base renderer.
            feed_site: host name used to open the HTTP(S) connection.
            feed_uris: request paths on ``feed_site``, one per feed.
            page_title: title set on every generated page.
        """
        super().__init__(name_to_timeout_dict, False)
        self.debug = True
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()
        self.details = grab_bag.grab_bag()
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def debug_prefix(self) -> str:
        """Return the prefix used for this renderer's debug output."""
        pass

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Return the file name prefix of the headlines page."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Return the file name prefix of the details page."""
        pass

    def get_headlines_page_priority(self) -> str:
        """Return the priority field embedded in the headlines file name."""
        return "4"

    def get_details_page_priority(self) -> str:
        """Return the priority field embedded in the details file name."""
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """Return True to fetch the feeds over HTTPS instead of HTTP."""
        pass

    def should_profanity_filter(self) -> bool:
        """Return True to drop items whose title/description has bad words."""
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's ``title`` child, or None."""
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        """Hook to rewrite a title; the default returns it unchanged."""
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's ``description`` child, or None."""
        return item.findtext("description")

    def munge_description(
            self,
            description: str,
            item: ET.Element
    ) -> str:
        """Strip anything that looks like an HTML/XML tag from a description."""
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's ``link`` child, or None."""
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        """Hook to rewrite a link; the default returns it unchanged."""
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's ``image`` child, or None."""
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        """Hook to rewrite an image URL; the default returns it unchanged."""
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's ``pubDate`` child, or None."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        """Hook to rewrite a publication date; the default is a no-op."""
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Return True if the item may appear at all; default accepts all."""
        return True

    def do_headlines(self) -> bool:
        """Return True if the headlines page should be written."""
        return True

    def do_details(self) -> bool:
        """Return True if the details page should be written."""
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        """Return True if the item's pubDate is more than ``n`` days ago.

        Items without a pubDate are never considered old.  "Now" is taken in
        the parsed date's own tzinfo so naive and aware datetimes are never
        mixed in the subtraction.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        pubdatetime = parse(pubdate)
        now = datetime.datetime.now(pubdatetime.tzinfo)
        age_in_days = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return age_in_days > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Return True if the item may get a detail blurb; default: always."""
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic key to the matching operation.

        Raises:
            Exception: if ``key`` is neither "Fetch News" nor "Shuffle News".
        """
        if key == "Fetch News":
            return self.fetch_news()
        if key == "Shuffle News":
            return self.shuffle_news()
        raise Exception(f"Unexpected periodic_render key: {key}")

    def shuffle_news(self) -> bool:
        """Write the headlines and/or details pages from the stored items.

        Returns:
            False as soon as either grab bag cannot supply the requested
            subset (nothing further is written), True otherwise.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                self.debug_print("Not enough messages to choose from.")
                return False
            for msg in subset:
                headlines.add_item(msg)
            headlines.set_custom_html(self._LINK_STYLE_HTML)
            filename = (
                f"{self.get_headlines_page_prefix()}"
                f"_{self.get_headlines_page_priority()}_25900.html"
            )
            with file_writer.file_writer(filename) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(self._LINK_STYLE_HTML)
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                self.debug_print("Not enough details to choose from.")
                return False
            for msg in subset:
                # NOTE(review): the trailing </TD> has no matching opener in
                # this file; presumably the one-item layout expects it --
                # confirm against page_builder before changing.
                details.add_item(msg + "</TD>")
            filename = (
                f"{self.get_details_page_prefix()}"
                f"_{self.get_details_page_priority()}_86400.html"
            )
            with file_writer.file_writer(filename) as g:
                details.render_html(g)
        return True

    def fetch_news(self) -> bool:
        """Download every configured feed and rebuild both grab bags.

        Both bags are cleared first.  Any feed that cannot be fetched or
        parsed aborts the whole operation with False.

        Returns:
            True if at least one item was stored, False otherwise.
        """
        self.news.clear()
        self.details.clear()
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None

        count = 0
        for uri in self.feed_uris:
            payload = self._fetch_feed(uri)
            if payload is None:
                return False

            try:
                rss = ET.fromstring(payload)
            except ET.ParseError:
                traceback.print_exc(file=sys.stdout)
                print(
                    f"{self.page_title}: RSS fetch_news error, could not parse {self.feed_site}{uri}"
                )
                return False
            if len(rss) == 0:
                print(
                    f"{self.page_title}: RSS fetch_news error, empty document at {self.feed_site}{uri}"
                )
                return False

            channel = rss[0]
            seen_titles: set = set()
            # Iterate the element directly: Element.getchildren() was removed
            # in Python 3.9.  Non-item children simply have no title and are
            # rejected by _ingest_item.
            for item in channel:
                if self._ingest_item(item, seen_titles):
                    count += 1
        return count > 0

    def _fetch_feed(self, uri: str) -> Optional[bytes]:
        """Download one feed; return its body, or None after reporting an error."""
        if self.should_use_https():
            self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
            conn: Union[
                http.client.HTTPConnection, http.client.HTTPSConnection
            ] = http.client.HTTPSConnection(self.feed_site, timeout=10)
        else:
            self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
            conn = http.client.HTTPConnection(self.feed_site, timeout=10)
        self.conn = conn

        try:
            # request() is what actually connects, so it must be inside the
            # try for DNS / refused / timeout errors to be reported here.
            conn.request("GET", uri, None, {"Accept": "*/*"})
            response = conn.getresponse()
            body = response.read()
        except Exception:
            traceback.print_exc(file=sys.stdout)
            print(
                f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
            )
            return None
        finally:
            # One connection is opened per feed; release its socket.
            conn.close()

        if response.status != 200:
            print(
                f"{self.page_title}: RSS fetch_news error, response: {response.status}"
            )
            self.debug_print(str(body))
            return None
        return body

    def _ingest_item(self, item: ET.Element, seen_titles: set) -> bool:
        """Filter one feed element and store its blurbs; True if it was stored."""
        title = self.find_title(item)
        if title is not None:
            title = self.munge_title(title, item)

        description = self.find_description(item)
        if description is None:
            description = ""
        else:
            description = self.munge_description(description, item)

        image = self.find_image(item)
        if image is not None:
            image = self.munge_image(image)

        link = self.find_link(item)
        if link is not None:
            link = self.munge_link(link)

        if title is None or not self.item_is_interesting_for_headlines(
            title, description, item
        ):
            self.debug_print(f'Item "{title}" is not interesting')
            return False

        if self.should_profanity_filter() and (
            self.filter.contains_bad_word(title)
            or self.filter.contains_bad_word(description)
        ):
            self.debug_print(f'Found bad words in item "{title}"')
            return False

        if title in seen_titles:
            self.debug_print(f'Already saw title {title}, skipping.')
            return False
        seen_titles.add(title)

        blurb = self._BLURB_OPEN_HTML
        if image is not None:
            blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
            blurb += 'style="padding:8px;">'

        if link is None:
            blurb += f"<P><B>{title}</B>"
        else:
            blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

        pubdate = self.find_pubdate(item)
        if pubdate is not None:
            pubdate = self.munge_pubdate(pubdate)
            try:
                ts = parse(pubdate)
            except (ValueError, OverflowError):
                # One unparseable date should not sink the whole fetch;
                # keep the item and just omit the date stamp.
                self.debug_print(f'Could not parse pubDate "{pubdate}"')
            else:
                blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

        if self.item_is_interesting_for_article(title, description, item):
            longblurb = blurb + "<BR>" + description + "</DIV>"
            longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
            self.details.add(longblurb)
        blurb += "</DIV>"
        self.news.add(blurb)
        return True