e6d45335e47f87750fd17441f5f1d11af78065c9
[kiosk.git] / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
from abc import abstractmethod
import datetime
import http.client
import random
import re
import sys
import traceback
from typing import Dict, List, Optional, Union
import xml.etree.ElementTree as ET

from dateutil.parser import parse

import file_writer
import grab_bag
import page_builder
import profanity_filter
import renderer
17
18
class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
    """Base renderer that turns RSS feeds into headline and detail pages.

    The renderer reacts to two periodic keys (see periodic_render):

    * "Fetch News" downloads every URI in feed_uris from feed_site, walks the
      children of the first element under the XML root and stores one HTML
      blurb per accepted item in self.news (short form) and, optionally,
      self.details (short form plus description).
    * "Shuffle News" asks those containers for a subset of blurbs and writes
      them out through page_builder / file_writer.

    Subclasses must implement debug_prefix, get_headlines_page_prefix,
    get_details_page_prefix and should_use_https, and may override the
    find_* / munge_* / item_is_interesting_* hooks to adapt to a feed.

    NOTE(review): titles, links, image URLs and descriptions coming from the
    feed are interpolated into HTML without escaping (descriptions only have
    tags stripped by the default munge_description).  That is only safe for
    trusted feeds; confirm before pointing this at arbitrary sources.
    """

    # Inline stylesheet shared by the headlines page and the details page.
    _LINK_STYLE_HTML = """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """Store the feed configuration and create empty blurb containers.

        Args:
            name_to_timeout_dict: forwarded unchanged to the base class
                (presumably maps periodic keys to intervals -- TODO confirm
                against renderer.debuggable_abstaining_renderer).
            feed_site: host name used for the HTTP(S) connection.
            feed_uris: request paths fetched from feed_site, one per feed.
            page_title: title set on the generated pages and used in error
                messages.
        """
        super().__init__(name_to_timeout_dict, False)
        self.debug = True
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()
        self.details = grab_bag.grab_bag()
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def debug_prefix(self) -> str:
        """Return the debug prefix string for this renderer (subclass hook)."""
        pass

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Return the leading part of the headlines page file name."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Return the leading part of the details page file name."""
        pass

    def get_headlines_page_priority(self) -> str:
        """Return the priority field embedded in the headlines file name."""
        return "4"

    def get_details_page_priority(self) -> str:
        """Return the priority field embedded in the details file name."""
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """Return True to fetch feeds over HTTPS, False for plain HTTP."""
        pass

    def should_profanity_filter(self) -> bool:
        """Return True to drop items whose title or description has bad words."""
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's <title> child, or None if absent."""
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        """Hook to rewrite a title before use; the default returns it as is."""
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's <description> child, or None."""
        return item.findtext("description")

    def munge_description(
            self,
            description: str,
            item: ET.Element
    ) -> str:
        """Return the description with anything that looks like a tag removed."""
        return re.sub(r"<[^>]+>", "", description)

    def find_link(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's <link> child, or None if absent."""
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        """Hook to rewrite a link before use; the default returns it as is."""
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's <image> child, or None if absent."""
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        """Hook to rewrite an image URL; the default returns it as is."""
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        """Return the text of the item's <pubDate> child, or None if absent."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        """Hook to rewrite a pubDate string; the default returns it as is."""
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Return True if the item may appear on the headlines page."""
        return True

    def do_headlines(self) -> bool:
        """Return True if shuffle_news should write the headlines page."""
        return True

    def do_details(self) -> bool:
        """Return True if shuffle_news should write the details page."""
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        """Return True if the item's pubDate is more than n days in the past.

        Items with no pubDate, or with one that cannot be parsed, are
        reported as not old (False) so that a bad date never raises here.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        try:
            pubdatetime = parse(pubdate)
        except (ValueError, OverflowError):
            # dateutil signals unparseable input with ValueError (or its
            # ParserError subclass) and out-of-range values with OverflowError.
            return False
        # Use the timestamp's own tzinfo (possibly None) so the subtraction
        # never mixes naive and aware datetimes.
        now = datetime.datetime.now(pubdatetime.tzinfo)
        age_in_days = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return age_in_days > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Return True if the item may also be stored as a detail article."""
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic key to fetch_news or shuffle_news.

        Raises:
            Exception: if key is neither "Fetch News" nor "Shuffle News".
        """
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            raise Exception(f"Unexpected periodic render key: {key!r}")

    def shuffle_news(self) -> bool:
        """Write the headlines and/or details pages from the stored blurbs.

        Returns:
            False as soon as one of the enabled pages cannot get a subset of
            blurbs from its container (subset() returned None); True
            otherwise.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                self.debug_print("Not enough messages to choose from.")
                return False
            for msg in subset:
                headlines.add_item(msg)
            headlines.set_custom_html(self._LINK_STYLE_HTML)
            # The trailing number is part of the file naming scheme consumed
            # elsewhere (presumably a lifetime in seconds -- TODO confirm).
            filename = (
                f"{self.get_headlines_page_prefix()}"
                f"_{self.get_headlines_page_priority()}_25900.html"
            )
            with file_writer.file_writer(filename) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(self._LINK_STYLE_HTML)
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                self.debug_print("Not enough details to choose from.")
                return False
            for msg in subset:
                # The closing </TD> is appended exactly as before; it
                # presumably pairs with markup emitted by page_builder.
                details.add_item(msg + "</TD>")
            filename = (
                f"{self.get_details_page_prefix()}"
                f"_{self.get_details_page_priority()}_86400.html"
            )
            with file_writer.file_writer(filename) as g:
                details.render_html(g)
        return True

    def fetch_news(self) -> bool:
        """Re-download every feed and rebuild self.news and self.details.

        Both containers are cleared first.  The first feed that cannot be
        fetched or parsed aborts the whole run with False.

        Returns:
            True if at least one item was stored, False otherwise.
        """
        count = 0
        self.news.clear()
        self.details.clear()
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None

        for uri in self.feed_uris:
            body = self._fetch_feed(uri)
            if body is None:
                return False

            try:
                rss = ET.fromstring(body)
            except ET.ParseError:
                traceback.print_exc(file=sys.stdout)
                print(
                    f"{self.page_title}: RSS fetch_news error, unparseable XML from {self.feed_site}{uri}"
                )
                return False
            if len(rss) == 0:
                print(
                    f"{self.page_title}: RSS fetch_news error, empty document from {self.feed_site}{uri}"
                )
                return False

            # Every child of the first element under the root is offered to
            # the hooks; children without a title are rejected in
            # _ingest_item.  Iterating the element directly replaces
            # Element.getchildren(), which was removed in Python 3.9.
            channel = rss[0]
            seen_titles: set = set()  # de-duplicates titles within one feed
            for item in channel:
                if self._ingest_item(item, seen_titles):
                    count += 1
        return count > 0

    def _fetch_feed(self, uri: str) -> Optional[bytes]:
        """GET uri from feed_site; return the body, or None on any failure."""
        use_https = self.should_use_https()
        scheme = "https" if use_https else "http"
        self.debug_print(f"Fetching: {scheme}://{self.feed_site}{uri}")

        conn: Union[http.client.HTTPConnection, http.client.HTTPSConnection]
        if use_https:
            conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
        else:
            conn = http.client.HTTPConnection(self.feed_site, timeout=20)
        self.conn = conn
        try:
            # request() performs the actual connect, so it has to be inside
            # the try block together with getresponse() and read().
            conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=59",
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
                },
            )
            response = conn.getresponse()
            status = response.status
            body = response.read()
        except Exception:
            traceback.print_exc(file=sys.stdout)
            print(
                f"Exception in generic RSS renderer HTTP connection fetching {self.feed_site}{uri}"
            )
            return None
        finally:
            conn.close()

        if status != 200:
            print(
                f"{self.page_title}: RSS fetch_news error, response: {status}"
            )
            self.debug_print(str(body))
            return None
        return body

    def _ingest_item(self, item: ET.Element, seen_titles: set) -> bool:
        """Filter one feed element and store its blurb(s); True if stored."""
        title = self.find_title(item)
        if title is not None:
            title = self.munge_title(title, item)

        # Go through the find_* hooks (not item.findtext) so that subclass
        # overrides are honoured for every field, not only title and image.
        description = self.find_description(item)
        if description is not None:
            description = self.munge_description(description, item)
        else:
            description = ""

        image = self.find_image(item)
        if image is not None:
            image = self.munge_image(image)

        link = self.find_link(item)
        if link is not None:
            link = self.munge_link(link)

        if title is None or not self.item_is_interesting_for_headlines(
            title, description, item
        ):
            self.debug_print(f'Item "{title}" is not interesting')
            return False

        if self.should_profanity_filter() and (
            self.filter.contains_bad_word(title)
            or self.filter.contains_bad_word(description)
        ):
            self.debug_print(f'Found bad words in item "{title}"')
            return False

        if title in seen_titles:
            self.debug_print(f'Already saw title {title}, skipping.')
            return False
        seen_titles.add(title)

        blurb = """<DIV style="padding:8px;
                                font-size:34pt;
                                -webkit-column-break-inside:avoid;">"""
        if image is not None:
            blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
            blurb += 'style="padding:8px;">'

        if link is None:
            blurb += f"<P><B>{title}</B>"
        else:
            blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

        pubdate = self.find_pubdate(item)
        if pubdate is not None:
            pubdate = self.munge_pubdate(pubdate)
            try:
                ts = parse(pubdate)
            except (ValueError, OverflowError):
                # A malformed date only costs the item its date stamp.
                self.debug_print(
                    f'Ignoring unparseable pubdate "{pubdate}" on item "{title}"'
                )
            else:
                blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

        if self.item_is_interesting_for_article(title, description, item):
            # The detail version is the same blurb in a larger font with the
            # description appended.
            longblurb = blurb + "<BR>" + description + "</DIV>"
            longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
            self.details.add(longblurb)

        blurb += "</DIV>"
        self.news.add(blurb)
        return True