Commit: Use pyutils logging prepend-message functionality.
Repository / file: kiosk.git / generic_news_rss_renderer.py
1 #!/usr/bin/env python3
2
3 from abc import abstractmethod
4 import datetime
5 from dateutil.parser import parse
6 import http.client
7 import logging
8 import re
9 from typing import Dict, List, Optional, Union
10 import xml.etree.ElementTree as ET
11
12 from scottutilz import profanity_filter
13
14 import file_writer
15 import grab_bag
16 import renderer
17 import page_builder
18
19
# Module-level logger.  Use __name__ (not __file__): __file__ yields a
# filesystem path as the logger name, which breaks the standard dotted
# logger hierarchy and any per-module logging configuration.
logger = logging.getLogger(__name__)
21
22
class generic_news_rss_renderer(renderer.abstaining_renderer):
    """Base renderer that periodically fetches one or more RSS feeds from a
    single site and renders a rotating four-headline page plus a single
    article-details page.

    Subclasses adapt to a particular feed's quirks by overriding the
    find_* / munge_* / *_is_interesting_* hook methods below.
    """

    def __init__(
        self,
        name_to_timeout_dict: Dict[str, int],
        feed_site: str,
        feed_uris: List[str],
        page_title: str,
    ):
        """
        Args:
            name_to_timeout_dict: periodic job key -> timeout mapping,
                passed through to the base renderer.
            feed_site: hostname serving the RSS feed(s).
            feed_uris: URI paths on feed_site to fetch.
            page_title: title shown on the generated pages.
        """
        super().__init__(name_to_timeout_dict)
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()      # cached headline HTML blurbs
        self.details = grab_bag.grab_bag()   # cached article-details HTML blurbs
        self.filter = profanity_filter.ProfanityFilter()

    @abstractmethod
    def get_headlines_page_prefix(self) -> str:
        """Filename prefix of the generated headlines page."""
        pass

    @abstractmethod
    def get_details_page_prefix(self) -> str:
        """Filename prefix of the generated article-details page."""
        pass

    def get_headlines_page_priority(self) -> str:
        """Priority component embedded in the headlines page filename."""
        return "4"

    def get_details_page_priority(self) -> str:
        """Priority component embedded in the details page filename."""
        return "6"

    @abstractmethod
    def should_use_https(self) -> bool:
        """True to fetch feeds via HTTPS, False for plain HTTP."""
        pass

    def should_profanity_filter(self) -> bool:
        """True to drop items whose title or description contain profanity."""
        return False

    def find_title(self, item: ET.Element) -> Optional[str]:
        """Hook: extract the raw title text from a feed item, if present."""
        return item.findtext("title")

    def munge_title(self, title: str, item: ET.Element) -> str:
        """Hook: post-process a title.  Default is a no-op."""
        return title

    def find_description(self, item: ET.Element) -> Optional[str]:
        """Hook: extract the raw description from a feed item, if present."""
        return item.findtext("description")

    def munge_description(
            self,
            description: str,
            item: ET.Element
    ) -> str:
        """Hook: post-process a description.  Default strips HTML tags."""
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item: ET.Element) -> Optional[str]:
        """Hook: extract the item's link URL, if present."""
        return item.findtext("link")

    def munge_link(self, link: str) -> str:
        """Hook: post-process a link URL.  Default is a no-op."""
        return link

    def find_image(self, item: ET.Element) -> Optional[str]:
        """Hook: extract the item's image URL, if present."""
        return item.findtext("image")

    def munge_image(self, image: str) -> str:
        """Hook: post-process an image URL.  Default is a no-op."""
        return image

    def find_pubdate(self, item: ET.Element) -> Optional[str]:
        """Hook: extract the item's publication date string, if present."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate: str) -> str:
        """Hook: post-process a pubDate string.  Default is a no-op."""
        return pubdate

    def item_is_interesting_for_headlines(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Hook: should this item appear on the headlines page?"""
        return True

    def do_headlines(self) -> bool:
        """Hook: should a headlines page be generated at all?"""
        return True

    def do_details(self) -> bool:
        """Hook: should an article-details page be generated at all?"""
        return True

    def is_item_older_than_n_days(self, item: ET.Element, n: int) -> bool:
        """Return True iff the item's pubDate is more than n days old.

        Items with no pubDate are treated as not-old (False).
        """
        pubdate = self.find_pubdate(item)
        if pubdate is None:
            return False
        pubdatetime = parse(pubdate)
        # Take "now" in the pubdate's own timezone so we never subtract a
        # timezone-aware datetime from a naive one (or vice versa).
        tzinfo = pubdatetime.tzinfo
        now = datetime.datetime.now(tzinfo)
        delta = (now - pubdatetime).total_seconds() / (60 * 60 * 24)
        return delta > n

    def item_is_interesting_for_article(
        self, title: str, description: str, item: ET.Element
    ) -> bool:
        """Hook: should this item get a full article-details page?"""
        return True

    def periodic_render(self, key: str) -> bool:
        """Dispatch a periodic job by key.

        Raises:
            Exception: if key is not an operation this renderer knows.
        """
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            # Fixed: include the offending key (was a bare `raise Exception`
            # that made failures undiagnosable).
            raise Exception(f"Unexpected operation key: {key}")

    def shuffle_news(self) -> bool:
        """Regenerate the headlines and details pages from cached items.

        Returns:
            True on success; False when the caches don't hold enough items
            to populate a page.
        """
        if self.do_headlines():
            headlines = page_builder.page_builder()
            headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
            headlines.set_title(self.page_title)
            subset = self.news.subset(4)
            if subset is None:
                logger.warning('Not enough messages to select from in shuffle_news?!')
                return False
            for msg in subset:
                headlines.add_item(msg)
            headlines.set_custom_html(
                """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""
            )
            headlines_file = f"{self.get_headlines_page_prefix()}_{self.get_headlines_page_priority()}_25900.html"
            with file_writer.file_writer(headlines_file) as f:
                headlines.render_html(f)

        if self.do_details():
            details = page_builder.page_builder()
            details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
            details.set_custom_html(
                """
    <STYLE>
    a:link {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:visited {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    a:active {
      color: black;
      text-decoration: none;
      font-weight: bold;
    }
    </STYLE>"""
            )
            details.set_title(self.page_title)
            subset = self.details.subset(1)
            if subset is None:
                logger.warning('Not enough details to choose from in do_details')
                return False
            for msg in subset:
                blurb = msg
                blurb += "</TD>"
                details.add_item(blurb)
            details_file = f"{self.get_details_page_prefix()}_{self.get_details_page_priority()}_86400.html"
            with file_writer.file_writer(details_file) as g:
                details.render_html(g)
        return True

    def fetch_news(self) -> bool:
        """Fetch and parse every configured feed, repopulating self.news
        and self.details with rendered HTML blurbs.

        Returns:
            True iff at least one interesting item was collected.
        """
        count = 0
        self.news.clear()
        self.details.clear()
        self.conn: Optional[Union[http.client.HTTPConnection,
                                  http.client.HTTPSConnection]] = None

        # Fixed: dedupe titles across *all* feed URIs, not per-URI; the
        # filter used to be recreated inside the loop, so the same headline
        # appearing in two feeds from this site was added twice.
        title_filter = set()

        for uri in self.feed_uris:
            url = None
            if self.should_use_https():
                url = f'https://{self.feed_site}{uri}'
                logger.info(f'Fetching: {url}')
                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=10)
            else:
                url = f'http://{self.feed_site}{uri}'
                logger.info(f'Fetching: {url}')
                self.conn = http.client.HTTPConnection(self.feed_site, timeout=10)
            assert self.conn is not None
            assert url is not None
            try:
                self.conn.request(
                    "GET",
                    uri,
                    None,
                    {
                        "Accept": "*/*",
                        "Cache-control": "max-age=50",
                    },
                )
                try:
                    response = self.conn.getresponse()
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        f"Exception in generic RSS renderer HTTP connection fetching {url}; giving up."
                    )
                    return False

                if response.status != 200:
                    logger.error(
                        f'Unexpected status {response.status} while fetching {url}; giving up.'
                    )
                    return False

                raw = response.read()
                logger.info(f'Status 200: got {len(raw)} bytes back from {url}')
            finally:
                # Fixed: always release the socket.  Connections were
                # previously never closed, leaking one per URI per fetch.
                self.conn.close()

            rss = ET.fromstring(raw)
            channel = rss[0]
            for item in list(channel):
                title = self.find_title(item)
                # Fixed: use the find_description() hook; the direct
                # item.findtext("description") call silently ignored
                # subclass overrides of that hook.
                description = self.find_description(item)
                if title is not None:
                    title = self.munge_title(title, item)
                else:
                    logger.info('Skipping RSS feed item with no title.')
                    continue
                logger.debug(f'Considering RSS item {title}...')
                if description is not None:
                    description = self.munge_description(description, item)
                else:
                    description = ""
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                # Fixed: use the find_link() hook for the same reason as
                # find_description() above.
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)
                if not self.item_is_interesting_for_headlines(
                        title, description, item
                ):
                    logger.info(f'Skipping {title} because it\'s not interesting.')
                    continue

                if self.should_profanity_filter() and (
                    self.filter.contains_bad_word(title)
                    or self.filter.contains_bad_word(description)
                ):
                    logger.info(f'Skipping {title} because it contains profanity.')
                    continue

                if title in title_filter:
                    logger.info(f'Skipping {title} because we already saw an item with the same title.')
                    continue
                title_filter.add(title)

                # Build the headline blurb: optional image, linked title,
                # optional abbreviated pubdate.
                blurb = """<DIV style="padding:8px;
                                font-size:34pt;
                                -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += f'<IMG SRC="{image}" ALIGN=LEFT HEIGHT=115 '
                    blurb += 'style="padding:8px;">'

                if link is None:
                    blurb += f"<P><B>{title}</B>"
                else:
                    blurb += f'<P><B><A HREF="{link}">{title}</A></B>'

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    logger.debug(f'Raw pubdate={pubdate}')
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    logger.debug(f'Translated pubdate into: {ts}')
                    blurb += f'  <FONT COLOR=#cccccc>{ts.strftime("%b&nbsp;%d")}</FONT>'

                if self.item_is_interesting_for_article(title, description, item):
                    logger.info(f'Item {title} is also interesting as an article details page; creating...')
                    # The details page reuses the headline blurb with the
                    # description appended and a larger font.
                    longblurb = blurb
                    longblurb += "<BR>"
                    longblurb += description
                    longblurb += "</DIV>"
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)
                else:
                    logger.info(f'Item {title} isn\'t interesting for article details page; skipped.')
                blurb += "</DIV>"
                self.news.add(blurb)
                count += 1
                logger.debug(f'Added {count} items so far...')
        return count > 0