3bc5f1be147026b7cac5f95eddfc569951f6e506
[kiosk.git] / generic_news_rss_renderer.py
1 import datetime
2 from dateutil.parser import parse
3 import file_writer
4 import grab_bag
5 import renderer
6 import http.client
7 import page_builder
8 import profanity_filter
9 import random
10 import re
11 import xml.etree.ElementTree as ET
12
13
class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
    """Shared scaffolding for kiosk pages driven by an RSS news feed.

    Subclasses customize behavior by overriding the find_*/munge_* hooks
    (title, description, link, image, pubDate), the page prefix/priority
    accessors, and the item_is_interesting_* predicates.  Two periodic
    operations are exposed through periodic_render():

      * "Fetch News"   -- fetch and parse the configured feed URIs,
                          accumulating headline and detail blurbs.
      * "Shuffle News" -- render a random subset of the accumulated
                          blurbs to the headlines and details HTML pages.
    """

    # CSS injected into both generated pages: hyperlinks render as plain
    # bold black text (no underline, no visited/active coloring).
    link_style_css = """
<STYLE>
a:link {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:visited {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
a:active {
  color: black;
  text-decoration: none;
  font-weight: bold;
}
</STYLE>"""

    def __init__(self, name_to_timeout_dict, feed_site, feed_uris, page_title):
        """Create a renderer for one feed site.

        Args:
            name_to_timeout_dict: operation-name -> timeout mapping,
                forwarded to the debuggable_abstaining_renderer base.
            feed_site: hostname of the RSS server (no scheme).
            feed_uris: list of URI paths to fetch from feed_site.
            page_title: human-readable title for the generated pages.
        """
        super().__init__(name_to_timeout_dict, False)
        self.debug = 1
        self.feed_site = feed_site
        self.feed_uris = feed_uris
        self.page_title = page_title
        self.news = grab_bag.grab_bag()      # headline blurbs
        self.details = grab_bag.grab_bag()   # long-form article blurbs
        self.filter = profanity_filter.profanity_filter()

    def debug_prefix(self):
        """Subclass hook: prefix for debug messages."""
        pass

    def get_headlines_page_prefix(self):
        """Subclass hook: filename prefix for the headlines page."""
        pass

    def get_details_page_prefix(self):
        """Subclass hook: filename prefix for the details page."""
        pass

    def get_headlines_page_priority(self):
        """Priority string embedded in the headlines page filename."""
        return "4"

    def get_details_page_priority(self):
        """Priority string embedded in the details page filename."""
        return "6"

    def should_use_https(self):
        """Subclass hook: truthy to fetch feeds over HTTPS, else HTTP."""
        pass

    def should_profanity_filter(self):
        """Subclass hook: truthy to drop items containing profanity."""
        return False

    def find_title(self, item):
        """Extract the title text from a feed <item> element."""
        return item.findtext("title")

    def munge_title(self, title):
        """Subclass hook: post-process an extracted title."""
        return title

    def find_description(self, item):
        """Extract the description text from a feed <item> element."""
        return item.findtext("description")

    def munge_description(self, description):
        """Strip embedded HTML tags from a description."""
        description = re.sub("<[^>]+>", "", description)
        return description

    def find_link(self, item):
        """Extract the article link from a feed <item> element."""
        return item.findtext("link")

    def munge_link(self, link):
        """Subclass hook: post-process an extracted link."""
        return link

    def find_image(self, item):
        """Extract the image URL from a feed <item> element."""
        return item.findtext("image")

    def munge_image(self, image):
        """Subclass hook: post-process an extracted image URL."""
        return image

    def find_pubdate(self, item):
        """Extract the pubDate text from a feed <item> element."""
        return item.findtext("pubDate")

    def munge_pubdate(self, pubdate):
        """Subclass hook: post-process an extracted pubDate string."""
        return pubdate

    def item_is_interesting_for_headlines(self, title, description, item):
        """Subclass hook: filter items out of the headlines page."""
        return True

    def is_item_older_than_n_days(self, item, n):
        """Return True if item's pubDate is more than n days in the past.

        Items without a parseable pubDate are considered NOT old.
        """
        pubdate = self.find_pubdate(item)
        if pubdate is not None:
            pubdate = parse(pubdate)
            # Compare in the feed's own timezone so aware/naive
            # datetimes are never mixed.
            tzinfo = pubdate.tzinfo
            now = datetime.datetime.now(tzinfo)
            delta = (now - pubdate).total_seconds() / (60 * 60 * 24)
            if delta > n:
                return True
        return False

    def item_is_interesting_for_article(self, title, description, item):
        """Subclass hook: filter items out of the details page."""
        return True

    def periodic_render(self, key):
        """Dispatch a named periodic operation.

        Raises:
            ValueError: if key is not a known operation.
        """
        if key == "Fetch News":
            return self.fetch_news()
        elif key == "Shuffle News":
            return self.shuffle_news()
        else:
            # Was `raise error(...)`: `error` is undefined and produced a
            # NameError; raise a real exception type instead.
            raise ValueError("Unexpected operation")

    def shuffle_news(self):
        """Render random subsets of collected blurbs to the HTML pages.

        Writes a four-item headlines page and a one-item details page.
        Returns False (without writing) when too few blurbs are cached.
        """
        headlines = page_builder.page_builder()
        headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
        headlines.set_title("%s" % self.page_title)
        subset = self.news.subset(4)
        if subset is None:
            self.debug_print("Not enough messages to choose from.")
            return False
        for msg in subset:
            headlines.add_item(msg)
        headlines.set_custom_html(self.link_style_css)
        f = file_writer.file_writer(
            "%s_%s_25900.html"
            % (self.get_headlines_page_prefix(), self.get_headlines_page_priority())
        )
        headlines.render_html(f)
        f.close()

        details = page_builder.page_builder()
        details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
        details.set_custom_html(self.link_style_css)
        details.set_title("%s" % self.page_title)
        subset = self.details.subset(1)
        if subset is None:
            self.debug_print("Not enough details to choose from.")
            return False
        for msg in subset:
            blurb = msg
            blurb += u"</TD>"
            details.add_item(blurb)
        g = file_writer.file_writer(
            "%s_%s_86400.html"
            % (self.get_details_page_prefix(), self.get_details_page_priority())
        )
        details.render_html(g)
        g.close()
        return True

    def fetch_news(self):
        """Fetch every configured feed URI and rebuild the blurb caches.

        Returns True if at least one interesting item was collected,
        False on any HTTP failure.
        """
        count = 0
        self.news.clear()
        self.details.clear()

        for uri in self.feed_uris:
            if self.should_use_https():
                self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
                self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
            else:
                self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
                self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
            self.conn.request(
                "GET",
                uri,
                None,
                {
                    "Accept": "*/*",
                    "Cache-control": "max-age=59",
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
                },
            )
            try:
                response = self.conn.getresponse()
            except Exception:  # was a bare except: don't swallow KeyboardInterrupt et al.
                print("Exception in generic RSS renderer HTTP connection")
                return False

            if response.status != 200:
                print(
                    (
                        "%s: RSS fetch_news error, response: %d"
                        % (self.page_title, response.status)
                    )
                )
                self.debug_print(response.read())
                self.conn.close()
                return False

            raw = response.read()
            self.conn.close()  # don't leak a socket per feed URI
            rss = ET.fromstring(raw)
            channel = rss[0]
            # Element.getchildren() was removed in Python 3.9; iterating
            # the element directly yields the same children.
            for item in channel:
                title = self.find_title(item)
                if title is not None:
                    title = self.munge_title(title)
                # Go through the overridable hooks consistently; the
                # original called item.findtext() directly here, which
                # silently ignored subclass find_description/find_link
                # overrides.
                description = self.find_description(item)
                if description is not None:
                    description = self.munge_description(description)
                image = self.find_image(item)
                if image is not None:
                    image = self.munge_image(image)
                link = self.find_link(item)
                if link is not None:
                    link = self.munge_link(link)

                if title is None or not self.item_is_interesting_for_headlines(
                    title, description, item
                ):
                    self.debug_print('Item "%s" is not interesting' % title)
                    continue

                if self.should_profanity_filter() and (
                    self.filter.contains_bad_words(title)
                    or self.filter.contains_bad_words(description)
                ):
                    self.debug_print('Found bad words in item "%s"' % title)
                    continue

                # Build the headline blurb: optional thumbnail, linked
                # title, and (when available) a dim publication date.
                blurb = u"""<DIV style="padding:8px;
                                 font-size:34pt;
                                 -webkit-column-break-inside:avoid;">"""
                if image is not None:
                    blurb += u'<IMG SRC="%s" ALIGN=LEFT HEIGHT=115 ' % image
                    blurb += u'style="padding:8px;">'

                if link is None:
                    blurb += u"<P><B>%s</B>" % title
                else:
                    blurb += u'<P><B><A HREF="%s">%s</A></B>' % (link, title)

                pubdate = self.find_pubdate(item)
                if pubdate is not None:
                    pubdate = self.munge_pubdate(pubdate)
                    ts = parse(pubdate)
                    blurb += u"  <FONT COLOR=#cccccc>%s</FONT>" % (
                        ts.strftime("%b&nbsp;%d")
                    )

                # The details page reuses the headline blurb with the
                # description appended and a larger font.
                if description is not None and self.item_is_interesting_for_article(
                    title, description, item
                ):
                    longblurb = blurb

                    longblurb += u"<BR>"
                    longblurb += description
                    longblurb += u"</DIV>"
                    longblurb = longblurb.replace("font-size:34pt", "font-size:44pt")
                    self.details.add(longblurb)

                blurb += u"</DIV>"
                self.news.add(blurb)
                count += 1
        return count > 0