generic_news_rss_renderer.py

   1 import datetime
   2 from dateutil.parser import parse
   3 import file_writer
   4 import grab_bag
   5 import renderer
   6 import http.client
   7 import page_builder
   8 import profanity_filter
   9 import random
  10 import re
  11 import xml.etree.ElementTree as ET
  12
  13 class generic_news_rss_renderer(renderer.debuggable_abstaining_renderer):
  14     def __init__(self, name_to_timeout_dict, feed_site, feed_uris, page_title):
  15         super(generic_news_rss_renderer, self).__init__(name_to_timeout_dict,
  16                                                         False)
  17         self.debug = 1
  18         self.feed_site = feed_site
  19         self.feed_uris = feed_uris
  20         self.page_title = page_title
  21         self.news = grab_bag.grab_bag()
  22         self.details = grab_bag.grab_bag()
  23         self.filter = profanity_filter.profanity_filter()
  24
  25     def debug_prefix(self):
  26         pass
  27
  28     def get_headlines_page_prefix(self):
  29         pass
  30
  31     def get_details_page_prefix(self):
  32         pass
  33
  34     def get_headlines_page_priority(self):
  35         return "4"
  36
  37     def get_details_page_priority(self):
  38         return "6"
  39
  40     def should_use_https(self):
  41         pass
  42
  43     def should_profanity_filter(self):
  44         return False
  45
  46     def find_title(self, item):
  47         return item.findtext('title')
  48
  49     def munge_title(self, title):
  50         return title
  51
  52     def find_description(self, item):
  53         return item.findtext('description')
  54
  55     def munge_description(self, description):
  56         description = re.sub('<[^>]+>', '', description)
  57         return description
  58
  59     def find_link(self, item):
  60         return item.findtext('link')
  61
  62     def munge_link(self, link):
  63         return link
  64
  65     def find_image(self, item):
  66         return item.findtext('image')
  67
  68     def munge_image(self, image):
  69         return image
  70
  71     def find_pubdate(self, item):
  72         return item.findtext('pubDate')
  73
  74     def munge_pubdate(self, pubdate):
  75         return pubdate
  76
  77     def item_is_interesting_for_headlines(self, title, description, item):
  78         return True
  79
  80     def is_item_older_than_n_days(self, item, n):
  81         pubdate = self.find_pubdate(item)
  82         if pubdate is not None:
  83             pubdate = parse(pubdate)
  84             tzinfo = pubdate.tzinfo
  85             now = datetime.datetime.now(tzinfo)
  86             delta = (now - pubdate).total_seconds() / (60 * 60 * 24)
  87             if (delta > n):
  88                 return True
  89         return False
  90
  91     def item_is_interesting_for_article(self, title, description, item):
  92         return True
  93
  94     def periodic_render(self, key):
  95         if key == "Fetch News":
  96             return self.fetch_news()
  97         elif key == "Shuffle News":
  98             return self.shuffle_news()
  99         else:
 100             raise error('Unexpected operation')
 101
 102     def shuffle_news(self):
 103         headlines = page_builder.page_builder()
 104         headlines.set_layout(page_builder.page_builder.LAYOUT_FOUR_ITEMS)
 105         headlines.set_title("%s" % self.page_title)
 106         subset = self.news.subset(4)
 107         if subset is None:
 108             self.debug_print("Not enough messages to choose from.")
 109             return False
 110         for msg in subset:
 111             headlines.add_item(msg)
 112         headlines.set_custom_html("""
 113 <STYLE>
 114 a:link {
 115   color: black;
 116   text-decoration: none;
 117   font-weight: bold;
 118 }
 119 a:visited {
 120   color: black;
 121   text-decoration: none;
 122   font-weight: bold;
 123 }
 124 a:active {
 125   color: black;
 126   text-decoration: none;
 127   font-weight: bold;
 128 }
 129 </STYLE>""")
 130         f = file_writer.file_writer('%s_%s_25900.html' % (
 131             self.get_headlines_page_prefix(),
 132             self.get_headlines_page_priority()))
 133         headlines.render_html(f)
 134         f.close()
 135
 136         details = page_builder.page_builder()
 137         details.set_layout(page_builder.page_builder.LAYOUT_ONE_ITEM)
 138         details.set_custom_html("""
 139 <STYLE>
 140 a:link {
 141   color: black;
 142   text-decoration: none;
 143   font-weight: bold;
 144 }
 145 a:visited {
 146   color: black;
 147   text-decoration: none;
 148   font-weight: bold;
 149 }
 150 a:active {
 151   color: black;
 152   text-decoration: none;
 153   font-weight: bold;
 154 }
 155 </STYLE>""")
 156         details.set_title("%s" % self.page_title)
 157         subset = self.details.subset(1)
 158         if subset is None:
 159             self.debug_print("Not enough details to choose from.");
 160             return False
 161         for msg in subset:
 162             blurb = msg
 163             blurb += u'</TD>'
 164             details.add_item(blurb)
 165         g = file_writer.file_writer('%s_%s_86400.html' % (
 166             self.get_details_page_prefix(),
 167             self.get_details_page_priority()))
 168         details.render_html(g)
 169         g.close()
 170         return True
 171
 172     def fetch_news(self):
 173         count = 0
 174         self.news.clear()
 175         self.details.clear()
 176
 177         for uri in self.feed_uris:
 178             if self.should_use_https():
 179                 self.debug_print("Fetching: https://%s%s" % (self.feed_site, uri))
 180                 self.conn = http.client.HTTPSConnection(self.feed_site, timeout=20)
 181             else:
 182                 self.debug_print("Fetching: http://%s%s" % (self.feed_site, uri))
 183                 self.conn = http.client.HTTPConnection(self.feed_site, timeout=20)
 184             self.conn.request(
 185                 "GET",
 186                 uri,
 187                 None,
 188                 { "Accept": "*/*",
 189                   "Cache-control": "max-age=59",
 190                   "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"})
 191             try:
 192                 response = self.conn.getresponse()
 193             except:
 194                 print("Exception in generic RSS renderer HTTP connection")
 195                 return False
 196
 197             if response.status != 200:
 198                 print(("%s: RSS fetch_news error, response: %d" % (self.page_title,
 199                                                                   response.status)))
 200                 self.debug_print(response.read())
 201                 return False
 202
 203             rss = ET.fromstring(response.read())
 204             channel = rss[0]
 205             for item in channel.getchildren():
 206                 title = self.find_title(item)
 207                 if title is not None:
 208                     title = self.munge_title(title)
 209                 description = item.findtext('description')
 210                 if description is not None:
 211                     description = self.munge_description(description)
 212                 image = self.find_image(item)
 213                 if image is not None:
 214                     image = self.munge_image(image)
 215                 link = item.findtext('link')
 216                 if link is not None:
 217                     link = self.munge_link(link)
 218
 219                 if (title is None or
 220                     not self.item_is_interesting_for_headlines(title,
 221                                                                description,
 222                                                                item)):
 223                     self.debug_print('Item "%s" is not interesting' % title)
 224                     continue
 225
 226                 if (self.should_profanity_filter() and
 227                     (self.filter.contains_bad_words(title) or
 228                     self.filter.contains_bad_words(description))):
 229                     self.debug_print('Found bad words in item "%s"' % title)
 230                     continue
 231
 232                 blurb = u"""<DIV style="padding:8px;
 233                                  font-size:34pt;
 234                                  -webkit-column-break-inside:avoid;">"""
 235                 if image is not None:
 236                     blurb += u'<IMG SRC="%s" ALIGN=LEFT HEIGHT=115 ' % image
 237                     blurb += u'style="padding:8px;">'
 238
 239                 if link is None:
 240                     blurb += u'<P><B>%s</B>' % title
 241                 else:
 242                     blurb += u'<P><B><A HREF="%s">%s</A></B>' % (link, title)
 243
 244                 pubdate = self.find_pubdate(item)
 245                 if pubdate is not None:
 246                     pubdate = self.munge_pubdate(pubdate)
 247                     ts = parse(pubdate)
 248                     blurb += u"  <FONT COLOR=#cccccc>%s</FONT>" % (
 249                         ts.strftime("%b&nbsp;%d"))
 250
 251                 if (description is not None and
 252                     self.item_is_interesting_for_article(title,
 253                                                          description,
 254                                                          item)):
 255                     longblurb = blurb
 256
 257                     longblurb += u"<BR>"
 258                     longblurb += description
 259                     longblurb += u"</DIV>"
 260                     longblurb = longblurb.replace("font-size:34pt",
 261                                                   "font-size:44pt")
 262                     self.details.add(longblurb)
 263
 264                 blurb += u"</DIV>"
 265                 self.news.add(blurb)
 266                 count += 1
 267         return count > 0