970 lines
38 KiB
Python
970 lines
38 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: UTF-8 -*-
|
||
|
"""Planet aggregator library.
|
||
|
|
||
|
This package is a library for developing web sites or software that
|
||
|
aggregate RSS, CDF and Atom feeds taken from elsewhere into a single,
|
||
|
combined feed.
|
||
|
"""
|
||
|
|
||
|
__version__ = "2.0"
|
||
|
__authors__ = [ "Scott James Remnant <scott@netsplit.com>",
|
||
|
"Jeff Waugh <jdub@perkypants.org>" ]
|
||
|
__license__ = "Python"
|
||
|
|
||
|
import locale
|
||
|
|
||
|
# Modules available without separate import
|
||
|
import cache
|
||
|
import feedparser
|
||
|
import sanitize
|
||
|
import htmltmpl
|
||
|
import sgmllib
|
||
|
try:
|
||
|
import logging
|
||
|
except:
|
||
|
import compat_logging as logging
|
||
|
|
||
|
# Limit the effect of "from planet import *"
|
||
|
__all__ = ("cache", "feedparser", "htmltmpl", "logging",
|
||
|
"Planet", "Channel", "NewsItem")
|
||
|
|
||
|
|
||
|
import os
|
||
|
import md5
|
||
|
import time
|
||
|
import dbhash
|
||
|
import re
|
||
|
|
||
|
try:
    from xml.sax.saxutils import escape
except ImportError:
    def escape(data):
        """Escape the XML special characters &, > and < in data.

        Fallback used only when xml.sax.saxutils cannot be imported.
        The ampersand is replaced first so that the entities produced
        for > and < are not escaped a second time.
        """
        return (data.replace("&", "&amp;")
                    .replace(">", "&gt;")
                    .replace("<", "&lt;"))
|
||
|
|
||
|
# Version information (for generator headers)
|
||
|
VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__)
|
||
|
|
||
|
# Default User-Agent header to send when retrieving feeds
|
||
|
USER_AGENT = VERSION + " " + feedparser.USER_AGENT
|
||
|
|
||
|
# Default cache directory
|
||
|
CACHE_DIRECTORY = "cache"
|
||
|
|
||
|
# Default number of items to display from a new feed
|
||
|
NEW_FEED_ITEMS = 10
|
||
|
|
||
|
# Useful common date/time formats
|
||
|
TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
|
||
|
TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"
|
||
|
|
||
|
|
||
|
# Log instance to use here
log = logging.getLogger("planet")
if not hasattr(log, "warning"):
    # Some logger implementations only offer warn(); alias it so the rest
    # of the module can always call log.warning().
    # NOTE(review): presumably needed for the compat_logging fallback
    # imported above -- confirm.
    log.warning = log.warn
|
||
|
|
||
|
# Defaults for the template file config sections
|
||
|
ENCODING = "utf-8"
|
||
|
ITEMS_PER_PAGE = 60
|
||
|
DAYS_PER_PAGE = 0
|
||
|
OUTPUT_DIR = "output"
|
||
|
DATE_FORMAT = "%B %d, %Y %I:%M %p"
|
||
|
NEW_DATE_FORMAT = "%B %d, %Y"
|
||
|
ACTIVITY_THRESHOLD = 0
|
||
|
|
||
|
class stripHtml(sgmllib.SGMLParser):
|
||
|
"remove all tags from the data"
|
||
|
def __init__(self, data):
|
||
|
sgmllib.SGMLParser.__init__(self)
|
||
|
self.result=''
|
||
|
self.feed(data)
|
||
|
self.close()
|
||
|
def handle_data(self, data):
|
||
|
if data: self.result+=data
|
||
|
|
||
|
def template_info(item, date_format):
    """Produce a dictionary of template information.

    item is an object offering keys(), key_type(key), a DATE attribute,
    get_as_date(key) and item[key] access (the Channel and NewsItem
    classes of this module are passed in by the callers below).
    date_format is the strftime() format used for date values.

    For every key whose type is item.DATE three entries are produced:
    the date formatted with date_format under the key itself, plus
    key + "_iso" and key + "_822" formatted with TIMEFMT_ISO and
    TIMEFMT_822.  All other values are copied unchanged.  If the item
    has a 'title', a tag-free copy is added as 'title_plain'.
    """
    info = {}

    # (Re)initialise the locale from the environment when no locale is
    # set yet, or when the current locale name contains "tr".
    # NOTE(review): the original comment said this makes feed dates
    # English, but setlocale(LC_ALL, '') selects the user's default
    # locale -- confirm the intent.
    current = locale.getlocale()[0]
    if current is None or current.find("tr") != -1:
        try:
            locale.setlocale(locale.LC_ALL, '')
        except locale.Error:
            # Best effort only: keep whatever locale is active.
            pass

    for key in item.keys():
        if item.key_type(key) == item.DATE:
            date = item.get_as_date(key)
            info[key] = time.strftime(date_format, date)
            info[key + "_iso"] = time.strftime(TIMEFMT_ISO, date)
            info[key + "_822"] = time.strftime(TIMEFMT_822, date)
        else:
            info[key] = item[key]

    # info holds every key of the item, so this is the same check as
    # looking for 'title' in item.keys().
    if 'title' in info:
        info['title_plain'] = stripHtml(info['title']).result

    return info
|
||
|
|
||
|
|
||
|
class Planet:
    """A set of channels.

    This class represents a set of channels for which the items will
    be aggregated together into one combined feed.

    Properties:
        user_agent      User-Agent header to fetch feeds with.
        cache_directory Directory to store cached channels in.
        new_feed_items  Number of items to display from a new feed.
        filter          A regular expression that articles must match.
        exclude         A regular expression that articles must not match.
    """

    # Messages reported for well-known HTTP error statuses; any other
    # status of 400 or above gets a generic "http status N" message.
    _STATUS_MESSAGES = {
        403: "403: forbidden",
        404: "404: not found",
        408: "408: request timeout",
        410: "410: gone",
        500: "internal server error",
    }

    def __init__(self, config):
        """Create a planet that reads its settings from config.

        config must offer the has_option(), get(), sections(),
        has_section() and options() calls used throughout this module.
        """
        self.config = config

        self._channels = []

        self.user_agent = USER_AGENT
        self.cache_directory = CACHE_DIRECTORY
        self.new_feed_items = NEW_FEED_ITEMS
        self.filter = None
        self.exclude = None

    def tmpl_config_get(self, template, option, default=None, raw=0, vars=None):
        """Get a template value from the configuration, with a default.

        The option is looked up in the template's own section first and
        in the "Planet" section second; default is returned when neither
        section has it.  raw and vars are handed on to config.get().
        """
        for section in (template, "Planet"):
            if self.config.has_option(section, option):
                # The original always passed vars=None, silently
                # discarding the caller's substitutions.
                return self.config.get(section, option, raw=raw, vars=vars)
        return default

    def gather_channel_info(self, template_file="Planet"):
        """Collect the template information of every channel.

        Returns a (channels, channels_list) pair: a dictionary mapping
        each channel object to its info dictionary, and a list of the
        same info dictionaries in the order given by self.channels().
        A "message" entry is added for channels without recent activity
        (when activity_threshold is configured) and for channels whose
        last HTTP status was an error.
        """
        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)

        activity_threshold = int(self.tmpl_config_get(template_file,
                                                      "activity_threshold",
                                                      ACTIVITY_THRESHOLD))

        if activity_threshold:
            activity_horizon = \
                time.gmtime(time.time() - 86400 * activity_threshold)
        else:
            activity_horizon = 0

        channels = {}
        channels_list = []
        for channel in self.channels(hidden=1):
            chan_info = template_info(channel, date_format)
            channels[channel] = chan_info
            channels_list.append(chan_info)

            # identify inactive feeds
            if activity_horizon:
                latest = channel.items(sorted=1)
                if len(latest) == 0 or latest[0].date < activity_horizon:
                    chan_info["message"] = \
                        "no activity in %d days" % activity_threshold

            # report channel level errors (these take precedence over
            # the inactivity message)
            if not channel.url_status:
                continue
            status = int(channel.url_status)
            message = self._STATUS_MESSAGES.get(status)
            if message is None and status >= 400:
                message = "http status %s" % status
            if message is not None:
                chan_info["message"] = message

        return channels, channels_list

    def gather_items_info(self, channels, template_file="Planet", channel_list=None):
        """Collect the template information of the items to display.

        channels is the channel -> info dictionary produced by
        gather_channel_info(); channel_list optionally restricts the
        channels whose items are considered (it is passed to items()).

        Returns a list of info dictionaries.  Each one also carries the
        owning channel's values under "channel_<key>", a "new_date"
        entry on the first item of each day and a "new_channel" entry
        whenever the day or the channel changes.
        """
        items_list = []
        prev_date = []
        prev_channel = None

        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)
        items_per_page = int(self.tmpl_config_get(template_file,
                                                  "items_per_page",
                                                  ITEMS_PER_PAGE))
        days_per_page = int(self.tmpl_config_get(template_file,
                                                 "days_per_page",
                                                 DAYS_PER_PAGE))
        new_date_format = self.tmpl_config_get(template_file,
                                               "new_date_format",
                                               NEW_DATE_FORMAT, raw=1)

        for newsitem in self.items(max_items=items_per_page,
                                   max_days=days_per_page,
                                   channels=channel_list):
            item_info = template_info(newsitem, date_format)
            chan_info = channels[newsitem._channel]
            for k, v in chan_info.items():
                item_info["channel_" + k] = v

            # Check for the start of a new day (compare year/month/day)
            if prev_date[:3] != newsitem.date[:3]:
                prev_date = newsitem.date
                item_info["new_date"] = time.strftime(new_date_format,
                                                      newsitem.date)

            # Check for the start of a new channel
            if "new_date" in item_info \
               or prev_channel != newsitem._channel:
                prev_channel = newsitem._channel
                item_info["new_channel"] = newsitem._channel.url

            items_list.append(item_info)

        return items_list

    def run(self, planet_name, planet_link, template_files, offline = False):
        """Load the configuration, subscribe to and update every feed.

        planet_name and planet_link are prepended to the User-Agent.
        Every configuration section other than "Planet" and the ones
        named in template_files is treated as a feed URL.  When offline
        is true the feeds are only loaded from the cache.
        """
        log = logging.getLogger("planet.runner")

        # Create a planet
        log.info("Loading cached data")
        if self.config.has_option("Planet", "cache_directory"):
            self.cache_directory = self.config.get("Planet", "cache_directory")
        if self.config.has_option("Planet", "new_feed_items"):
            self.new_feed_items = int(self.config.get("Planet", "new_feed_items"))
        self.user_agent = "%s +%s %s" % (planet_name, planet_link,
                                         self.user_agent)
        if self.config.has_option("Planet", "filter"):
            self.filter = self.config.get("Planet", "filter")
        # The documented planet-wide 'exclude' expression was never read
        # from the configuration, so it could not be used.
        if self.config.has_option("Planet", "exclude"):
            self.exclude = self.config.get("Planet", "exclude")

        # The other configuration blocks are channels to subscribe to
        for feed_url in self.config.sections():
            if feed_url == "Planet" or feed_url in template_files:
                continue
            log.info(feed_url)

            # Create a channel, configure it and subscribe it
            channel = Channel(self, feed_url)
            self.subscribe(channel)

            # Update it; feeds already known to be gone (410) are left alone
            try:
                if not offline and not channel.url_status == '410':
                    channel.update()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Update of <%s> failed", feed_url)

    def generate_all_files(self, template_files, planet_name,
                           planet_link, planet_feed, owner_name, owner_email):
        """Render every template and write the results to disk.

        Each template is written to <output_dir>/<base>, where base is
        the template's file name without its last extension.  A failure
        while rendering or writing one template is logged and does not
        stop the remaining templates.
        """
        log = logging.getLogger("planet.runner")
        # Go-go-gadget-template
        for template_file in template_files:
            manager = htmltmpl.TemplateManager()
            log.info("Processing template %s", template_file)
            try:
                template = manager.prepare(template_file)
            except htmltmpl.TemplateError:
                template = manager.prepare(os.path.basename(template_file))

            # Read the configuration
            output_dir = self.tmpl_config_get(template_file,
                                              "output_dir", OUTPUT_DIR)
            date_format = self.tmpl_config_get(template_file,
                                               "date_format", DATE_FORMAT,
                                               raw=1)
            encoding = self.tmpl_config_get(template_file, "encoding", ENCODING)

            # We treat each template individually
            base = os.path.splitext(os.path.basename(template_file))[0]
            # NOTE(review): os.path.join() is used to build a URL here,
            # which presumably yields the wrong separator on Windows.
            url = os.path.join(planet_link, base)
            output_file = os.path.join(output_dir, base)

            # Gather channel and item information
            channels, channels_list = self.gather_channel_info(template_file)
            items_list = self.gather_items_info(channels, template_file)

            # Process the template
            tp = htmltmpl.TemplateProcessor(html_escape=0)
            tp.set("Items", items_list)
            tp.set("Channels", channels_list)

            # Generic information
            tp.set("generator", VERSION)
            tp.set("name", planet_name)
            tp.set("link", planet_link)
            tp.set("owner_name", owner_name)
            tp.set("owner_email", owner_email)
            tp.set("url", url)

            if planet_feed:
                tp.set("feed", planet_feed)
                tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom')

            # Update time
            # NOTE(review): local time is formatted with TIMEFMT_ISO and
            # TIMEFMT_822, which hard-code a +00:00 / +0000 offset --
            # confirm whether time.gmtime() was intended.
            date = time.localtime()
            tp.set("date", time.strftime(date_format, date))
            tp.set("date_iso", time.strftime(TIMEFMT_ISO, date))
            tp.set("date_822", time.strftime(TIMEFMT_822, date))

            try:
                log.info("Writing %s", output_file)
                # Render and encode before opening the file so that a
                # failure does not truncate a previously generated page.
                output = tp.process(template)
                if encoding.lower() in ("utf-8", "utf8"):
                    # UTF-8 output is the default because we use that
                    # internally
                    pass
                elif encoding.lower() in ("xml", "html", "sgml"):
                    # Magic for Python 2.3 users
                    output = output.decode("utf-8")
                    output = output.encode("ascii", "xmlcharrefreplace")
                else:
                    # Must be a "known" encoding
                    output = output.decode("utf-8")
                    output = output.encode(encoding, "replace")

                output_fd = open(output_file, "w")
                try:
                    output_fd.write(output)
                finally:
                    # Always release the handle, even if the write fails.
                    output_fd.close()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Write of %s failed", output_file)

    def channels(self, hidden=0, sorted=1):
        """Return the list of channels.

        Channels carrying a "hidden" key are left out unless hidden is
        true.  If sorted is true the list is ordered by channel name.
        """
        channels = []
        for channel in self._channels:
            if hidden or not channel.has_key("hidden"):
                channels.append((channel.name, channel))

        if sorted:
            channels.sort()

        return [ c[-1] for c in channels ]

    def find_by_basename(self, basename):
        """Return the channel whose cache basename matches, else None."""
        for channel in self._channels:
            if basename == channel.cache_basename():
                return channel
        return None

    def subscribe(self, channel):
        """Subscribe the planet to the channel."""
        self._channels.append(channel)

    def unsubscribe(self, channel):
        """Unsubscribe the planet from the channel."""
        self._channels.remove(channel)

    def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None):
        """Return an optionally filtered list of items in the channel.

        The filters are applied in the following order:

        If hidden is true then items in hidden channels and hidden items
        will be returned.

        If sorted is true then the item list will be sorted with the newest
        first.

        If max_items is non-zero then this number of items, at most, will
        be returned.

        If max_days is non-zero then any items older than the newest by
        this number of days won't be returned.  Requires sorted=1 to work.

        If channels is given only the items of those channels are
        considered, otherwise the items of self.channels() are used.
        Items are also matched against the planet's and the channel's
        'filter' / 'exclude' expressions (title and content, ignoring
        case), and an item id is only returned once.

        The sharp-eyed will note that this looks a little strange code-wise,
        it turns out that Python gets *really* slow if we try to sort the
        actual items themselves.  Also we use mktime here, but it's ok
        because we discard the numbers and just need them to be relatively
        consistent between each other.
        """
        planet_filter_re = None
        if self.filter:
            planet_filter_re = re.compile(self.filter, re.I)
        planet_exclude_re = None
        if self.exclude:
            planet_exclude_re = re.compile(self.exclude, re.I)

        entries = []
        seen_guids = {}
        if not channels:
            channels = self.channels(hidden=hidden, sorted=0)
        for channel in channels:
            # Compile the channel's expressions once per channel instead
            # of once per item.
            channel_filter_re = None
            if channel.filter:
                channel_filter_re = re.compile(channel.filter, re.I)
            channel_exclude_re = None
            if channel.exclude:
                channel_exclude_re = re.compile(channel.exclude, re.I)
            filtering = (planet_filter_re or planet_exclude_re
                         or channel_filter_re or channel_exclude_re)

            for item in channel._items.values():
                if not hidden and item.has_key("hidden"):
                    continue

                if filtering:
                    title = ""
                    if item.has_key("title"):
                        title = item.title
                    content = item.get_content("content")

                    if planet_filter_re \
                       and not (planet_filter_re.search(title)
                                or planet_filter_re.search(content)):
                        continue

                    if planet_exclude_re \
                       and (planet_exclude_re.search(title)
                            or planet_exclude_re.search(content)):
                        continue

                    if channel_filter_re \
                       and not (channel_filter_re.search(title)
                                or channel_filter_re.search(content)):
                        continue

                    if channel_exclude_re \
                       and (channel_exclude_re.search(title)
                            or channel_exclude_re.search(content)):
                        continue

                if item.id not in seen_guids:
                    seen_guids[item.id] = 1
                    entries.append((time.mktime(item.date), item.order, item))

        # Sort the list, newest first
        if sorted:
            entries.sort()
            entries.reverse()

        # Apply max_items filter
        if len(entries) and max_items:
            entries = entries[:max_items]

        # Apply max_days filter
        if len(entries) and max_days:
            # 86400 seconds per day (the original used 84600, which cut
            # every day half an hour short).
            cutoff = entries[0][0] - max_days * 86400
            kept = 0
            for entry in entries:
                if entry[0] <= cutoff:
                    break
                kept += 1
            entries = entries[:kept]

        return [ e[-1] for e in entries ]
|
||
|
|
||
|
class Channel(cache.CachedInfo):
|
||
|
"""A list of news items.
|
||
|
|
||
|
This class represents a list of news items taken from the feed of
|
||
|
a website or other source.
|
||
|
|
||
|
Properties:
|
||
|
url URL of the feed.
|
||
|
url_etag E-Tag of the feed URL.
|
||
|
url_modified Last modified time of the feed URL.
|
||
|
url_status Last HTTP status of the feed URL.
|
||
|
hidden Channel should be hidden (True if exists).
|
||
|
name Name of the feed owner, or feed title.
|
||
|
next_order Next order number to be assigned to NewsItem
|
||
|
|
||
|
updated Correct UTC-Normalised update time of the feed.
|
||
|
last_updated Correct UTC-Normalised time the feed was last updated.
|
||
|
|
||
|
id An identifier the feed claims is unique (*).
|
||
|
title One-line title (*).
|
||
|
link Link to the original format feed (*).
|
||
|
tagline Short description of the feed (*).
|
||
|
info Longer description of the feed (*).
|
||
|
|
||
|
modified Date the feed claims to have been modified (*).
|
||
|
|
||
|
author Name of the author (*).
|
||
|
publisher Name of the publisher (*).
|
||
|
generator Name of the feed generator (*).
|
||
|
category Category name (*).
|
||
|
copyright Copyright information for humans to read (*).
|
||
|
license Link to the licence for the content (*).
|
||
|
docs Link to the specification of the feed format (*).
|
||
|
language Primary language (*).
|
||
|
errorreportsto E-Mail address to send error reports to (*).
|
||
|
|
||
|
image_url URL of an associated image (*).
|
||
|
image_link Link to go with the associated image (*).
|
||
|
image_title Alternative text of the associated image (*).
|
||
|
image_width Width of the associated image (*).
|
||
|
image_height Height of the associated image (*).
|
||
|
|
||
|
filter A regular expression that articles must match.
|
||
|
exclude A regular expression that articles must not match.
|
||
|
|
||
|
Properties marked (*) will only be present if the original feed
|
||
|
contained them. Note that the optional 'modified' date field is simply
|
||
|
a claim made by the item and parsed from the information given, 'updated'
|
||
|
(and 'last_updated') are far more reliable sources of information.
|
||
|
|
||
|
Some feeds may define additional properties to those above.
|
||
|
"""
|
||
|
IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories",
|
||
|
"url", "href", "url_etag", "url_modified", "tags", "itunes_explicit")
|
||
|
|
||
|
def __init__(self, planet, url):
|
||
|
if not os.path.isdir(planet.cache_directory):
|
||
|
os.makedirs(planet.cache_directory)
|
||
|
cache_filename = cache.filename(planet.cache_directory, url)
|
||
|
cache_file = dbhash.open(cache_filename, "c", 0666)
|
||
|
|
||
|
cache.CachedInfo.__init__(self, cache_file, url, root=1)
|
||
|
|
||
|
self._items = {}
|
||
|
self._planet = planet
|
||
|
self._expired = []
|
||
|
self.url = url
|
||
|
# retain the original URL for error reporting
|
||
|
self.configured_url = url
|
||
|
self.url_etag = None
|
||
|
self.url_status = None
|
||
|
self.url_modified = None
|
||
|
self.name = None
|
||
|
self.updated = None
|
||
|
self.last_updated = None
|
||
|
self.filter = None
|
||
|
self.exclude = None
|
||
|
self.next_order = "0"
|
||
|
self.cache_read()
|
||
|
self.cache_read_entries()
|
||
|
|
||
|
if planet.config.has_section(url):
|
||
|
for option in planet.config.options(url):
|
||
|
value = planet.config.get(url, option)
|
||
|
self.set_as_string(option, value, cached=0)
|
||
|
|
||
|
def has_item(self, id_):
|
||
|
"""Check whether the item exists in the channel."""
|
||
|
return self._items.has_key(id_)
|
||
|
|
||
|
def get_item(self, id_):
|
||
|
"""Return the item from the channel."""
|
||
|
return self._items[id_]
|
||
|
|
||
|
# Special methods
|
||
|
__contains__ = has_item
|
||
|
|
||
|
def items(self, hidden=0, sorted=0):
|
||
|
"""Return the item list."""
|
||
|
items = []
|
||
|
for item in self._items.values():
|
||
|
if hidden or not item.has_key("hidden"):
|
||
|
items.append((time.mktime(item.date), item.order, item))
|
||
|
|
||
|
if sorted:
|
||
|
items.sort()
|
||
|
items.reverse()
|
||
|
|
||
|
return [ i[-1] for i in items ]
|
||
|
|
||
|
def __iter__(self):
|
||
|
"""Iterate the sorted item list."""
|
||
|
return iter(self.items(sorted=1))
|
||
|
|
||
|
def cache_read_entries(self):
|
||
|
"""Read entry information from the cache."""
|
||
|
keys = self._cache.keys()
|
||
|
for key in keys:
|
||
|
if key.find(" ") != -1: continue
|
||
|
if self.has_key(key): continue
|
||
|
|
||
|
item = NewsItem(self, key)
|
||
|
self._items[key] = item
|
||
|
|
||
|
def cache_basename(self):
|
||
|
return cache.filename('',self._id)
|
||
|
|
||
|
def cache_write(self, sync=1):
|
||
|
|
||
|
"""Write channel and item information to the cache."""
|
||
|
for item in self._items.values():
|
||
|
item.cache_write(sync=0)
|
||
|
for item in self._expired:
|
||
|
item.cache_clear(sync=0)
|
||
|
cache.CachedInfo.cache_write(self, sync)
|
||
|
|
||
|
self._expired = []
|
||
|
|
||
|
def feed_information(self):
|
||
|
"""
|
||
|
Returns a description string for the feed embedded in this channel.
|
||
|
|
||
|
This will usually simply be the feed url embedded in <>, but in the
|
||
|
case where the current self.url has changed from the original
|
||
|
self.configured_url the string will contain both pieces of information.
|
||
|
This is so that the URL in question is easier to find in logging
|
||
|
output: getting an error about a URL that doesn't appear in your config
|
||
|
file is annoying.
|
||
|
"""
|
||
|
if self.url == self.configured_url:
|
||
|
return "<%s>" % self.url
|
||
|
else:
|
||
|
return "<%s> (formerly <%s>)" % (self.url, self.configured_url)
|
||
|
|
||
|
def update(self):
|
||
|
"""Download the feed to refresh the information.
|
||
|
|
||
|
This does the actual work of pulling down the feed and if it changes
|
||
|
updates the cached information about the feed and entries within it.
|
||
|
"""
|
||
|
info = feedparser.parse(self.url,
|
||
|
etag=self.url_etag, modified=self.url_modified,
|
||
|
agent=self._planet.user_agent)
|
||
|
if info.has_key("status"):
|
||
|
self.url_status = str(info.status)
|
||
|
elif info.has_key("entries") and len(info.entries)>0:
|
||
|
self.url_status = str(200)
|
||
|
elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout':
|
||
|
self.url_status = str(408)
|
||
|
else:
|
||
|
self.url_status = str(500)
|
||
|
|
||
|
if self.url_status == '301' and \
|
||
|
(info.has_key("entries") and len(info.entries)>0):
|
||
|
log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
|
||
|
try:
|
||
|
os.link(cache.filename(self._planet.cache_directory, self.url),
|
||
|
cache.filename(self._planet.cache_directory, info.url))
|
||
|
except:
|
||
|
pass
|
||
|
self.url = info.url
|
||
|
elif self.url_status == '304':
|
||
|
log.info("Feed %s unchanged", self.feed_information())
|
||
|
return
|
||
|
elif self.url_status == '410':
|
||
|
log.info("Feed %s gone", self.feed_information())
|
||
|
self.cache_write()
|
||
|
return
|
||
|
elif self.url_status == '408':
|
||
|
log.warning("Feed %s timed out", self.feed_information())
|
||
|
return
|
||
|
elif int(self.url_status) >= 400:
|
||
|
log.error("Error %s while updating feed %s",
|
||
|
self.url_status, self.feed_information())
|
||
|
return
|
||
|
else:
|
||
|
log.info("Updating feed %s", self.feed_information())
|
||
|
|
||
|
self.url_etag = info.has_key("etag") and info.etag or None
|
||
|
self.url_modified = info.has_key("modified") and info.modified or None
|
||
|
if self.url_etag is not None:
|
||
|
log.debug("E-Tag: %s", self.url_etag)
|
||
|
if self.url_modified is not None:
|
||
|
log.debug("Last Modified: %s",
|
||
|
time.strftime(TIMEFMT_ISO, self.url_modified))
|
||
|
|
||
|
self.update_info(info.feed)
|
||
|
self.update_entries(info.entries)
|
||
|
self.cache_write()
|
||
|
|
||
|
def update_info(self, feed):
|
||
|
"""Update information from the feed.
|
||
|
|
||
|
This reads the feed information supplied by feedparser and updates
|
||
|
the cached information about the feed. These are the various
|
||
|
potentially interesting properties that you might care about.
|
||
|
"""
|
||
|
for key in feed.keys():
|
||
|
if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
|
||
|
# Ignored fields
|
||
|
pass
|
||
|
elif feed.has_key(key + "_parsed"):
|
||
|
# Ignore unparsed date fields
|
||
|
pass
|
||
|
elif key.endswith("_detail"):
|
||
|
# retain name and email sub-fields
|
||
|
if feed[key].has_key('name') and feed[key].name:
|
||
|
self.set_as_string(key.replace("_detail","_name"), \
|
||
|
feed[key].name)
|
||
|
if feed[key].has_key('email') and feed[key].email:
|
||
|
self.set_as_string(key.replace("_detail","_email"), \
|
||
|
feed[key].email)
|
||
|
elif key == "items":
|
||
|
# Ignore items field
|
||
|
pass
|
||
|
elif key.endswith("_parsed"):
|
||
|
# Date fields
|
||
|
if feed[key] is not None:
|
||
|
self.set_as_date(key[:-len("_parsed")], feed[key])
|
||
|
elif key == "image":
|
||
|
# Image field: save all the information
|
||
|
if feed[key].has_key("url"):
|
||
|
self.set_as_string(key + "_url", feed[key].url)
|
||
|
if feed[key].has_key("link"):
|
||
|
self.set_as_string(key + "_link", feed[key].link)
|
||
|
if feed[key].has_key("title"):
|
||
|
self.set_as_string(key + "_title", feed[key].title)
|
||
|
if feed[key].has_key("width"):
|
||
|
self.set_as_string(key + "_width", str(feed[key].width))
|
||
|
if feed[key].has_key("height"):
|
||
|
self.set_as_string(key + "_height", str(feed[key].height))
|
||
|
elif isinstance(feed[key], (str, unicode)):
|
||
|
# String fields
|
||
|
try:
|
||
|
detail = key + '_detail'
|
||
|
if feed.has_key(detail) and feed[detail].has_key('type'):
|
||
|
if feed[detail].type == 'text/html':
|
||
|
feed[key] = sanitize.HTML(feed[key])
|
||
|
elif feed[detail].type == 'text/plain':
|
||
|
feed[key] = escape(feed[key])
|
||
|
self.set_as_string(key, feed[key])
|
||
|
except KeyboardInterrupt:
|
||
|
raise
|
||
|
except:
|
||
|
log.exception("Ignored '%s' of <%s>, unknown format",
|
||
|
key, self.url)
|
||
|
|
||
|
def update_entries(self, entries):
|
||
|
"""Update entries from the feed.
|
||
|
|
||
|
This reads the entries supplied by feedparser and updates the
|
||
|
cached information about them. It's at this point we update
|
||
|
the 'updated' timestamp and keep the old one in 'last_updated',
|
||
|
these provide boundaries for acceptable entry times.
|
||
|
|
||
|
If this is the first time a feed has been updated then most of the
|
||
|
items will be marked as hidden, according to Planet.new_feed_items.
|
||
|
|
||
|
If the feed does not contain items which, according to the sort order,
|
||
|
should be there; those items are assumed to have been expired from
|
||
|
the feed or replaced and are removed from the cache.
|
||
|
"""
|
||
|
if not len(entries):
|
||
|
return
|
||
|
|
||
|
self.last_updated = self.updated
|
||
|
self.updated = time.gmtime()
|
||
|
|
||
|
new_items = []
|
||
|
feed_items = []
|
||
|
for entry in entries:
|
||
|
# Try really hard to find some kind of unique identifier
|
||
|
if entry.has_key("id"):
|
||
|
entry_id = cache.utf8(entry.id)
|
||
|
elif entry.has_key("link"):
|
||
|
entry_id = cache.utf8(entry.link)
|
||
|
elif entry.has_key("title"):
|
||
|
entry_id = (self.url + "/"
|
||
|
+ md5.new(cache.utf8(entry.title)).hexdigest())
|
||
|
elif entry.has_key("summary"):
|
||
|
entry_id = (self.url + "/"
|
||
|
+ md5.new(cache.utf8(entry.summary)).hexdigest())
|
||
|
else:
|
||
|
log.error("Unable to find or generate id, entry ignored")
|
||
|
continue
|
||
|
|
||
|
# Create the item if necessary and update
|
||
|
if self.has_item(entry_id):
|
||
|
item = self._items[entry_id]
|
||
|
else:
|
||
|
item = NewsItem(self, entry_id)
|
||
|
self._items[entry_id] = item
|
||
|
new_items.append(item)
|
||
|
item.update(entry)
|
||
|
feed_items.append(entry_id)
|
||
|
|
||
|
# Hide excess items the first time through
|
||
|
if self.last_updated is None and self._planet.new_feed_items \
|
||
|
and len(feed_items) > self._planet.new_feed_items:
|
||
|
item.hidden = "yes"
|
||
|
log.debug("Marked <%s> as hidden (new feed)", entry_id)
|
||
|
|
||
|
# Assign order numbers in reverse
|
||
|
new_items.reverse()
|
||
|
for item in new_items:
|
||
|
item.order = self.next_order = str(int(self.next_order) + 1)
|
||
|
|
||
|
# Check for expired or replaced items
|
||
|
feed_count = len(feed_items)
|
||
|
log.debug("Items in Feed: %d", feed_count)
|
||
|
for item in self.items(sorted=1):
|
||
|
if feed_count < 1:
|
||
|
break
|
||
|
elif item.id in feed_items:
|
||
|
feed_count -= 1
|
||
|
elif item._channel.url_status != '226':
|
||
|
del(self._items[item.id])
|
||
|
self._expired.append(item)
|
||
|
log.debug("Removed expired or replaced item <%s>", item.id)
|
||
|
|
||
|
def get_name(self, key):
|
||
|
"""Return the key containing the name."""
|
||
|
for key in ("name", "title"):
|
||
|
if self.has_key(key) and self.key_type(key) != self.NULL:
|
||
|
return self.get_as_string(key)
|
||
|
|
||
|
return ""
|
||
|
|
||
|
class NewsItem(cache.CachedInfo):
    """An item of news.

    This class represents a single item of news on a channel.  They're
    created by members of the Channel class and accessible through it.

    Properties:
        id              Channel-unique identifier for this item.
        id_hash         Relatively short, printable cryptographic hash of id
        date            Corrected UTC-Normalised update time, for sorting.
        order           Order in which items on the same date can be sorted.
        hidden          Item should be hidden (True if exists).

        title           One-line title (*).
        link            Link to the original format text (*).
        summary         Short first-page summary (*).
        content         Full HTML content.

        modified        Date the item claims to have been modified (*).
        issued          Date the item claims to have been issued (*).
        created         Date the item claims to have been created (*).
        expired         Date the item claims to expire (*).

        author          Name of the author (*).
        publisher       Name of the publisher (*).
        category        Category name (*).
        comments        Link to a page to enter comments (*).
        license         Link to the licence for the content (*).
        source_name     Name of the original source of this item (*).
        source_link     Link to the original source of this item (*).

    Properties marked (*) will only be present if the original feed
    contained them.  Note that the various optional date fields are
    simply claims made by the item and parsed from the information
    given, 'date' is a far more reliable source of information.

    Some feeds may define additional properties to those above.
    """
    IGNORE_KEYS = ("categories", "contributors", "enclosures", "links",
                   "guidislink", "date", "tags")

    def __init__(self, channel, id_):
        """Create the item id_ of channel and load it from the cache.

        The item shares the cache object of its channel.
        """
        cache.CachedInfo.__init__(self, channel._cache, id_)

        self._channel = channel
        self.id = id_
        self.id_hash = md5.new(id_).hexdigest()
        self.date = None
        self.order = None
        self.content = None
        self.cache_read()

    def update(self, entry):
        """Update the item from the feedparser entry given.

        HTML and plain text values are sanitised / escaped in place,
        i.e. the entry passed in is modified.  The 'date' value is
        (re)generated at the end.
        """
        channel = self._channel
        for key in entry.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS \
               or entry.has_key(key + "_parsed"):
                # Ignored fields and unparsed date fields
                continue

            if key.endswith("_detail"):
                # retain name, email, and language sub-fields
                detail = entry[key]
                if detail.has_key('name') and detail.name:
                    self.set_as_string(key.replace("_detail", "_name"),
                                       detail.name)
                if detail.has_key('email') and detail.email:
                    self.set_as_string(key.replace("_detail", "_email"),
                                       detail.email)
                # the language is only kept where it differs from the
                # channel's
                if detail.has_key('language') and detail.language and \
                   (not channel.has_key('language') or
                    detail.language != channel.language):
                    self.set_as_string(key.replace("_detail", "_language"),
                                       detail.language)
            elif key.endswith("_parsed"):
                # Date fields
                if entry[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], entry[key])
            elif key == "source":
                # Source field: save both url and value
                source = entry[key]
                if source.has_key("value"):
                    self.set_as_string(key + "_name", source.value)
                if source.has_key("url"):
                    self.set_as_string(key + "_link", source.url)
            elif key == "content":
                # Content field: concatenate the values
                pieces = []
                for part in entry[key]:
                    if part.type == 'text/html':
                        part.value = sanitize.HTML(part.value)
                    elif part.type == 'text/plain':
                        part.value = escape(part.value)
                    if part.has_key('language') and part.language and \
                       (not channel.has_key('language') or
                        part.language != channel.language):
                        self.set_as_string(key + "_language", part.language)
                    pieces.append(cache.utf8(part.value))
                self.set_as_string(key, "".join(pieces))
            elif isinstance(entry[key], (str, unicode)):
                # String fields
                try:
                    detail = key + '_detail'
                    if entry.has_key(detail) \
                       and entry[detail].has_key('type'):
                        content_type = entry[detail].type
                        if content_type == 'text/html':
                            entry[key] = sanitize.HTML(entry[key])
                        elif content_type == 'text/plain':
                            entry[key] = escape(entry[key])
                    self.set_as_string(key, entry[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format",
                                  key, self.id)

        # Generate the date field if we need to
        self.get_date("date")

    def get_date(self, key):
        """Get (or update) the date key.

        The first of the item's 'updated', 'modified', 'published',
        'issued' and 'created' values is taken as the date the entry
        claims; a claim later than the channel's 'updated' time is
        replaced by that time, so entries don't creep into the next
        update.  (No lower bound is applied.)

        Without any such claim the value already stored under key is
        returned as is; failing that too, the channel's 'updated' time
        is used.  Except in the "already stored" case the result is
        saved under key before it is returned.
        """
        date = None
        for other_key in ("updated", "modified", "published", "issued",
                          "created"):
            if self.has_key(other_key):
                date = self.get_as_date(other_key)
                break

        if date is None:
            if self.has_key(key) and self.key_type(key) != self.NULL:
                return self.get_as_date(key)
            date = self._channel.updated
        elif date > self._channel.updated:
            date = self._channel.updated

        self.set_as_date(key, date)
        return date

    def get_content(self, key):
        """Return the key containing the content.

        The key argument is not consulted: the first of "content",
        "tagline" and "summary" that is present and not NULL is
        returned, or the empty string when there is none.
        """
        for candidate in ("content", "tagline", "summary"):
            if not self.has_key(candidate):
                continue
            if self.key_type(candidate) != self.NULL:
                return self.get_as_string(candidate)

        return ""
|