#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Planet aggregator library.

This package is a library for developing web sites or software that
aggregate RSS, CDF and Atom feeds taken from elsewhere into a single,
combined feed.
"""

__version__ = "1.0"
__authors__ = [ "Scott James Remnant <scott@netsplit.com>",
                "Jeff Waugh <jdub@perkypants.org>" ]
__license__ = "Python"


# Modules available without separate import
import cache
import feedparser
import sanitize
import htmltmpl
import sgmllib
try:
    import logging
except ImportError:
    import compat_logging as logging

# Limit the effect of "from planet import *"
__all__ = ("cache", "feedparser", "htmltmpl", "logging",
           "Planet", "Channel", "NewsItem")


import locale
import os
import md5
import time
import dbhash
import re
import xml.sax.saxutils


# Version information (for generator headers)
VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__)

# Default User-Agent header to send when retrieving feeds
USER_AGENT = VERSION + " " + feedparser.USER_AGENT

# Default cache directory
CACHE_DIRECTORY = "cache"

# Default number of items to display from a new feed
NEW_FEED_ITEMS = 10

# Useful common date/time formats
TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"


# Log instance to use here
log = logging.getLogger("planet")
try:
    log.warning
except AttributeError:
    # Older logging modules only provide warn(), not warning()
    log.warning = log.warn

# Defaults for the template file config sections
ENCODING = "utf-8"
ITEMS_PER_PAGE = 60
DAYS_PER_PAGE = 0
OUTPUT_DIR = "output"
DATE_FORMAT = "%B %d, %Y %I:%M %p"
NEW_DATE_FORMAT = "%B %d, %Y"
ACTIVITY_THRESHOLD = 0


class stripHtml(sgmllib.SGMLParser):
    """Remove all tags from the data."""
    def __init__(self, data):
        sgmllib.SGMLParser.__init__(self)
        self.result = ''
        self.feed(data)
        self.close()

    def handle_data(self, data):
        if data: self.result += data
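
# A minimal usage sketch for stripHtml (illustrative only): the markup is
# fed to the parser in its constructor, text nodes accumulate in .result,
# so the plain text is available immediately.
#
#     >>> stripHtml("<p>Hello <b>world</b></p>").result
#     'Hello world'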

def template_info(item, date_format):
    """Produce a dictionary of template information."""
    info = {}
    for key in item.keys():
        if item.key_type(key) == item.DATE:
            date = item.get_as_date(key)
            info[key] = time.strftime(date_format, date)
            info[key + "_iso"] = time.strftime(TIMEFMT_ISO, date)
            info[key + "_822"] = time.strftime(TIMEFMT_822, date)
        else:
            info[key] = item[key]
    if 'title' in item.keys():
        info['title_plain'] = stripHtml(info['title']).result

    return info
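
# For an item with a 'date' key and an HTML 'title', the resulting
# dictionary looks roughly like this (illustrative values, using the
# default DATE_FORMAT):
#
#     {'date':        'May 01, 2006 10:30 AM',
#      'date_iso':    '2006-05-01T10:30:00+00:00',
#      'date_822':    'Mon, 01 May 2006 10:30:00 +0000',
#      'title':       'An <em>item</em>',
#      'title_plain': 'An item'}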


class Planet:
    """A set of channels.

    This class represents a set of channels for which the items will
    be aggregated together into one combined feed.

    Properties:
        user_agent      User-Agent header to fetch feeds with.
        cache_directory Directory to store cached channels in.
        new_feed_items  Number of items to display from a new feed.
        filter          A regular expression that articles must match.
        exclude         A regular expression that articles must not match.
    """
    def __init__(self, config):
        self.config = config

        self._channels = []

        self.user_agent = USER_AGENT
        self.cache_directory = CACHE_DIRECTORY
        self.new_feed_items = NEW_FEED_ITEMS
        self.filter = None
        self.exclude = None

    def tmpl_config_get(self, template, option, default=None, raw=0, vars=None):
        """Get a template value from the configuration, with a default.

        The template's own section takes precedence over the [Planet]
        section; the default is returned if neither defines the option.
        """
        if self.config.has_option(template, option):
            return self.config.get(template, option, raw=raw, vars=vars)
        elif self.config.has_option("Planet", option):
            return self.config.get("Planet", option, raw=raw, vars=vars)
        else:
            return default
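
    # Lookup order sketch (illustrative; the section name is hypothetical):
    # given a configuration containing
    #
    #     [index.html.tmpl]
    #     items_per_page = 30
    #
    #     [Planet]
    #     items_per_page = 60
    #
    # tmpl_config_get("index.html.tmpl", "items_per_page", ITEMS_PER_PAGE)
    # returns "30"; without the template section it falls back to the
    # [Planet] value, and to the supplied default when both are absent.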

    def gather_channel_info(self, template_file="Planet"):
        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)

        activity_threshold = int(self.tmpl_config_get(template_file,
                                                      "activity_threshold",
                                                      ACTIVITY_THRESHOLD))

        if activity_threshold:
            activity_horizon = \
                time.gmtime(time.time() - 86400 * activity_threshold)
        else:
            activity_horizon = 0

        channels = {}
        channels_list = []
        for channel in self.channels(hidden=1):
            channels[channel] = template_info(channel, date_format)
            channels_list.append(channels[channel])

            # Identify inactive feeds
            if activity_horizon:
                latest = channel.items(sorted=1)
                if len(latest) == 0 or latest[0].date < activity_horizon:
                    channels[channel]["message"] = \
                        "no activity in %d days" % activity_threshold

            # Report channel-level errors
            if not channel.url_status:
                continue
            status = int(channel.url_status)
            if status == 403:
                channels[channel]["message"] = "403: forbidden"
            elif status == 404:
                channels[channel]["message"] = "404: not found"
            elif status == 408:
                channels[channel]["message"] = "408: request timeout"
            elif status == 410:
                channels[channel]["message"] = "410: gone"
            elif status == 500:
                channels[channel]["message"] = "500: internal server error"
            elif status >= 400:
                channels[channel]["message"] = "http status %s" % status

        return channels, channels_list

    def gather_items_info(self, channels, template_file="Planet", channel_list=None):
        items_list = []
        prev_date = []
        prev_channel = None

        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)
        items_per_page = int(self.tmpl_config_get(template_file,
                                                  "items_per_page", ITEMS_PER_PAGE))
        days_per_page = int(self.tmpl_config_get(template_file,
                                                 "days_per_page", DAYS_PER_PAGE))
        new_date_format = self.tmpl_config_get(template_file,
                                               "new_date_format", NEW_DATE_FORMAT, raw=1)

        for newsitem in self.items(max_items=items_per_page,
                                   max_days=days_per_page,
                                   channels=channel_list):
            # Shift the UTC-normalised date by a fixed two-hour (UTC+2)
            # offset before formatting
            newsitem.date = time.localtime(time.mktime(newsitem.date) + 7200)
            item_info = template_info(newsitem, date_format)
            chan_info = channels[newsitem._channel]
            for k, v in chan_info.items():
                item_info["channel_" + k] = v

            # Check for the start of a new day
            if prev_date[:3] != newsitem.date[:3]:
                prev_date = newsitem.date
                item_info["new_date"] = time.strftime(new_date_format,
                                                      newsitem.date)

            # Check for the start of a new channel
            if item_info.has_key("new_date") \
                   or prev_channel != newsitem._channel:
                prev_channel = newsitem._channel
                item_info["new_channel"] = newsitem._channel.url

            items_list.append(item_info)

        return items_list

    def run(self, planet_name, planet_link, template_files, offline=False):
        log = logging.getLogger("planet.runner")

        # Create a planet
        log.info("Loading cached data")
        if self.config.has_option("Planet", "cache_directory"):
            self.cache_directory = self.config.get("Planet", "cache_directory")
        if self.config.has_option("Planet", "new_feed_items"):
            self.new_feed_items = int(self.config.get("Planet", "new_feed_items"))
        self.user_agent = "%s +%s %s" % (planet_name, planet_link,
                                         self.user_agent)
        if self.config.has_option("Planet", "filter"):
            self.filter = self.config.get("Planet", "filter")

        # The other configuration sections are channels to subscribe to
        for feed_url in self.config.sections():
            if feed_url == "Planet" or feed_url in template_files:
                continue

            # Create a channel, configure it and subscribe it
            channel = Channel(self, feed_url)
            self.subscribe(channel)

            # Update it, skipping feeds that are known to be gone
            try:
                if not offline and channel.url_status != '410':
                    channel.update()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Update of <%s> failed", feed_url)
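
    # A minimal configuration sketch (illustrative; feed sections are named
    # by their URL, which is how run() above distinguishes them from the
    # [Planet] section and template sections):
    #
    #     [Planet]
    #     cache_directory = cache
    #     new_feed_items = 10
    #
    #     [http://example.org/feed.xml]
    #     name = Example Feed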

    def generate_all_files(self, template_files, planet_name,
                           planet_link, planet_feed, owner_name, owner_email):

        log = logging.getLogger("planet.runner")
        # Go-go-gadget-template
        for template_file in template_files:
            manager = htmltmpl.TemplateManager()
            log.info("Processing template %s", template_file)
            template = manager.prepare(template_file)
            # Read the configuration
            output_dir = self.tmpl_config_get(template_file,
                                              "output_dir", OUTPUT_DIR)
            date_format = self.tmpl_config_get(template_file,
                                               "date_format", DATE_FORMAT, raw=1)
            encoding = self.tmpl_config_get(template_file, "encoding", ENCODING)

            # We treat each template individually
            base = os.path.splitext(os.path.basename(template_file))[0]
            url = os.path.join(planet_link, base)
            output_file = os.path.join(output_dir, base)

            # Gather channel and item information
            channels, channels_list = self.gather_channel_info(template_file)
            items_list = self.gather_items_info(channels, template_file)

            # Process the template
            tp = htmltmpl.TemplateProcessor(html_escape=0)
            tp.set("Items", items_list)
            tp.set("Channels", channels_list)

            # Generic information
            tp.set("generator", VERSION)
            tp.set("name", planet_name)
            tp.set("link", planet_link)
            tp.set("owner_name", owner_name)
            tp.set("owner_email", owner_email)
            tp.set("url", url)

            if planet_feed:
                tp.set("feed", planet_feed)
                tp.set("feedtype", planet_feed.find('rss') >= 0 and 'rss' or 'atom')

            # Update time
            date = time.localtime()
            tp.set("date", time.strftime(date_format, date))
            tp.set("date_iso", time.strftime(TIMEFMT_ISO, date))
            tp.set("date_822", time.strftime(TIMEFMT_822, date))

            try:
                log.info("Writing %s", output_file)
                output_fd = open(output_file, "w")
                if encoding.lower() in ("utf-8", "utf8"):
                    # UTF-8 output is the default because we use that internally
                    output_fd.write(tp.process(template))
                elif encoding.lower() in ("xml", "html", "sgml"):
                    # Magic for Python 2.3 users: escape non-ASCII characters
                    # as numeric character references
                    output = tp.process(template).decode("utf-8")
                    output_fd.write(output.encode("ascii", "xmlcharrefreplace"))
                else:
                    # Must be a "known" encoding
                    output = tp.process(template).decode("utf-8")
                    output_fd.write(output.encode(encoding, "replace"))
                output_fd.close()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Write of %s failed", output_file)
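
    # Encoding behaviour sketch (illustrative): with encoding = "xml" a
    # non-ASCII character is written as a numeric character reference,
    # which stays valid whatever charset the document declares.
    #
    #     >>> u"caf\xe9".encode("ascii", "xmlcharrefreplace")
    #     'caf&#233;'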

    def channels(self, hidden=0, sorted=1):
        """Return the list of channels."""
        channels = []
        for channel in self._channels:
            if hidden or not channel.has_key("hidden"):
                channels.append((channel.name, channel))

        if sorted:
            # Sort channel names using Turkish collation rules; this
            # requires the tr_TR.UTF-8 locale to be installed on the host
            locale.setlocale(locale.LC_ALL, "tr_TR.UTF-8")
            channels.sort(key=lambda x: locale.strxfrm(x[0]))
            locale.setlocale(locale.LC_ALL, "C")

        return [ c[-1] for c in channels ]

    def find_by_basename(self, basename):
        """Return the channel whose cache basename matches, if any."""
        for channel in self._channels:
            if basename == channel.cache_basename():
                return channel

    def subscribe(self, channel):
        """Subscribe the planet to the channel."""
        self._channels.append(channel)

    def unsubscribe(self, channel):
        """Unsubscribe the planet from the channel."""
        self._channels.remove(channel)

    def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None):
        """Return an optionally filtered list of items in the channel.

        The filters are applied in the following order:

        If hidden is true then items in hidden channels and hidden items
        will be returned.

        If sorted is true then the item list will be sorted with the newest
        first.

        If max_items is non-zero then this number of items, at most, will
        be returned.

        If max_days is non-zero then any items older than the newest by
        this number of days won't be returned.  Requires sorted=1 to work.


        The sharp-eyed will note that this looks a little strange code-wise;
        it turns out that Python gets *really* slow if we try to sort the
        actual items themselves, so we sort (timestamp, order, item) tuples
        instead.  Also we use mktime here, but it's ok because we discard
        the numbers and just need them to be relatively consistent between
        each other.
        """
        planet_filter_re = None
        if self.filter:
            planet_filter_re = re.compile(self.filter, re.I)
        planet_exclude_re = None
        if self.exclude:
            planet_exclude_re = re.compile(self.exclude, re.I)

        items = []
        seen_guids = {}
        if not channels: channels = self.channels(hidden=hidden, sorted=0)
        for channel in channels:
            for item in channel._items.values():
                if hidden or not item.has_key("hidden"):

                    channel_filter_re = None
                    if channel.filter:
                        channel_filter_re = re.compile(channel.filter,
                                                       re.I)
                    channel_exclude_re = None
                    if channel.exclude:
                        channel_exclude_re = re.compile(channel.exclude,
                                                        re.I)
                    if (planet_filter_re or planet_exclude_re
                            or channel_filter_re or channel_exclude_re):
                        title = ""
                        if item.has_key("title"):
                            title = item.title
                        content = item.get_content("content")

                        if planet_filter_re:
                            if not (planet_filter_re.search(title)
                                    or planet_filter_re.search(content)):
                                continue

                        if planet_exclude_re:
                            if (planet_exclude_re.search(title)
                                    or planet_exclude_re.search(content)):
                                continue

                        if channel_filter_re:
                            if not (channel_filter_re.search(title)
                                    or channel_filter_re.search(content)):
                                continue

                        if channel_exclude_re:
                            if (channel_exclude_re.search(title)
                                    or channel_exclude_re.search(content)):
                                continue

                    if not seen_guids.has_key(item.id):
                        seen_guids[item.id] = 1
                        items.append((time.mktime(item.date), item.order, item))

        # Sort the list
        if sorted:
            items.sort()
            items.reverse()

        # Apply max_items filter
        if len(items) and max_items:
            items = items[:max_items]

        # Apply max_days filter (86400 seconds per day)
        if len(items) and max_days:
            max_count = 0
            max_time = items[0][0] - max_days * 86400
            for item in items:
                if item[0] > max_time:
                    max_count += 1
                else:
                    items = items[:max_count]
                    break

        return [ i[-1] for i in items ]
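
# Sorting sketch for Planet.items() (illustrative): items are decorated as
# (timestamp, order, item) tuples, sorted, reversed, then undecorated, the
# classic decorate-sort-undecorate pattern.
#
#     >>> decorated = [(2000.0, "1", "older"), (3000.0, "2", "newer")]
#     >>> decorated.sort(); decorated.reverse()
#     >>> [i[-1] for i in decorated]
#     ['newer', 'older']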


class Channel(cache.CachedInfo):
    """A list of news items.

    This class represents a list of news items taken from the feed of
    a website or other source.

    Properties:
        url             URL of the feed.
        url_etag        E-Tag of the feed URL.
        url_modified    Last modified time of the feed URL.
        url_status      Last HTTP status of the feed URL.
        hidden          Channel should be hidden (True if exists).
        name            Name of the feed owner, or feed title.
        next_order      Next order number to be assigned to NewsItem.

        updated         Correct UTC-Normalised update time of the feed.
        last_updated    Correct UTC-Normalised time the feed was last updated.

        id              An identifier the feed claims is unique (*).
        title           One-line title (*).
        link            Link to the original format feed (*).
        tagline         Short description of the feed (*).
        info            Longer description of the feed (*).

        modified        Date the feed claims to have been modified (*).

        author          Name of the author (*).
        publisher       Name of the publisher (*).
        generator       Name of the feed generator (*).
        category        Category name (*).
        copyright       Copyright information for humans to read (*).
        license         Link to the licence for the content (*).
        docs            Link to the specification of the feed format (*).
        language        Primary language (*).
        errorreportsto  E-Mail address to send error reports to (*).

        image_url       URL of an associated image (*).
        image_link      Link to go with the associated image (*).
        image_title     Alternative text of the associated image (*).
        image_width     Width of the associated image (*).
        image_height    Height of the associated image (*).

        filter          A regular expression that articles must match.
        exclude         A regular expression that articles must not match.

    Properties marked (*) will only be present if the original feed
    contained them.  Note that the optional 'modified' date field is simply
    a claim made by the item and parsed from the information given; 'updated'
    (and 'last_updated') are far more reliable sources of information.

    Some feeds may define additional properties beyond those above.
    """
    IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories",
                   "url", "href", "url_etag", "url_modified", "tags",
                   "itunes_explicit")

    def __init__(self, planet, url):
        if not os.path.isdir(planet.cache_directory):
            os.makedirs(planet.cache_directory)
        cache_filename = cache.filename(planet.cache_directory, url)
        cache_file = dbhash.open(cache_filename, "c", 0666)

        cache.CachedInfo.__init__(self, cache_file, url, root=1)

        self._items = {}
        self._planet = planet
        self._expired = []
        self.url = url
        # Retain the original URL for error reporting
        self.configured_url = url
        self.url_etag = None
        self.url_status = None
        self.url_modified = None
        self.name = None
        self.updated = None
        self.last_updated = None
        self.filter = None
        self.exclude = None
        self.next_order = "0"
        self.cache_read()
        self.cache_read_entries()

        # Per-feed config options override the cached values
        if planet.config.has_section(url):
            for option in planet.config.options(url):
                value = planet.config.get(url, option)
                self.set_as_string(option, value, cached=0)

    def has_item(self, id_):
        """Check whether the item exists in the channel."""
        return self._items.has_key(id_)

    def get_item(self, id_):
        """Return the item from the channel."""
        return self._items[id_]

    # Special methods
    __contains__ = has_item

    def items(self, hidden=0, sorted=0):
        """Return the item list."""
        items = []
        for item in self._items.values():
            if hidden or not item.has_key("hidden"):
                items.append((time.mktime(item.date), item.order, item))

        if sorted:
            items.sort()
            items.reverse()

        return [ i[-1] for i in items ]

    def __iter__(self):
        """Iterate the sorted item list."""
        return iter(self.items(sorted=1))

    def cache_read_entries(self):
        """Read entry information from the cache."""
        keys = self._cache.keys()
        for key in keys:
            if key.find(" ") != -1: continue
            if self.has_key(key): continue

            item = NewsItem(self, key)
            self._items[key] = item

    def cache_basename(self):
        return cache.filename('', self._id)

    def cache_write(self, sync=1):
        """Write channel and item information to the cache."""
        for item in self._items.values():
            item.cache_write(sync=0)
        for item in self._expired:
            item.cache_clear(sync=0)
        cache.CachedInfo.cache_write(self, sync)

        self._expired = []

    def feed_information(self):
        """Return a description string for the feed embedded in this channel.

        This will usually simply be the feed url embedded in <>, but in the
        case where the current self.url has changed from the original
        self.configured_url the string will contain both pieces of
        information.  This is so that the URL in question is easier to find
        in logging output: getting an error about a URL that doesn't appear
        in your config file is annoying.
        """
        if self.url == self.configured_url:
            return "<%s>" % self.url
        else:
            return "<%s> (formerly <%s>)" % (self.url, self.configured_url)

    def update(self):
        """Download the feed to refresh the information.

        This does the actual work of pulling down the feed and if it changes
        updates the cached information about the feed and entries within it.
        """
        info = feedparser.parse(self.url,
                                etag=self.url_etag, modified=self.url_modified,
                                agent=self._planet.user_agent)
        if info.has_key("status"):
            self.url_status = str(info.status)
        elif info.has_key("entries") and len(info.entries) > 0:
            self.url_status = str(200)
        elif info.bozo and info.bozo_exception.__class__.__name__ == 'Timeout':
            self.url_status = str(408)
        else:
            self.url_status = str(500)

        if self.url_status == '301' and \
           (info.has_key("entries") and len(info.entries) > 0):
            if self.url != info.url:
                log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
                os.link(cache.filename(self._planet.cache_directory, self.url),
                        cache.filename(self._planet.cache_directory, info.url))
                self.url = info.url
        elif self.url_status == '304':
            log.info("Feed %s unchanged", self.feed_information())
            return
        elif self.url_status == '410':
            log.info("Feed %s gone", self.feed_information())
            self.cache_write()
            return
        elif self.url_status == '408':
            log.warning("Feed %s timed out", self.feed_information())
            return
        elif int(self.url_status) >= 400:
            log.error("Error %s while updating feed %s",
                      self.url_status, self.feed_information())
            return
        else:
            log.info("Updating feed %s", self.feed_information())

        self.url_etag = info.has_key("etag") and info.etag or None
        self.url_modified = info.has_key("modified") and info.modified or None
        if self.url_etag is not None:
            log.debug("E-Tag: %s", self.url_etag)
        if self.url_modified is not None:
            log.debug("Last Modified: %s",
                      time.strftime(TIMEFMT_ISO, self.url_modified))

        self.update_info(info.feed)
        self.update_entries(info.entries)
        self.cache_write()

    def update_info(self, feed):
        """Update information from the feed.

        This reads the feed information supplied by feedparser and updates
        the cached information about the feed.  These are the various
        potentially interesting properties that you might care about.
        """
        for key in feed.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                # Ignored fields
                pass
            elif feed.has_key(key + "_parsed"):
                # Ignore unparsed date fields
                pass
            elif key.endswith("_detail"):
                # Retain name and email sub-fields
                if feed[key].has_key('name') and feed[key].name:
                    self.set_as_string(key.replace("_detail", "_name"),
                                       feed[key].name)
                if feed[key].has_key('email') and feed[key].email:
                    self.set_as_string(key.replace("_detail", "_email"),
                                       feed[key].email)
            elif key == "items":
                # Ignore items field
                pass
            elif key.endswith("_parsed"):
                # Date fields
                if feed[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], feed[key])
            elif key == "image":
                # Image field: save all the information
                if feed[key].has_key("url"):
                    self.set_as_string(key + "_url", feed[key].url)
                if feed[key].has_key("link"):
                    self.set_as_string(key + "_link", feed[key].link)
                if feed[key].has_key("title"):
                    self.set_as_string(key + "_title", feed[key].title)
                if feed[key].has_key("width"):
                    self.set_as_string(key + "_width", str(feed[key].width))
                if feed[key].has_key("height"):
                    self.set_as_string(key + "_height", str(feed[key].height))
            elif isinstance(feed[key], (str, unicode)):
                # String fields
                try:
                    detail = key + '_detail'
                    if feed.has_key(detail) and feed[detail].has_key('type'):
                        if feed[detail].type == 'text/html':
                            feed[key] = sanitize.HTML(feed[key])
                        elif feed[detail].type == 'text/plain':
                            feed[key] = xml.sax.saxutils.escape(feed[key])
                    self.set_as_string(key, feed[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format",
                                  key, self.url)

    def update_entries(self, entries):
        """Update entries from the feed.

        This reads the entries supplied by feedparser and updates the
        cached information about them.  It's at this point we update
        the 'updated' timestamp and keep the old one in 'last_updated';
        these provide boundaries for acceptable entry times.

        If this is the first time a feed has been updated then most of the
        items will be marked as hidden, according to Planet.new_feed_items.

        If the feed does not contain items which, according to the sort order,
        should be there, those items are assumed to have been expired from
        the feed or replaced and are removed from the cache.
        """
        if not len(entries):
            return

        self.last_updated = self.updated
        self.updated = time.gmtime()

        new_items = []
        feed_items = []
        for entry in entries:
            # Try really hard to find some kind of unique identifier
            if entry.has_key("id"):
                entry_id = cache.utf8(entry.id)
            elif entry.has_key("link"):
                entry_id = cache.utf8(entry.link)
            elif entry.has_key("title"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.title)).hexdigest())
            elif entry.has_key("summary"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.summary)).hexdigest())
            else:
                log.error("Unable to find or generate id, entry ignored")
                continue

            # Create the item if necessary and update
            if self.has_item(entry_id):
                item = self._items[entry_id]
            else:
                item = NewsItem(self, entry_id)
                self._items[entry_id] = item
                new_items.append(item)
            item.update(entry)
            feed_items.append(entry_id)

            # Hide excess items the first time through
            if self.last_updated is None and self._planet.new_feed_items \
               and len(feed_items) > self._planet.new_feed_items:
                item.hidden = "yes"
                log.debug("Marked <%s> as hidden (new feed)", entry_id)

        # Assign order numbers in reverse
        new_items.reverse()
        for item in new_items:
            item.order = self.next_order = str(int(self.next_order) + 1)

        # Check for expired or replaced items
        feed_count = len(feed_items)
        log.debug("Items in Feed: %d", feed_count)
        for item in self.items(sorted=1):
            if feed_count < 1:
                break
            elif item.id in feed_items:
                feed_count -= 1
            elif item._channel.url_status != '226':
                del(self._items[item.id])
                self._expired.append(item)
                log.debug("Removed expired or replaced item <%s>", item.id)
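
    # Identifier fallback sketch for update_entries() (illustrative): an
    # entry with no 'id' or 'link' key is identified by the feed URL plus
    # an md5 digest of its title (or summary), e.g.
    #
    #     "http://example.org/feed.xml/" + md5.new("Entry title").hexdigest()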

    def get_name(self, key):
        """Return the key containing the name.

        The key argument is ignored; the 'name' and 'title' keys are
        tried in turn.
        """
        for key in ("name", "title"):
            if self.has_key(key) and self.key_type(key) != self.NULL:
                return self.get_as_string(key)

        return ""


class NewsItem(cache.CachedInfo):
    """An item of news.

    This class represents a single item of news on a channel.  They're
    created by members of the Channel class and accessible through it.

    Properties:
        id              Channel-unique identifier for this item.
        id_hash         Relatively short, printable cryptographic hash of id.
        date            Corrected UTC-Normalised update time, for sorting.
        order           Order in which items on the same date can be sorted.
        hidden          Item should be hidden (True if exists).

        title           One-line title (*).
        link            Link to the original format text (*).
        summary         Short first-page summary (*).
        content         Full HTML content.

        modified        Date the item claims to have been modified (*).
        issued          Date the item claims to have been issued (*).
        created         Date the item claims to have been created (*).
        expired         Date the item claims to expire (*).

        author          Name of the author (*).
        publisher       Name of the publisher (*).
        category        Category name (*).
        comments        Link to a page to enter comments (*).
        license         Link to the licence for the content (*).
        source_name     Name of the original source of this item (*).
        source_link     Link to the original source of this item (*).

    Properties marked (*) will only be present if the original feed
    contained them.  Note that the various optional date fields are
    simply claims made by the item and parsed from the information
    given; 'date' is a far more reliable source of information.

    Some feeds may define additional properties beyond those above.
    """
    IGNORE_KEYS = ("categories", "contributors", "enclosures", "links",
                   "guidislink", "date", "tags")

    def __init__(self, channel, id_):
        cache.CachedInfo.__init__(self, channel._cache, id_)

        self._channel = channel
        self.id = id_
        self.id_hash = md5.new(id_).hexdigest()
        self.date = None
        self.order = None
        self.content = None
        self.cache_read()

    def update(self, entry):
        """Update the item from the feedparser entry given."""
        for key in entry.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                # Ignored fields
                pass
            elif entry.has_key(key + "_parsed"):
                # Ignore unparsed date fields
                pass
            elif key.endswith("_detail"):
                # Retain name, email, and language sub-fields
                if entry[key].has_key('name') and entry[key].name:
                    self.set_as_string(key.replace("_detail", "_name"),
                                       entry[key].name)
                if entry[key].has_key('email') and entry[key].email:
                    self.set_as_string(key.replace("_detail", "_email"),
                                       entry[key].email)
                if entry[key].has_key('language') and entry[key].language and \
                   (not self._channel.has_key('language') or
                    entry[key].language != self._channel.language):
                    self.set_as_string(key.replace("_detail", "_language"),
                                       entry[key].language)
            elif key.endswith("_parsed"):
                # Date fields
                if entry[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], entry[key])
            elif key == "source":
                # Source field: save both url and value
                if entry[key].has_key("value"):
                    self.set_as_string(key + "_name", entry[key].value)
                if entry[key].has_key("url"):
                    self.set_as_string(key + "_link", entry[key].url)
            elif key == "content":
                # Content field: concatenate the values
                value = ""
                for item in entry[key]:
                    if item.type == 'text/html':
                        item.value = sanitize.HTML(item.value)
                    elif item.type == 'text/plain':
                        item.value = xml.sax.saxutils.escape(item.value)
                    if item.has_key('language') and item.language and \
                       (not self._channel.has_key('language') or
                        item.language != self._channel.language):
                        self.set_as_string(key + "_language", item.language)
                    value += cache.utf8(item.value)
                self.set_as_string(key, value)
            elif isinstance(entry[key], (str, unicode)):
                # String fields
                try:
                    detail = key + '_detail'
                    if entry.has_key(detail):
                        if entry[detail].has_key('type'):
                            if entry[detail].type == 'text/html':
                                entry[key] = sanitize.HTML(entry[key])
                            elif entry[detail].type == 'text/plain':
                                entry[key] = xml.sax.saxutils.escape(entry[key])
                    self.set_as_string(key, entry[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format",
                                  key, self.id)

        # Generate the date field if we need to
        self.get_date("date")

    def get_date(self, key):
        """Get (or update) the date key.

        We check whether the date the entry claims to have been changed is
        since we last updated this feed and when we pulled the feed off the
        site.

        If it is then it's probably not bogus, and we'll sort accordingly.

        If it isn't then we bound it appropriately; this ensures that
        entries appear in posting sequence but don't overlap entries
        added in previous updates and don't creep into the next one.
        """

        for other_key in ("updated", "modified", "published", "issued", "created"):
            if self.has_key(other_key):
                date = self.get_as_date(other_key)
                break
        else:
            date = None

        if date is not None:
            if date > self._channel.updated:
                # Clamp claimed dates from the future to the update time
                date = self._channel.updated
            # elif date < self._channel.last_updated:
            #     date = self._channel.updated
        elif self.has_key(key) and self.key_type(key) != self.NULL:
            return self.get_as_date(key)
        else:
            date = self._channel.updated

        self.set_as_date(key, date)
        return date
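
    # Date bounding sketch (illustrative): an entry claiming a timestamp
    # later than the channel's own update time is clamped back to that
    # update time, so bogus future dates can't pin an entry to the top.
    # time.struct_time values compare element-wise, so this amounts to:
    #
    #     claimed   = (2038, 1, 1, 0, 0, 0, 4, 1, 0)  # implausible future
    #     effective = min(claimed, channel.updated)   # what get_date stores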

    def get_content(self, key):
        """Return the key containing the content.

        The key argument is ignored; the 'content', 'tagline' and
        'summary' keys are tried in turn.
        """
        for key in ("content", "tagline", "summary"):
            if self.has_key(key) and self.key_type(key) != self.NULL:
                return self.get_as_string(key)

        return ""