#!/usr/bin/python
"""Simple code to merge items from an arbitrary list of RSS feeds.
TODO:
  - currently assumes everything is RSS 2.0
  - ignores timezone in pubDate
  - no support for Atom

Design:
  - get list of RSS feeds
  - loop over list
      - download RSS feed
          - use caching?
          - use ETag/304 support?
      - import RSS feed into XML node tree
  - sort XML node tree by date
  - output new RSS feed from XML node tree
"""
import re
import sys
from datetime import datetime
from datetime import timedelta
from datetime import tzinfo
from xml.dom import minidom
# strptime()-style layouts for RFC 2822 timestamps, with and without a
# trailing zone field.
RFC2822_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
RFC2822_NO_TZ = "%a, %d %b %Y %H:%M:%S"
# Captures, in order: weekday, day of month, month name, year, hour, minute,
# second and a signed numeric UTC offset (e.g. "+0530").
RFC2822_REGEX = re.compile(
    r'^(\w{3}), (\d{1,2}) (\w{3}) (\d{4}) '
    r'(\d{1,2}):(\d{2}):(\d{2}) ([+-]\d+)$'
)
def openAnything(source):
    """Return a readable stream for a URI, a filename, or literal data.

    Resolution order:

    1. An object that already has a ``read`` attribute is returned as-is.
    2. The string ``'-'`` yields ``sys.stdin``.
    3. ``urllib.urlopen(source)`` is tried, then the built-in ``open``;
       an ``IOError``/``OSError`` from either moves on to the next step.
    4. Otherwise ``str(source)`` itself is wrapped in a ``StringIO``.

    Call ``.close()`` on the returned object when finished with it.

    Example:

    >>> from xml.dom import minidom
    >>> sock = openAnything("http://localhost/kant.xml")
    >>> doc = minidom.parse(sock)
    >>> sock.close()

    Taken from "Dive Into Python" by Mark Pilgrim.
    """
    # Already stream-like: hand it straight back.
    if hasattr(source, "read"):
        return source

    # Conventional spelling for "read from standard input".
    if source == '-':
        import sys
        return sys.stdin

    # Try the URL opener first, then the native file opener; the first one
    # that succeeds wins.
    import urllib
    for opener in (urllib.urlopen, open):
        try:
            return opener(source)
        except (IOError, OSError):
            continue

    # Nothing could open it, so treat the argument as the data itself.
    import StringIO
    return StringIO.StringIO(str(source))
class FixedOffsetTZ(tzinfo):
    """Fixed-offset tzinfo built from an RFC 2822 style offset string.

    The offset string has the form ``[+-]HHMM`` (for example ``"+0530"`` or
    ``"-0800"``); a single hour digit, as in ``"+530"``, is also accepted.
    """

    def __init__(self, offset_string):
        """Parse *offset_string* and store the offset as a timedelta.

        Raises:
            ValueError: if *offset_string* does not start with a sign
                followed by three or four digits.
        """
        parsed = re.match(r'^([+-])(\d{1,2})(\d{2})', offset_string)
        if parsed is None:
            raise ValueError("invalid RFC2822 UTC offset: %r"
                             % (offset_string,))
        sign, hours, minutes = parsed.groups()
        # convert offset from [+-]HHMM to int(minutes)
        offset = int(hours) * 60 + int(minutes)
        if sign == '-':
            offset = -offset
        self.__offset = timedelta(minutes=offset)
        # The zone name is simply the string form of the offset timedelta.
        self.__name = str(self.__offset)

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a timedelta."""
        return self.__offset

    def tzname(self, dt):
        """Return the zone name derived from the offset."""
        return self.__name

    def dst(self, dt):
        """Return a zero timedelta: a fixed offset never observes DST.

        The tzinfo protocol requires ``None`` or a ``timedelta`` here;
        returning a plain integer makes ``datetime.dst()`` and
        ``datetime.timetuple()`` raise ``TypeError``.
        """
        return timedelta(0)
def pubDate_to_datetime(time_string):
    """Convert an RSS ``pubDate`` string into a timezone-aware datetime.

    The string is parsed with ``email.utils.parsedate_tz``, so both numeric
    offsets ("Mon, 06 Sep 2009 16:45:00 +0530") and the named zones that
    RFC 822 allows ("... GMT") are understood.  A string with no zone at
    all is treated as UTC.

    Args:
        time_string: the text content of a ``pubDate`` element.

    Returns:
        A ``datetime`` whose ``tzinfo`` is a ``FixedOffsetTZ`` carrying the
        offset found in the string.

    Raises:
        ValueError: if the string cannot be parsed as an RFC 2822 date.
    """
    # Function-scope import, matching how openAnything pulls in its helpers.
    from email.utils import parsedate_tz

    parsed = parsedate_tz(time_string.strip())
    if parsed is None:
        raise ValueError("string does not match RFC2822 time format: %s" %
                         time_string)
    year, month, day, hour, minute, second = parsed[:6]

    # The tenth field is the zone offset in seconds east of UTC; it may be
    # None when the input carried no zone, which is treated as UTC here.
    offset_minutes = (parsed[9] or 0) // 60
    sign = '+'
    if offset_minutes < 0:
        sign = '-'
    hours, minutes = divmod(abs(offset_minutes), 60)
    tz = FixedOffsetTZ("%s%02d%02d" % (sign, hours, minutes))
    return datetime(year, month, day, hour, minute, second, 0, tz)
def importFile(file):
    """Parse one RSS feed and group its items by publication date.

    Args:
        file: anything ``openAnything`` accepts (URL, path, '-', stream).

    Returns:
        A dict mapping each item's ``pubDate`` (as returned by
        ``pubDate_to_datetime``) to the list of ``<item>`` DOM nodes
        published at that instant.  Items without a non-empty ``pubDate``
        are not included.
    """
    merged_items = {}
    f = openAnything(file)
    try:
        xmldoc = minidom.parse(f).documentElement
    finally:
        # Release the stream even when the feed is not well-formed XML.
        f.close()
    for pubDate in xmldoc.getElementsByTagName('pubDate'):
        item = pubDate.parentNode
        # A <channel> may carry its own <pubDate>; only those that belong
        # to an <item> identify something to merge.
        if item.nodeName != 'item':
            continue
        text_node = pubDate.firstChild
        raw = None
        if text_node is not None:
            raw = text_node.nodeValue
        # Skip empty <pubDate/> elements: there is no date to sort by.
        if not raw or not raw.strip():
            continue
        key = pubDate_to_datetime(raw.strip())
        # Several items may share one timestamp, hence a list per key.
        merged_items.setdefault(key, []).append(item)
    return merged_items
def createRSSDocument(title='title', link='link', description='desc'):
    """Build an RSS 2.0 document containing a single, item-less channel.

    Args:
        title: text for the channel's ``<title>`` element.
        link: text for the channel's ``<link>`` element.
        description: text for the channel's ``<description>`` element.

    Returns:
        A ``minidom`` Document whose root is ``<rss version="2.0">`` with
        one ``<channel>`` child holding the three elements above.
    """
    # Function-scope import, matching how openAnything pulls in its helpers.
    from xml.sax.saxutils import escape

    # Written without whitespace between tags so that no stray text nodes
    # end up in the parsed tree.
    template = ('<rss version="2.0"><channel>'
                '<title>%(title)s</title>'
                '<link>%(link)s</link>'
                '<description>%(description)s</description>'
                '</channel></rss>')
    # Escape &, < and > so arbitrary text (e.g. a URL with a query string)
    # cannot make the template ill-formed.
    values = {'title': escape(title),
              'link': escape(link),
              'description': escape(description)}
    return minidom.parseString(template % values)
def addItem(xmldoc, **kwargs):
    """Append a new ``<item>`` to the first ``<channel>`` of *xmldoc*.

    Keyword arguments ``title``, ``guid``, ``link``, ``pubDate`` and
    ``description`` are all required and become child elements of the
    item, in that order.  The description is stored as a CDATA section;
    the others are stored as plain text nodes.
    """
    first_channel = xmldoc.getElementsByTagName('channel')[0]
    item = first_channel.appendChild(xmldoc.createElement('item'))

    # Pair every child tag with the factory that builds its content node.
    fields = [(name, xmldoc.createTextNode)
              for name in ('title', 'guid', 'link', 'pubDate')]
    fields.append(('description', xmldoc.createCDATASection))

    for name, make_content in fields:
        element = xmldoc.createElement(name)
        element.appendChild(make_content(kwargs[name]))
        item.appendChild(element)
def printMergedItems(merged_items):
    """Print one "<ISO date> <title>" line per item, newest first.

    Args:
        merged_items: dict mapping datetime keys to lists of ``<item>``
            DOM nodes, as built by ``importFile``.

    Items that have no ``<title>`` element, or an empty one, are printed
    with the placeholder ``(no title)`` instead of raising.
    """
    for key in sorted(merged_items, reverse=True):
        for item in merged_items[key]:
            titles = item.getElementsByTagName('title')
            if titles and titles[0].firstChild is not None:
                title = titles[0].firstChild.nodeValue
            else:
                title = '(no title)'
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%s %s" % (key.isoformat(), title))
def createMergedDocument(merged_items):
    """Build a new RSS document holding every merged item, newest first.

    Args:
        merged_items: dict mapping datetime keys to lists of ``<item>``
            DOM nodes taken from the source feeds.

    Returns:
        The document produced by ``createRSSDocument`` with a deep copy of
        each item appended to its first ``<channel>``.  The source nodes
        are left untouched.
    """
    xmldoc = createRSSDocument()  # TODO: add title, link etc
    channel = xmldoc.getElementsByTagName('channel')[0]
    for key in sorted(merged_items, reverse=True):
        for item in merged_items[key]:
            # The items belong to other documents; importNode makes a deep
            # copy that is owned by xmldoc before it is attached.
            channel.appendChild(xmldoc.importNode(item, True))
    return xmldoc
if __name__ == "__main__":
    # dict of lists of <item> nodes, keyed by datetime objects
    merged_items = {}
    for source in sys.argv[1:]:
        # Merge list by list: a plain dict.update() would replace the
        # items already collected for a timestamp that two feeds share.
        for key, items in importFile(source).items():
            merged_items.setdefault(key, []).extend(items)
    new_doc = createMergedDocument(merged_items)
    print(new_doc.toprettyxml())