#!/usr/bin/python

"""Simple code to merge items from an arbitrary list of RSS feeds.

TODO:
  currently assumes everything is RSS 2.0
  only numeric timezone offsets (e.g. +0000) in pubDate are handled;
    zone names such as GMT are not
  no support for Atom


Design
 get list of rss feeds
 loop over list
   download rss feed
     use caching?
     use etag/304 support?
   import rss feed into xml node tree
 sort xml node tree by date
 output new rss feed from xml node tree
"""

import re
import sys
from datetime import datetime
from datetime import timedelta
from datetime import tzinfo
from xml.dom import minidom

# strptime-style pattern for an RFC 2822 date ending in a zone name (%Z).
RFC2822_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
# Captures, in order: weekday, day of month, month abbreviation, year, hour,
# minute, second and a numeric UTC offset such as +0000 or -0800.  The whole
# string must match; a zone name in place of the numeric offset does not.
RFC2822_REGEX = re.compile(r'^(\w{3}), (\d{1,2}) (\w{3}) (\d{4}) '
                           r'(\d{1,2}):(\d{2}):(\d{2}) ([+-]\d+)$')
# strptime-style pattern for the same layout without any zone field.
RFC2822_NO_TZ = "%a, %d %b %Y %H:%M:%S"

def openAnything(source):
  """URI, filename, or string --> stream

  This function lets you define parsers that take any input source
  (URL, pathname to local or network file, or actual data as a string)
  and deal with it in a uniform manner.  Returned object is guaranteed
  to have all the basic stdio read methods (read, readline, readlines).
  Just .close() the object when you're done with it.

  The sources are tried in this order:
    1. an object that already has a read() method is returned unchanged
    2. the string '-' returns sys.stdin
    3. urlopen(source)
    4. open(source)
    5. otherwise str(source) is wrapped in a StringIO

  Works on both Python 2 and Python 3.

  Examples:
  >>> from xml.dom import minidom
  >>> sock = openAnything("http://localhost/kant.xml")
  >>> doc = minidom.parse(sock)
  >>> sock.close()
  >>> sock = openAnything("c:\\inetpub\\wwwroot\\kant.xml")
  >>> doc = minidom.parse(sock)
  >>> sock.close()
  >>> sock = openAnything("<ref id='conjunction'><text>and</text><text>or</text></ref>")
  >>> doc = minidom.parse(sock)
  >>> sock.close()

  taken from "Dive Into Python" by Mark Pilgrim
  """
  if hasattr(source, "read"):
    return source

  if source == '-':
    return sys.stdin

  # urlopen and StringIO live in different modules on Python 2 and Python 3;
  # import whichever one this interpreter provides.
  try:
    from urllib.request import urlopen
  except ImportError:
    from urllib import urlopen
  try:
    from StringIO import StringIO
  except ImportError:
    from io import StringIO

  # try to open with urllib (if source is http, ftp, or file URL)
  # Python 3's urlopen raises ValueError for a string without a URL scheme,
  # so that must fall through to the next strategy as well.
  try:
    return urlopen(source)
  except (IOError, OSError, ValueError):
    pass

  # try to open with native open function (if source is pathname)
  try:
    return open(source)
  except (IOError, OSError, ValueError):
    pass

  # treat source as string
  return StringIO(str(source))


class FixedOffsetTZ(tzinfo):
  """Fixed-offset tzinfo built from an RFC2822 style offset string.

  The offset string has the form [+-]HHMM, for example "+0530" or "-0800".
  """

  def __init__(self, offset_string):
    """Parse offset_string and store it as a timedelta.

    Args:
      offset_string: text starting with a sign, one or two hour digits and
        two minute digits.

    Raises:
      ValueError: if offset_string does not start with such an offset.
    """
    # convert offset from [+-]HHMM to int(minutes)
    match = re.match(r'^([+-])(\d{1,2})(\d{2})', offset_string)
    if match is None:
      raise ValueError("not a [+-]HHMM timezone offset: %r" %
                       (offset_string,))
    sign, hours, minutes = match.groups()
    offset = int(hours) * 60 + int(minutes)
    if sign == '-':
      offset = -offset
    self.__offset = timedelta(minutes=offset)
    # the name is simply the string form of the offset timedelta
    self.__name = str(self.__offset)

  def utcoffset(self, dt):
    """Return the fixed offset from UTC as a timedelta."""
    return self.__offset

  def tzname(self, dt):
    """Return the zone name derived from the offset."""
    return self.__name

  def dst(self, dt):
    """Return a zero timedelta: a fixed offset has no DST adjustment.

    datetime requires dst() to return a timedelta or None; returning the
    int 0 makes datetime.dst() raise TypeError.
    """
    return timedelta(0)


def pubDate_to_datetime(time_string):
  """Convert an RSS pubDate string into a timezone-aware datetime.

  The string is picked apart with RFC2822_REGEX rather than strptime, so it
  must look like "Tue, 03 Jun 2008 11:05:30 +0000": an English weekday and
  month abbreviation and a numeric UTC offset.  Surrounding whitespace is
  ignored.

  Args:
    time_string: the text of a <pubDate> element.

  Returns:
    A datetime whose tzinfo is a FixedOffsetTZ built from the offset.

  Raises:
    ValueError: if the string does not match the expected layout or names
      an unknown month.
  """
  months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
            'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

  match = RFC2822_REGEX.match(time_string.strip())
  if match is None:
    raise ValueError("string does not match RFC2822 time format: %r" %
                     (time_string,))

  # the weekday is captured by the regex but not needed to build the date
  (_weekday, day, month, year,
   hour, minute, second, offset) = match.groups()

  if month not in months:
    raise ValueError("unknown month %r in time string: %r" %
                     (month, time_string))

  tz = FixedOffsetTZ(offset)
  return datetime(int(year), months[month], int(day),
      int(hour), int(minute), int(second), 0, tz)

def importFile(file):
  """Load one RSS feed and group its items by publication date.

  Args:
    file: anything openAnything() accepts (stream, '-', URL, filename or
      a string of XML).

  Returns:
    A dict mapping the datetime parsed from each item's <pubDate> to the
    list of <item> elements carrying that date.  Items without a non-empty
    <pubDate> cannot be ordered and are left out.
  """
  f = openAnything(file)
  try:
    xmldoc = minidom.parse(f).documentElement
  finally:
    # release the stream even when the feed is not well-formed XML
    f.close()

  merged_items = {}

  for pubDate in xmldoc.getElementsByTagName('pubDate'):
    item = pubDate.parentNode
    # getElementsByTagName searches the whole tree, so it also finds a
    # <pubDate> placed directly under <channel>; only <item> parents are
    # feed entries.
    if item.nodeName != 'item':
      continue

    text_node = pubDate.firstChild
    if text_node is None or not (text_node.nodeValue or '').strip():
      # empty <pubDate/>: nothing to sort this item by
      continue

    # interpret the pubDate text as a datetime
    key = pubDate_to_datetime(text_node.nodeValue.strip())
    # insert the item into the list indexed by its date
    merged_items.setdefault(key, []).append(item)

  return merged_items

def createRSSDocument(title='title', link='link', description='desc'):
  """Build an RSS 2.0 document containing a channel and no items.

  Args:
    title: text for the channel <title>.
    link: text for the channel <link>.
    description: text for the channel <description>; stored as CDATA.

  Returns:
    A minidom Document whose root is <rss> with a single <channel> child.

  The values are plain text: markup characters in title and link are
  escaped, and a "]]>" inside description is split across two CDATA
  sections, so any value yields a well-formed document.
  """
  from xml.sax.saxutils import escape

  template = """<?xml version='1.0' encoding='UTF-8'?><rss
  xmlns:atom='http://www.w3.org/2005/Atom'
  xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'
  xmlns:exif='http://schemas.google.com/photos/exif/2007'
  xmlns:geo='http://www.w3.org/2003/01/geo/wgs84_pos#'
  xmlns:gml='http://www.opengis.net/gml'
  xmlns:georss='http://www.georss.org/georss'
  xmlns:photo='http://www.pheed.com/pheed/'
  xmlns:media='http://search.yahoo.com/mrss/'
  xmlns:batch='http://schemas.google.com/gdata/batch'
  xmlns:gphoto='http://schemas.google.com/photos/2007' version='2.0'>
  <channel>
  <title>%(title)s</title>
  <link>%(link)s</link>
  <description><![CDATA[%(description)s]]></description>
  </channel>
  </rss>"""

  # "%s" % (value,) renders the value exactly as the template's %s would,
  # so non-string arguments keep working before escaping is applied.
  values = {
    'title': escape("%s" % (title,)),
    'link': escape("%s" % (link,)),
    # a literal "]]>" would end the CDATA section early; close the section
    # after "]]" and reopen a new one for the ">"
    'description': ("%s" % (description,)).replace(']]>',
                                                   ']]]]><![CDATA[>'),
  }

  return minidom.parseString(template % values)

def addItem(xmldoc, **kwargs):
  """Append a new <item> to the first <channel> of an RSS document.

  Expects the keyword arguments title, guid, link, pubDate and
  description.  The first four become plain text children of the item;
  description is stored as a CDATA section.  Children are added in that
  order and the document is modified in place.
  """
  item = xmldoc.createElement('item')
  xmldoc.getElementsByTagName('channel')[0].appendChild(item)

  # each child tag paired with the factory that wraps its value
  children = [(name, xmldoc.createTextNode)
              for name in ('title', 'guid', 'link', 'pubDate')]
  children.append(('description', xmldoc.createCDATASection))

  for name, make_content in children:
    element = xmldoc.createElement(name)
    element.appendChild(make_content(kwargs[name]))
    item.appendChild(element)

def printMergedItems(merged_items):
  """Print one "<ISO date> <title>" line per item, newest date first.

  Args:
    merged_items: dict mapping datetime keys to lists of item elements,
      as returned by importFile().

  An item with no <title>, or an empty one, is printed with a blank title.
  """
  # sorted() works on both Python 2 and Python 3 dict key views
  for key in sorted(merged_items, reverse=True):
    for item in merged_items[key]:
      titles = item.getElementsByTagName('title')
      if titles and titles[0].firstChild is not None:
        title = titles[0].firstChild.nodeValue
      else:
        title = ''
      # a single parenthesised argument prints the same on Python 2 and 3
      print("%s %s" % (key.isoformat(), title))

def createMergedDocument(merged_items):
  """Build a new RSS document holding every item, newest date first.

  Args:
    merged_items: dict mapping datetime keys to lists of item elements,
      as returned by importFile().

  Returns:
    A minidom Document created by createRSSDocument() with a deep copy of
    each item appended to its <channel>.  The source elements are left
    untouched.
  """
  xmldoc = createRSSDocument() # TODO: add title, link etc
  channel = xmldoc.getElementsByTagName('channel')[0]

  # sorted() works on both Python 2 and Python 3 dict key views
  for key in sorted(merged_items, reverse=True):
    for item in merged_items[key]:
      # the items belong to other documents; importNode makes the deep
      # copy owned by xmldoc instead of by the feed it was parsed from
      channel.appendChild(xmldoc.importNode(item, True))

  return xmldoc

if __name__ == "__main__":
  # Merge every feed named on the command line and print the combined RSS.
  merged_items = {} # dict of arrays, keyed by datetime objects
  for path in sys.argv[1:]:
    # Extend the per-date lists: dict.update() would replace the list
    # already stored for a date, dropping items from earlier feeds that
    # share a pubDate with an item in this one.
    for key, items in importFile(path).items():
      merged_items.setdefault(key, []).extend(items)

  new_doc = createMergedDocument(merged_items)

  # a single parenthesised argument prints the same on Python 2 and 3
  print(new_doc.toprettyxml())

