From b9604d8330de3a09749682e9888d1c08132ab25b Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Thu, 2 Nov 2006 11:59:25 -0500 Subject: [PATCH 01/39] Different approach to threading --- planet/config.py | 6 ++++ planet/feedparser.py | 23 ++++++------ planet/spider.py | 84 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/planet/config.py b/planet/config.py index 5c8ffe3..9526f36 100644 --- a/planet/config.py +++ b/planet/config.py @@ -100,6 +100,7 @@ def __init__(): define_planet('owner_email', '') define_planet('output_theme', '') define_planet('output_dir', 'output') + define_planet('spider_threads', 0) define_planet_list('template_files') define_planet_list('bill_of_materials') @@ -282,6 +283,11 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru except: logger.exception("Unable to read %s readinglist", list) +def http_cache_directory(): + if parser.has_option('Planet', 'http_cache_directory'): + parser.get('Planet', 'http_cache_directory') + else: + return os.path.join(cache_directory(), 'sources/http') def cache_sources_directory(): if parser.has_option('Planet', 'cache_sources_directory'): diff --git a/planet/feedparser.py b/planet/feedparser.py index 00675e1..7bb7c60 100755 --- a/planet/feedparser.py +++ b/planet/feedparser.py @@ -11,7 +11,7 @@ Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 1.144 $"[11:16] + "-cvs" +__version__ = "4.2-pre-" + "$Revision: 1.142 $"[11:16] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -218,9 +218,6 @@ class FeedParserDict(UserDict): def __getitem__(self, key): if key == 'category': return UserDict.__getitem__(self, 'tags')[0]['term'] - if key == 'enclosures': - norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) - return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] if key == 'categories': return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] realkey = self.keymap.get(key, key) @@ -1306,15 +1303,15 @@ class _FeedParserMixin: attrsD.setdefault('type', 'application/atom+xml') else: attrsD.setdefault('type', 'text/html') - context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) - if attrsD.get('rel')=='enclosure' and not context.get('id'): - context['id'] = attrsD.get('href') expectingText = self.infeed or self.inentry or self.insource + context = self._getContext() context.setdefault('links', []) context['links'].append(FeedParserDict(attrsD)) + if attrsD['rel'] == 'enclosure': + self._start_enclosure(attrsD) if attrsD.has_key('href'): expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): @@ -1360,7 +1357,6 @@ class _FeedParserMixin: self._start_content(attrsD) else: self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) - _start_dc_description = _start_description def _start_abstract(self, attrsD): self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) @@ -1372,7 +1368,6 @@ class _FeedParserMixin: value = self.popContent('description') self._summaryKey = None _end_abstract = _end_description - _end_dc_description = 
_end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, 'text/plain', 1) @@ -1432,8 +1427,7 @@ class _FeedParserMixin: def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) context = self._getContext() - attrsD['rel']='enclosure' - context.setdefault('links', []).append(FeedParserDict(attrsD)) + context.setdefault('enclosures', []).append(FeedParserDict(attrsD)) href = attrsD.get('href') if href and not context.get('id'): context['id'] = href @@ -3254,7 +3248,7 @@ def _stripDoctype(data): return version, data, dict(replacement and safe_pattern.findall(replacement)) -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None): '''Parse a feed from a URL, file, stream, or string''' result = FeedParserDict() result['feed'] = FeedParserDict() @@ -3263,6 +3257,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo'] = 0 if type(handlers) == types.InstanceType: handlers = [handlers] + if resp_headers: + f = None + data = url_file_stream_or_string try: f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) data = f.read() @@ -3307,6 +3304,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['status'] = f.status if hasattr(f, 'headers'): result['headers'] = f.headers.dict + if resp_headers: + result['headers'] = resp_headers if hasattr(f, 'close'): f.close() diff --git a/planet/spider.py b/planet/spider.py index 7e6d91b..ce6cbdd 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -4,7 +4,7 @@ and write each as a set of entries in a cache directory. 
""" # Standard library modules -import time, calendar, re, os +import time, calendar, re, os, urlparse from xml.dom import minidom # Planet modules import planet, config, feedparser, reconstitute, shell @@ -116,8 +116,11 @@ def scrub(feed, data): source.author_detail.has_key('name'): source.author_detail['name'] = \ str(stripHtml(source.author_detail.name)) +def _is_http_uri(uri): + parsed = urlparse.urlparse(uri) + return parsed[0] in ['http', 'https'] -def spiderFeed(feed, only_if_new=0): +def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): """ Spider (fetch) a single feed """ log = planet.logger @@ -125,6 +128,7 @@ def spiderFeed(feed, only_if_new=0): sources = config.cache_sources_directory() if not os.path.exists(sources): os.makedirs(sources, 0700) + feed_source = filename(sources, feed) feed_info = feedparser.parse(feed_source) if feed_info.feed and only_if_new: @@ -135,14 +139,17 @@ def spiderFeed(feed, only_if_new=0): return # read feed itself - modified = None - try: - modified=time.strptime( - feed_info.feed.get('planet_http_last_modified', None)) - except: - pass - data = feedparser.parse(feed_info.feed.get('planet_http_location',feed), - etag=feed_info.feed.get('planet_http_etag',None), modified=modified) + if content: + data = feedparser.parse(content, resp_headers) + else: + modified = None + try: + modified=time.strptime( + feed_info.feed.get('planet_http_last_modified', None)) + except: + pass + data = feedparser.parse(feed_info.feed.get('planet_http_location',feed), + etag=feed_info.feed.get('planet_http_etag',None), modified=modified) # capture http status if not data.has_key("status"): @@ -319,12 +326,62 @@ def spiderFeed(feed, only_if_new=0): def spiderPlanet(only_if_new = False): """ Spider (fetch) an entire planet """ log = planet.getLogger(config.log_level(),config.log_format()) - planet.setTimeout(config.feed_timeout()) global index index = True - for feed in config.subscriptions(): + if config.spider_threads(): + import Queue + from threading import Thread + import httplib2 + + work_queue = Queue() + awaiting_parsing = Queue() + + def _spider_proc(): + h = httplib2.Http(config.http_cache_directory()) + while True: + # The non-blocking get will throw an exception when the queue + # is empty which will terminate the thread. + uri = work_queue.get(block=False): + log.info("Fetching %s", uri) + (resp, content) = h.request(uri) + awaiting_parsing.put(block=True, (resp, content, uri)) + + # Load the work_queue with all the HTTP(S) uris. 
+ map(work_queue.put, [uri for uri in config.subscriptions if _is_http_uri(uri)]) + + # Start all the worker threads + threads = dict([(i, Thread(target=_spider_proc)) for i in range(config.spider_threads())]) + for t in threads.itervalues(): + t.start() + + # Process the results as they arrive + while work_queue.qsize() and awaiting_parsing.qsize() and threads: + item = awaiting_parsing.get(False) + if not item and threads: + time.sleep(1) + while item: + try: + (resp_headers, content, uri) = item + spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) + except Exception, e: + import sys, traceback + type, value, tb = sys.exc_info() + log.error('Error processing %s', feed) + for line in (traceback.format_exception_only(type, value) + + traceback.format_tb(tb)): + log.error(line.rstrip()) + item = awaiting_parsing.get(False) + for index in threads: + if not threads[index].isAlive(): + del threads[index] + + + planet.setTimeout(config.feed_timeout()) + # Process non-HTTP uris if we are threading, otherwise process *all* uris here. + unthreaded_work_queue = [uri for uri in config.subscriptions if not config.spider_threads() or not _is_http_uri(uri)] + for feed in unthreaded_work_queue: try: spiderFeed(feed, only_if_new=only_if_new) except Exception,e: @@ -334,3 +391,6 @@ def spiderPlanet(only_if_new = False): for line in (traceback.format_exception_only(type, value) + traceback.format_tb(tb)): log.error(line.rstrip()) + + + From 58bb4b6e05400491528d51d4d6df13a03d9d7649 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Thu, 2 Nov 2006 13:29:01 -0500 Subject: [PATCH 02/39] Seems to working now --- planet.py | 5 ++++- planet/spider.py | 43 ++++++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/planet.py b/planet.py index 62fb7ac..a285f6c 100755 --- a/planet.py +++ b/planet.py @@ -54,7 +54,10 @@ if __name__ == "__main__": if not offline: from planet import spider - spider.spiderPlanet(only_if_new=only_if_new) + try: + spider.spiderPlanet(only_if_new=only_if_new) + except Exception, e: + print e from planet import splice doc = splice.splice() diff --git a/planet/spider.py b/planet/spider.py index ce6cbdd..3e98365 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -330,40 +330,45 @@ def spiderPlanet(only_if_new = False): global index index = True - if config.spider_threads(): - import Queue + if int(config.spider_threads()): + from Queue import Queue, Empty from threading import Thread import httplib2 work_queue = Queue() awaiting_parsing = Queue() - def _spider_proc(): + def _spider_proc(thread_index): h = httplib2.Http(config.http_cache_directory()) - while True: - # The non-blocking get will throw an exception when the queue - # is empty which will terminate the thread. - uri = work_queue.get(block=False): - log.info("Fetching %s", uri) - (resp, content) = h.request(uri) - awaiting_parsing.put(block=True, (resp, content, uri)) + try: + while True: + # The non-blocking get will throw an exception when the queue + # is empty which will terminate the thread. + uri = work_queue.get(block=False) + log.info("Fetching %s via %d", uri, thread_index) + (resp, content) = h.request(uri) + awaiting_parsing.put(block=True, item=(resp, content, uri)) + except Empty, e: + log.info("Thread %d finished", thread_index) + pass # Load the work_queue with all the HTTP(S) uris. 
- map(work_queue.put, [uri for uri in config.subscriptions if _is_http_uri(uri)]) + map(work_queue.put, [uri for uri in config.subscriptions() if _is_http_uri(uri)]) # Start all the worker threads - threads = dict([(i, Thread(target=_spider_proc)) for i in range(config.spider_threads())]) + threads = dict([(i, Thread(target=_spider_proc, args=(i,))) for i in range(int(config.spider_threads()))]) for t in threads.itervalues(): t.start() # Process the results as they arrive - while work_queue.qsize() and awaiting_parsing.qsize() and threads: - item = awaiting_parsing.get(False) - if not item and threads: + while work_queue.qsize() or awaiting_parsing.qsize() or threads: + if awaiting_parsing.qsize() == 0 and threads: time.sleep(1) - while item: + while awaiting_parsing.qsize(): + item = awaiting_parsing.get(False) try: (resp_headers, content, uri) = item + log.info("Parsing pre-fetched %s", uri) spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) except Exception, e: import sys, traceback @@ -372,15 +377,15 @@ def spiderPlanet(only_if_new = False): for line in (traceback.format_exception_only(type, value) + traceback.format_tb(tb)): log.error(line.rstrip()) - item = awaiting_parsing.get(False) - for index in threads: + for index in threads.keys(): if not threads[index].isAlive(): del threads[index] + log.info("Finished threaded part of processing.") planet.setTimeout(config.feed_timeout()) # Process non-HTTP uris if we are threading, otherwise process *all* uris here. - unthreaded_work_queue = [uri for uri in config.subscriptions if not config.spider_threads() or not _is_http_uri(uri)] + unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)] for feed in unthreaded_work_queue: try: spiderFeed(feed, only_if_new=only_if_new) From 217e850e41147d3c9274c0bf91b78fcf27ec8971 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Thu, 2 Nov 2006 14:48:47 -0500 Subject: [PATCH 03/39] Still having problems with channel_name. --- planet/feedparser.py | 16 +++++++++++----- planet/spider.py | 8 ++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/planet/feedparser.py b/planet/feedparser.py index 7bb7c60..d24d82a 100755 --- a/planet/feedparser.py +++ b/planet/feedparser.py @@ -11,7 +11,7 @@ Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 1.142 $"[11:16] + "-cvs" +__version__ = "4.2-pre-" + "$Revision: 1.145 $"[11:16] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, @@ -218,6 +218,9 @@ class FeedParserDict(UserDict): def __getitem__(self, key): if key == 'category': return UserDict.__getitem__(self, 'tags')[0]['term'] + if key == 'enclosures': + norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) + return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] if key == 'categories': return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] realkey = self.keymap.get(key, key) @@ -1303,15 +1306,15 @@ class _FeedParserMixin: attrsD.setdefault('type', 'application/atom+xml') else: attrsD.setdefault('type', 'text/html') + context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) + if attrsD.get('rel')=='enclosure' and not context.get('id'): + context['id'] = attrsD.get('href') expectingText = self.infeed or self.inentry or self.insource - context = self._getContext() context.setdefault('links', []) context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': - self._start_enclosure(attrsD) if attrsD.has_key('href'): expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): @@ -1357,6 +1360,7 @@ class _FeedParserMixin: self._start_content(attrsD) else: self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) + _start_dc_description = _start_description def _start_abstract(self, attrsD): self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) @@ -1368,6 +1372,7 @@ class _FeedParserMixin: value = self.popContent('description') self._summaryKey = None _end_abstract = _end_description + _end_dc_description = _end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, 'text/plain', 1) @@ -1427,7 +1432,8 @@ class _FeedParserMixin: def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) context = self._getContext() - context.setdefault('enclosures', []).append(FeedParserDict(attrsD)) + attrsD['rel']='enclosure' + context.setdefault('links', []).append(FeedParserDict(attrsD)) href = attrsD.get('href') if href and not context.get('id'): context['id'] = href diff --git a/planet/spider.py b/planet/spider.py index 3e98365..2ffad1e 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -140,7 +140,7 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): # read feed itself if content: - data = feedparser.parse(content, resp_headers) + data = feedparser.parse(content, resp_headers=resp_headers) else: modified = None try: @@ -338,8 +338,12 @@ def spiderPlanet(only_if_new = False): work_queue = Queue() awaiting_parsing = Queue() + http_cache = config.http_cache_directory() + if not os.path.exists(http_cache): + os.makedirs(http_cache, 0700) + def _spider_proc(thread_index): - h = httplib2.Http(config.http_cache_directory()) + h = httplib2.Http(http_cache) try: while True: # The non-blocking get will throw an exception when the queue From b2ccc8c1ff6be8e6d2030c67463b3b9e773348e0 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Fri, 3 Nov 2006 11:40:16 -0500 Subject: [PATCH 04/39] added 304 checking before calling spiderFeed() --- planet/spider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/planet/spider.py b/planet/spider.py index 2ffad1e..3bf0d7b 
100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -372,8 +372,9 @@ def spiderPlanet(only_if_new = False): item = awaiting_parsing.get(False) try: (resp_headers, content, uri) = item - log.info("Parsing pre-fetched %s", uri) - spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) + if not resp_headers.fromcache: + log.info("Parsing pre-fetched %s", uri) + spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) except Exception, e: import sys, traceback type, value, tb = sys.exc_info() From 72318e770b929c3308f48303e5edd7bacfe52968 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Fri, 3 Nov 2006 11:45:34 -0500 Subject: [PATCH 05/39] Documented spider_threads --- docs/config.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/config.html b/docs/config.html index b20d28c..b201b26 100644 --- a/docs/config.html +++ b/docs/config.html @@ -98,6 +98,9 @@ use for logging output. Note: this configuration value is processed
Number of seconds to wait for any given feed
new_feed_items
Number of items to take from new feeds
+spider_threads
+The number of threads to use when spidering. When set to 0, the default,
+no threads are used and spidering follows the traditional algorithm.
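
With spider_threads set to a value greater than zero (for example, "spider_threads = 2" under [Planet] in the ini file), the patches above split the work into a threaded fetch phase and a single-threaded parse phase. A minimal sketch of that queue discipline follows, assuming Python 2 and only the standard library; urllib2 stands in for httplib2 here, and names such as fetch_worker and spider_with_threads are illustrative, not Planet's own:

    from Queue import Queue, Empty
    from threading import Thread
    import urllib2

    def spider_with_threads(uris, thread_count):
        work_queue = Queue()   # URIs still to be fetched
        fetched = Queue()      # (uri, headers, body) tuples awaiting parsing
        map(work_queue.put, uris)

        def fetch_worker():
            try:
                while True:
                    # get(block=False) raises Empty once the queue is drained,
                    # which is what terminates the worker thread.
                    uri = work_queue.get(block=False)
                    f = urllib2.urlopen(uri)
                    fetched.put((uri, f.info(), f.read()))
            except Empty:
                pass

        threads = [Thread(target=fetch_worker) for i in range(thread_count)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Parsing stays in the main thread: drain the results queue here.
        results = []
        while not fetched.empty():
            results.append(fetched.get())
        return results

The real spider interleaves parsing with fetching instead of joining first, and adds per-thread error handling, but the work-queue/results-queue shape is the same.
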
From 796da216b2eeca13bf16b51e5e7951965a845cce Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Fri, 3 Nov 2006 12:00:27 -0500 Subject: [PATCH 06/39] Added httplib2 --- httplib2/__init__.py | 820 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 820 insertions(+) create mode 100644 httplib2/__init__.py diff --git a/httplib2/__init__.py b/httplib2/__init__.py new file mode 100644 index 0000000..83421b4 --- /dev/null +++ b/httplib2/__init__.py @@ -0,0 +1,820 @@ +""" +httplib2 + +A caching http interface that supports ETags and gzip +to conserve bandwidth. + +Requires Python 2.3 or later + +""" + +__author__ = "Joe Gregorio (joe@bitworking.org)" +__copyright__ = "Copyright 2006, Joe Gregorio" +__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", + "James Antill", + "Xavier Verges Farrero", + "Jonathan Feinberg", + "Blair Zajac"] +__license__ = "MIT" +__version__ = "$Rev: 204 $" + +import re +import md5 +import rfc822 +import StringIO +import gzip +import zlib +import httplib +import urlparse +import base64 +import os +import copy +import calendar +import time +import random +import sha +import hmac +from gettext import gettext as _ + +__all__ = ['Http', 'Response', 'HttpLib2Error', + 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent', + 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError', + 'debuglevel'] + + +# The httplib debug level, set to a non-zero value to get debug output +debuglevel = 0 + +# Python 2.3 support +if 'sorted' not in __builtins__: + def sorted(seq): + seq.sort() + return seq + +# Python 2.3 support +def HTTPResponse__getheaders(self): + """Return list of (header, value) tuples.""" + if self.msg is None: + print "================================" + raise httplib.ResponseNotReady() + return self.msg.items() + +if not hasattr(httplib.HTTPResponse, 'getheaders'): + httplib.HTTPResponse.getheaders = HTTPResponse__getheaders + +# All exceptions raised here derive from HttpLib2Error +class HttpLib2Error(Exception): pass + +class RedirectMissingLocation(HttpLib2Error): pass +class RedirectLimit(HttpLib2Error): pass +class FailedToDecompressContent(HttpLib2Error): pass +class UnimplementedDigestAuthOptionError(HttpLib2Error): pass +class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass + +# Open Items: +# ----------- +# Proxy support + +# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?) + +# Pluggable cache storage (supports storing the cache in +# flat files by default. We need a plug-in architecture +# that can support Berkeley DB and Squid) + +# == Known Issues == +# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator. +# Does not handle Cache-Control: max-stale +# Does not use Age: headers when calculating cache freshness. + + +# The number of redirections to follow before giving up. +# Note that only GET redirects are automatically followed. +# Will also honor 301 requests by saving that info and never +# requesting that URI again. 
+DEFAULT_MAX_REDIRECTS = 5 + +# Which headers are hop-by-hop headers by default +HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade'] + +def _get_end2end_headers(response): + hopbyhop = list(HOP_BY_HOP) + hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')]) + return [header for header in response.keys() if header not in hopbyhop] + +URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") + +def parse_uri(uri): + """Parses a URI using the regex given in Appendix B of RFC 3986. + + (scheme, authority, path, query, fragment) = parse_uri(uri) + """ + groups = URI.match(uri).groups() + return (groups[1], groups[3], groups[4], groups[6], groups[8]) + +NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+') +def _normalize_headers(headers): + return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()]) + +def _parse_cache_control(headers): + retval = {} + if headers.has_key('cache-control'): + parts = headers['cache-control'].split(',') + parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")] + parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")] + retval = dict(parts_with_args + parts_wo_args) + return retval + +# Whether to use a strict mode to parse WWW-Authenticate headers +# Might lead to bad results in case of ill-formed header value, +# so disabled by default, falling back to relaxed parsing. +# Set to true to turn on, usefull for testing servers. +USE_WWW_AUTH_STRICT_PARSING = 0 + +# In regex below: +# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP +# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space +# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both: +# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?@,;:\\\"/[\]?={} \t]+(?!\"))\"? +WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$") +WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(? 
current_age: + retval = "FRESH" + return retval + +def _decompressContent(response, new_content): + content = new_content + try: + if response.get('content-encoding', None) == 'gzip': + content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read() + if response.get('content-encoding', None) == 'deflate': + content = zlib.decompress(content) + except: + content = "" + raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding')) + return content + +def _updateCache(request_headers, response_headers, content, cache, cachekey): + if cachekey: + cc = _parse_cache_control(request_headers) + cc_response = _parse_cache_control(response_headers) + if cc.has_key('no-store') or cc_response.has_key('no-store'): + cache.delete(cachekey) + else: + f = StringIO.StringIO("") + info = rfc822.Message(StringIO.StringIO("")) + for key, value in response_headers.iteritems(): + info[key] = value + f.write(str(info)) + f.write("\r\n\r\n") + f.write(content) + cache.set(cachekey, f.getvalue()) + +def _cnonce(): + dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest() + return dig[:16] + +def _wsse_username_token(cnonce, iso_now, password): + return base64.encodestring(sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()).strip() + + +# For credentials we need two things, first +# a pool of credential to try (not necesarily tied to BAsic, Digest, etc.) +# Then we also need a list of URIs that have already demanded authentication +# That list is tricky since sub-URIs can take the same auth, or the +# auth scheme may change as you descend the tree. +# So we also need each Auth instance to be able to tell us +# how close to the 'top' it is. + +class Authentication: + def __init__(self, credentials, host, request_uri, headers, response, content, http): + (scheme, authority, path, query, fragment) = parse_uri(request_uri) + self.path = path + self.host = host + self.credentials = credentials + self.http = http + + def depth(self, request_uri): + (scheme, authority, path, query, fragment) = parse_uri(request_uri) + return request_uri[len(self.path):].count("/") + + def inscope(self, host, request_uri): + # XXX Should we normalize the request_uri? + (scheme, authority, path, query, fragment) = parse_uri(request_uri) + return (host == self.host) and path.startswith(self.path) + + def request(self, method, request_uri, headers, content): + """Modify the request headers to add the appropriate + Authorization header. Over-rise this in sub-classes.""" + pass + + def response(self, response, content): + """Gives us a chance to update with new nonces + or such returned from the last authorized response. + Over-rise this in sub-classes if necessary. + + Return TRUE is the request is to be retried, for + example Digest may return stale=true. 
+ """ + return False + + + +class BasicAuthentication(Authentication): + def __init__(self, credentials, host, request_uri, headers, response, content, http): + Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) + + def request(self, method, request_uri, headers, content): + """Modify the request headers to add the appropriate + Authorization header.""" + headers['authorization'] = 'Basic ' + base64.encodestring("%s:%s" % self.credentials).strip() + + +class DigestAuthentication(Authentication): + """Only do qop='auth' and MD5, since that + is all Apache currently implements""" + def __init__(self, credentials, host, request_uri, headers, response, content, http): + Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) + challenge = _parse_www_authenticate(response, 'www-authenticate') + self.challenge = challenge['digest'] + qop = self.challenge.get('qop') + self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None + if self.challenge['qop'] is None: + raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop)) + self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5') + if self.challenge['algorithm'] != 'MD5': + raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm'])) + self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]]) + self.challenge['nc'] = 1 + + def request(self, method, request_uri, headers, content, cnonce = None): + """Modify the request headers""" + H = lambda x: md5.new(x).hexdigest() + KD = lambda s, d: H("%s:%s" % (s, d)) + A2 = "".join([method, ":", request_uri]) + self.challenge['cnonce'] = cnonce or _cnonce() + request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'], + '%08x' % self.challenge['nc'], + self.challenge['cnonce'], + self.challenge['qop'], H(A2) + )) + headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % ( + self.credentials[0], + self.challenge['realm'], + self.challenge['nonce'], + request_uri, + self.challenge['algorithm'], + request_digest, + self.challenge['qop'], + self.challenge['nc'], + self.challenge['cnonce'], + ) + self.challenge['nc'] += 1 + + def response(self, response, content): + if not response.has_key('authentication-info'): + challenge = _parse_www_authenticate(response, 'www-authenticate')['digest'] + if 'true' == challenge.get('stale'): + self.challenge['nonce'] = challenge['nonce'] + self.challenge['nc'] = 1 + return True + else: + updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest'] + + if updated_challenge.has_key('nextnonce'): + self.challenge['nonce'] = updated_challenge['nextnonce'] + self.challenge['nc'] = 1 + return False + + +class HmacDigestAuthentication(Authentication): + """Adapted from Robert Sayre's code and DigestAuthentication above.""" + __author__ = "Thomas Broyer (t.broyer@ltgt.net)" + + def __init__(self, credentials, host, request_uri, headers, response, content, http): + Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) + challenge = _parse_www_authenticate(response, 'www-authenticate') + self.challenge = challenge['hmacdigest'] + print self.challenge + # TODO: self.challenge['domain'] + self.challenge['reason'] = self.challenge.get('reason', 'unauthorized') + if 
self.challenge['reason'] not in ['unauthorized', 'integrity']: + self.challenge['reason'] = 'unauthorized' + self.challenge['salt'] = self.challenge.get('salt', '') + if not self.challenge.get('snonce'): + raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty.")) + self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1') + if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']: + raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm'])) + self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1') + if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']: + raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm'])) + if self.challenge['algorithm'] == 'HMAC-MD5': + self.hashmod = md5 + else: + self.hashmod = sha + if self.challenge['pw-algorithm'] == 'MD5': + self.pwhashmod = md5 + else: + self.pwhashmod = sha + self.key = "".join([self.credentials[0], ":", + self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(), + ":", self.challenge['realm'] + ]) + print response['www-authenticate'] + print "".join([self.credentials[1], self.challenge['salt']]) + print "key_str = %s" % self.key + self.key = self.pwhashmod.new(self.key).hexdigest().lower() + + def request(self, method, request_uri, headers, content): + """Modify the request headers""" + keys = _get_end2end_headers(headers) + keylist = "".join(["%s " % k for k in keys]) + headers_val = "".join([headers[k] for k in keys]) + created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime()) + cnonce = _cnonce() + request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val) + print "key = %s" % self.key + print "msg = %s" % request_digest + request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower() + headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % ( + self.credentials[0], + self.challenge['realm'], + self.challenge['snonce'], + cnonce, + request_uri, + created, + request_digest, + keylist, + ) + + def response(self, response, content): + challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {}) + if challenge.get('reason') in ['integrity', 'stale']: + return True + return False + + +class WsseAuthentication(Authentication): + """This is thinly tested and should not be relied upon. + At this time there isn't any third party server to test against. 
+ Blogger and TypePad implemented this algorithm at one point + but Blogger has since switched to Basic over HTTPS and + TypePad has implemented it wrong, by never issuing a 401 + challenge but instead requiring your client to telepathically know that + their endpoint is expecting WSSE profile="UsernameToken".""" + def __init__(self, credentials, host, request_uri, headers, response, content, http): + Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) + + def request(self, method, request_uri, headers, content): + """Modify the request headers to add the appropriate + Authorization header.""" + headers['Authorization'] = 'WSSE profile="UsernameToken"' + iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + cnonce = _cnonce() + password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1]) + headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % ( + self.credentials[0], + password_digest, + cnonce, + iso_now) + +class GoogleLoginAuthentication(Authentication): + def __init__(self, credentials, host, request_uri, headers, response, content, http): + from urllib import urlencode + Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) + + auth = dict(Email=credentials[0], Passwd=credentials[1], service='cl', source=headers['user-agent']) + resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + lines = content.split('\n') + d = dict([tuple(line.split("=", 1)) for line in lines if line]) + if resp.status == 403: + self.Auth = "" + else: + self.Auth = d['Auth'] + + def request(self, method, request_uri, headers, content): + """Modify the request headers to add the appropriate + Authorization header.""" + headers['authorization'] = 'GoogleLogin Auth=' + self.Auth + + +AUTH_SCHEME_CLASSES = { + "basic": BasicAuthentication, + "wsse": WsseAuthentication, + "digest": DigestAuthentication, + "hmacdigest": HmacDigestAuthentication, + "googlelogin": GoogleLoginAuthentication +} + +AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"] + + +class FileCache: + """Uses a local directory as a store for cached files. + Not really safe to use if multiple threads or processes are going to + be running on the same cache. + """ + def __init__(self, cache): + self.cache = cache + if not os.path.exists(cache): + os.makedirs(self.cache) + + def get(self, key): + retval = None + cacheFullPath = os.path.join(self.cache, key) + try: + f = file(cacheFullPath, "r") + retval = f.read() + f.close() + except: + pass + return retval + + def set(self, key, value): + cacheFullPath = os.path.join(self.cache, key) + f = file(cacheFullPath, "w") + f.write(value) + f.close() + + def delete(self, key): + cacheFullPath = os.path.join(self.cache, key) + if os.path.exists(cacheFullPath): + os.remove(cacheFullPath) + +class Http: + """An HTTP client that handles all + methods, caching, ETags, compression, + HTTPS, Basic, Digest, WSSE, etc. + """ + def __init__(self, cache=None): + # Map domain name to an httplib connection + self.connections = {} + # The location of the cache, for now a directory + # where cached responses are held. 
+ if cache and isinstance(cache, str): + self.cache = FileCache(cache) + else: + self.cache = cache + + # tuples of name, password + self.credentials = [] + + # authorization objects + self.authorizations = [] + + self.follow_all_redirects = False + + self.ignore_etag = False + + def _auth_from_challenge(self, host, request_uri, headers, response, content): + """A generator that creates Authorization objects + that can be applied to requests. + """ + challenges = _parse_www_authenticate(response, 'www-authenticate') + for cred in self.credentials: + for scheme in AUTH_SCHEME_ORDER: + if challenges.has_key(scheme): + yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self) + + def add_credentials(self, name, password): + """Add a name and password that will be used + any time a request requires authentication.""" + self.credentials.append((name, password)) + + def clear_credentials(self): + """Remove all the names and passwords + that are used for authentication""" + self.credentials = [] + self.authorizations = [] + + def _conn_request(self, conn, request_uri, method, body, headers): + for i in range(2): + try: + conn.request(method, request_uri, body, headers) + response = conn.getresponse() + except: + if i == 0: + conn.close() + conn.connect() + continue + else: + raise + else: + content = response.read() + response = Response(response) + content = _decompressContent(response, content) + + break; + return (response, content) + + + def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey): + """Do the actual request using the connection object + and also follow one level of redirects if necessary""" + + auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)] + auth = auths and sorted(auths)[0][1] or None + if auth: + auth.request(method, request_uri, headers, body) + + (response, content) = self._conn_request(conn, request_uri, method, body, headers) + + if auth: + if auth.response(response, body): + auth.request(method, request_uri, headers, body) + (response, content) = self._conn_request(conn, request_uri, method, body, headers ) + response._stale_digest = 1 + + if response.status == 401: + for authorization in self._auth_from_challenge(host, request_uri, headers, response, content): + authorization.request(method, request_uri, headers, body) + (response, content) = self._conn_request(conn, request_uri, method, body, headers, ) + if response.status != 401: + self.authorizations.append(authorization) + authorization.response(response, body) + break + + if (self.follow_all_redirects or method in ["GET", "HEAD"]) or response.status == 303: + if response.status in [300, 301, 302, 303, 307]: + # Pick out the location header and basically start from the beginning + # remembering first to strip the ETag header and decrement our 'depth' + if redirections: + if not response.has_key('location') and response.status != 300: + raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header.")) + if response.status == 301 and method in ["GET", "HEAD"]: + response['-x-permanent-redirect-url'] = response['location'] + _updateCache(headers, response, content, self.cache, cachekey) + if headers.has_key('if-none-match'): + del headers['if-none-match'] + if headers.has_key('if-modified-since'): + del headers['if-modified-since'] + if response.has_key('location'): + old_response = copy.deepcopy(response) + location = response['location'] + 
(scheme, authority, path, query, fragment) = parse_uri(location) + if authority == None: + location = urlparse.urljoin(absolute_uri, location) + redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method + (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1) + response.previous = old_response + else: + raise RedirectLimit( _("Redirected more times than rediection_limit allows.")) + elif response.status in [200, 203] and method == "GET": + # Don't cache 206's since we aren't going to handle byte range requests + _updateCache(headers, response, content, self.cache, cachekey) + + return (response, content) + + def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS): + """ Performs a single HTTP request. +The 'uri' is the URI of the HTTP resource and can begin +with either 'http' or 'https'. The value of 'uri' must be an absolute URI. + +The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc. +There is no restriction on the methods allowed. + +The 'body' is the entity body to be sent with the request. It is a string +object. + +Any extra headers that are to be sent with the request should be provided in the +'headers' dictionary. + +The maximum number of redirect to follow before raising an +exception is 'redirections. The default is 5. + +The return value is a tuple of (response, content), the first +being and instance of the 'Response' class, the second being +a string that contains the response entity body. + """ + if headers is None: + headers = {} + else: + headers = _normalize_headers(headers) + + if not headers.has_key('user-agent'): + headers['user-agent'] = "Python-httplib2/%s" % __version__ + + (scheme, authority, path, query, fragment) = parse_uri(uri) + authority = authority.lower() + if not path: + path = "/" + # Could do syntax based normalization of the URI before + # computing the digest. See Section 6.2.2 of Std 66. + request_uri = query and "?".join([path, query]) or path + defrag_uri = scheme + "://" + authority + request_uri + + if not self.connections.has_key(scheme+":"+authority): + connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection + conn = self.connections[scheme+":"+authority] = connection_type(authority) + conn.set_debuglevel(debuglevel) + else: + conn = self.connections[scheme+":"+authority] + + if method in ["GET", "HEAD"] and 'range' not in headers: + headers['accept-encoding'] = 'compress, gzip' + + info = rfc822.Message(StringIO.StringIO("")) + cached_value = None + if self.cache: + cachekey = md5.new(defrag_uri).hexdigest() + cached_value = self.cache.get(cachekey) + if cached_value: + #try: + f = StringIO.StringIO(cached_value) + info = rfc822.Message(f) + content = cached_value.split('\r\n\r\n', 1)[1] + #except: + # self.cache.delete(cachekey) + # cachekey = None + else: + cachekey = None + + if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag: + # http://www.w3.org/1999/04/Editing/ + headers['if-match'] = info['etag'] + + if method not in ["GET", "HEAD"] and self.cache and cachekey: + # RFC 2616 Section 13.10 + self.cache.delete(cachekey) + + if method in ["GET", "HEAD"] and self.cache and 'range' not in headers: + if info.has_key('-x-permanent-redirect-url'): + # Should cached permanent redirects be counted in our redirection count? For now, yes. 
+ (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1) + response.previous = Response(info) + response.previous.fromcache = True + else: + # Determine our course of action: + # Is the cached entry fresh or stale? + # Has the client requested a non-cached response? + # + # There seems to be three possible answers: + # 1. [FRESH] Return the cache entry w/o doing a GET + # 2. [STALE] Do the GET (but add in cache validators if available) + # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request + entry_disposition = _entry_disposition(info, headers) + + if entry_disposition == "FRESH": + if not cached_value: + info['status'] = '504' + content = "" + response = Response(info) + if cached_value: + response.fromcache = True + return (response, content) + + if entry_disposition == "STALE": + if info.has_key('etag') and not self.ignore_etag: + headers['if-none-match'] = info['etag'] + if info.has_key('last-modified'): + headers['if-modified-since'] = info['last-modified'] + elif entry_disposition == "TRANSPARENT": + pass + + (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) + + if response.status == 304 and method == "GET": + # Rewrite the cache entry with the new end-to-end headers + # Take all headers that are in response + # and overwrite their values in info. + # unless they are hop-by-hop, or are listed in the connection header. + + for key in _get_end2end_headers(response): + info[key] = response[key] + merged_response = Response(info) + if hasattr(response, "_stale_digest"): + merged_response._stale_digest = response._stale_digest + _updateCache(headers, merged_response, content, self.cache, cachekey) + response = merged_response + response.status = 200 + response.fromcache = True + + elif response.status == 200: + content = new_content + else: + self.cache.delete(cachekey) + content = new_content + else: + (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey) + return (response, content) + + + +class Response(dict): + """An object more like rfc822.Message than httplib.HTTPResponse.""" + + """Is this response from our local cache""" + fromcache = False + + """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """ + version = 11 + + "Status code returned by server. " + status = 200 + + """Reason phrase returned by server.""" + reason = "Ok" + + previous = None + + def __init__(self, info): + # info is either an rfc822.Message or + # an httplib.HTTPResponse object. 
+ if isinstance(info, httplib.HTTPResponse): + for key, value in info.getheaders(): + self[key] = value + self.status = info.status + self['status'] = str(self.status) + self.reason = info.reason + self.version = info.version + elif isinstance(info, rfc822.Message): + for key, value in info.items(): + self[key] = value + self.status = int(self['status']) + + From 4569dba5e2184d17200b73b4837c395a4ff7708b Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Sat, 4 Nov 2006 11:31:52 -0500 Subject: [PATCH 07/39] Moved httplib2 directory --- {httplib2 => planet/httplib2}/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {httplib2 => planet/httplib2}/__init__.py (100%) diff --git a/httplib2/__init__.py b/planet/httplib2/__init__.py similarity index 100% rename from httplib2/__init__.py rename to planet/httplib2/__init__.py From 681eb117f8a74e8d321647e57ffcfe43b6f8a7dc Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Sat, 4 Nov 2006 16:58:03 -0500 Subject: [PATCH 08/39] Fixed one bug with passing non-2xx responses to feedparser. Also added a try/except to help debug the problem with 'content' undefined in httplib2. --- planet/httplib2/__init__.py | 21 +++++++++++++-------- planet/spider.py | 15 +++++++++++---- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index 83421b4..73e9bf7 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -35,6 +35,7 @@ import random import sha import hmac from gettext import gettext as _ +from socket import gaierror __all__ = ['Http', 'Response', 'HttpLib2Error', 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent', @@ -704,13 +705,13 @@ a string that contains the response entity body. cachekey = md5.new(defrag_uri).hexdigest() cached_value = self.cache.get(cachekey) if cached_value: - #try: - f = StringIO.StringIO(cached_value) - info = rfc822.Message(f) - content = cached_value.split('\r\n\r\n', 1)[1] - #except: - # self.cache.delete(cachekey) - # cachekey = None + try: + f = StringIO.StringIO(cached_value) + info = rfc822.Message(f) + content = cached_value.split('\r\n\r\n', 1)[1] + except: + self.cache.delete(cachekey) + cachekey = None else: cachekey = None @@ -769,7 +770,11 @@ a string that contains the response entity body. merged_response = Response(info) if hasattr(response, "_stale_digest"): merged_response._stale_digest = response._stale_digest - _updateCache(headers, merged_response, content, self.cache, cachekey) + try: + _updateCache(headers, merged_response, content, self.cache, cachekey) + except: + print locals() + raise response = merged_response response.status = 200 response.fromcache = True diff --git a/planet/spider.py b/planet/spider.py index 3bf0d7b..41b2d57 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -334,6 +334,7 @@ def spiderPlanet(only_if_new = False): from Queue import Queue, Empty from threading import Thread import httplib2 + from socket import gaierror work_queue = Queue() awaiting_parsing = Queue() @@ -350,8 +351,11 @@ def spiderPlanet(only_if_new = False): # is empty which will terminate the thread. 
uri = work_queue.get(block=False) log.info("Fetching %s via %d", uri, thread_index) - (resp, content) = h.request(uri) - awaiting_parsing.put(block=True, item=(resp, content, uri)) + try: + (resp, content) = h.request(uri) + awaiting_parsing.put(block=True, item=(resp, content, uri)) + except gaierror: + log.error("Fail to resolve server name %s via %d", uri, thread_index) except Empty, e: log.info("Thread %d finished", thread_index) pass @@ -373,8 +377,11 @@ def spiderPlanet(only_if_new = False): try: (resp_headers, content, uri) = item if not resp_headers.fromcache: - log.info("Parsing pre-fetched %s", uri) - spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) + if resp_headers.status < 300: + log.info("Parsing pre-fetched %s", uri) + spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) + else: + log.error("Status code %d from %s", resp_headers.status, uri) except Exception, e: import sys, traceback type, value, tb = sys.exc_info() From b58d815a0d757d4c5bc22e77bcc4f12092d4f44b Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Sat, 4 Nov 2006 17:19:59 -0500 Subject: [PATCH 09/39] Fixed very weird bug where we would break on relative 301's, but *only* on the second attempt, i.e. only when reading the cache 301 redirect --- planet/httplib2/__init__.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index 73e9bf7..f8bb205 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -627,6 +627,12 @@ class Http: if redirections: if not response.has_key('location') and response.status != 300: raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header.")) + # Fix-up relative redirects (which violate an RFC 2616 MUST) + if response.has_key('location'): + location = response['location'] + (scheme, authority, path, query, fragment) = parse_uri(location) + if authority == None: + response['location'] = urlparse.urljoin(absolute_uri, location) if response.status == 301 and method in ["GET", "HEAD"]: response['-x-permanent-redirect-url'] = response['location'] _updateCache(headers, response, content, self.cache, cachekey) @@ -635,11 +641,8 @@ class Http: if headers.has_key('if-modified-since'): del headers['if-modified-since'] if response.has_key('location'): - old_response = copy.deepcopy(response) location = response['location'] - (scheme, authority, path, query, fragment) = parse_uri(location) - if authority == None: - location = urlparse.urljoin(absolute_uri, location) + old_response = copy.deepcopy(response) redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1) response.previous = old_response From 4b9e85e4f7bddc6084c97ccddfba3dc56591131d Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Sun, 5 Nov 2006 22:00:05 -0500 Subject: [PATCH 10/39] reverted feedparser to HEAD, i.e. it doesn't need changes to be used with an external http client. Made the changes as suggested by Sam on how to get httplib2 and feedparser working together. Added a 'dict' attribute to httplib2.Response to get it to work as feedparser expects. 
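
The integration this commit message describes amounts to handing feedparser a file-like object that already carries the pre-fetched bytes plus the httplib2 response headers, rather than changing feedparser's signature. Roughly, assuming httplib2 and feedparser are importable as in this tree (the feed URI and cache path below are placeholders):

    from StringIO import StringIO
    import httplib2
    import feedparser

    feed_uri = "http://example.org/news.atom"   # placeholder URI
    h = httplib2.Http(".cache")                 # placeholder cache directory
    (resp, content) = h.request(feed_uri)

    if resp.has_key('content-encoding'):
        # httplib2 has already decompressed the body; a later patch in this
        # series drops the header so feedparser does not decompress it again.
        del resp['content-encoding']

    f = StringIO(content)
    setattr(f, 'url', feed_uri)
    setattr(f, 'headers', resp)   # feedparser reads f.headers.dict; the diff
                                  # below adds a 'dict' alias to Response
    data = feedparser.parse(f)
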
--- planet/feedparser.py | 9 ++------- planet/httplib2/__init__.py | 10 +++++++++- planet/spider.py | 21 ++++++++++++++++++--- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/planet/feedparser.py b/planet/feedparser.py index d24d82a..1860539 100755 --- a/planet/feedparser.py +++ b/planet/feedparser.py @@ -11,7 +11,7 @@ Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 1.145 $"[11:16] + "-cvs" +__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -3254,7 +3254,7 @@ def _stripDoctype(data): return version, data, dict(replacement and safe_pattern.findall(replacement)) -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None): +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): '''Parse a feed from a URL, file, stream, or string''' result = FeedParserDict() result['feed'] = FeedParserDict() @@ -3263,9 +3263,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['bozo'] = 0 if type(handlers) == types.InstanceType: handlers = [handlers] - if resp_headers: - f = None - data = url_file_stream_or_string try: f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) data = f.read() @@ -3310,8 +3307,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer result['status'] = f.status if hasattr(f, 'headers'): result['headers'] = f.headers.dict - if resp_headers: - result['headers'] = resp_headers if hasattr(f, 'close'): f.close() diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index f8bb205..b96130b 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -715,6 +715,7 @@ a string that contains the response entity body. except: self.cache.delete(cachekey) cachekey = None + cached_value = None else: cachekey = None @@ -726,7 +727,7 @@ a string that contains the response entity body. # RFC 2616 Section 13.10 self.cache.delete(cachekey) - if method in ["GET", "HEAD"] and self.cache and 'range' not in headers: + if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers: if info.has_key('-x-permanent-redirect-url'): # Should cached permanent redirects be counted in our redirection count? For now, yes. 
(response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1) @@ -825,4 +826,11 @@ class Response(dict): self[key] = value self.status = int(self['status']) + def __getattr__(self, name): + if name == 'dict': + return self + else: + raise AttributeError, name + + diff --git a/planet/spider.py b/planet/spider.py index 41b2d57..a12cf95 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -8,6 +8,7 @@ import time, calendar, re, os, urlparse from xml.dom import minidom # Planet modules import planet, config, feedparser, reconstitute, shell +from StringIO import StringIO # Regular expressions to sanitise cache filenames re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?') @@ -140,7 +141,11 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): # read feed itself if content: - data = feedparser.parse(content, resp_headers=resp_headers) + f = StringIO(content) + setattr(f, 'url', feed) + if resp_headers: + setattr(f, 'headers', resp_headers) + data = feedparser.parse(f) else: modified = None try: @@ -334,7 +339,7 @@ def spiderPlanet(only_if_new = False): from Queue import Queue, Empty from threading import Thread import httplib2 - from socket import gaierror + from socket import gaierror, error work_queue = Queue() awaiting_parsing = Queue() @@ -356,6 +361,16 @@ def spiderPlanet(only_if_new = False): awaiting_parsing.put(block=True, item=(resp, content, uri)) except gaierror: log.error("Fail to resolve server name %s via %d", uri, thread_index) + except error, e: + log.error("HTTP Error: %s in thread-%d", str(e), thread_index) + except Exception, e: + import sys, traceback + type, value, tb = sys.exc_info() + log.error('Error processing %s', uri) + for line in (traceback.format_exception_only(type, value) + + traceback.format_tb(tb)): + log.error(line.rstrip()) + except Empty, e: log.info("Thread %d finished", thread_index) pass @@ -385,7 +400,7 @@ def spiderPlanet(only_if_new = False): except Exception, e: import sys, traceback type, value, tb = sys.exc_info() - log.error('Error processing %s', feed) + log.error('Error processing %s', uri) for line in (traceback.format_exception_only(type, value) + traceback.format_tb(tb)): log.error(line.rstrip()) From 56a447e1beda4ec67595b5407af3c8df4e87498c Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Sun, 5 Nov 2006 22:48:30 -0500 Subject: [PATCH 11/39] Updated to latest httplib2. 
Now deleting 'content-encoding' header from the httplib2 response before passing to feedparser --- planet/httplib2/__init__.py | 5 +++-- planet/spider.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index b96130b..2941c73 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -16,7 +16,7 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", "Jonathan Feinberg", "Blair Zajac"] __license__ = "MIT" -__version__ = "$Rev: 204 $" +__version__ = "$Rev: 208 $" import re import md5 @@ -232,8 +232,10 @@ def _decompressContent(response, new_content): try: if response.get('content-encoding', None) == 'gzip': content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read() + response['content-length'] = str(len(content)) if response.get('content-encoding', None) == 'deflate': content = zlib.decompress(content) + response['content-length'] = str(len(content)) except: content = "" raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding')) @@ -833,4 +835,3 @@ class Response(dict): raise AttributeError, name - diff --git a/planet/spider.py b/planet/spider.py index a12cf95..a438eeb 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -141,9 +141,13 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): # read feed itself if content: + # httplib2 was used to get the content, so prepare a + # proper object to pass to feedparser. f = StringIO(content) setattr(f, 'url', feed) if resp_headers: + if resp_headers.has_key('content-encoding'): + del resp_headers['content-encoding'] setattr(f, 'headers', resp_headers) data = feedparser.parse(f) else: From daec4769c77046b939f73a459bde8576eff34811 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Tue, 7 Nov 2006 13:19:42 -0500 Subject: [PATCH 12/39] Added in support for '-location' in httlib2 responses --- planet/httplib2/__init__.py | 5 ++++- planet/spider.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index 2941c73..a92540e 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -16,7 +16,7 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", "Jonathan Feinberg", "Blair Zajac"] __license__ = "MIT" -__version__ = "$Rev: 208 $" +__version__ = "$Rev: 209 $" import re import md5 @@ -637,6 +637,7 @@ class Http: response['location'] = urlparse.urljoin(absolute_uri, location) if response.status == 301 and method in ["GET", "HEAD"]: response['-x-permanent-redirect-url'] = response['location'] + response['-location'] = absolute_uri _updateCache(headers, response, content, self.cache, cachekey) if headers.has_key('if-none-match'): del headers['if-none-match'] @@ -645,6 +646,7 @@ class Http: if response.has_key('location'): location = response['location'] old_response = copy.deepcopy(response) + old_response['-location'] = absolute_uri redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1) response.previous = old_response @@ -652,6 +654,7 @@ class Http: raise RedirectLimit( _("Redirected more times than rediection_limit allows.")) elif response.status in [200, 203] and method == "GET": # Don't cache 206's since we aren't going to handle byte range requests + response['-location'] = 
absolute_uri _updateCache(headers, response, content, self.cache, cachekey) return (response, content) diff --git a/planet/spider.py b/planet/spider.py index a438eeb..f7a0d28 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -144,7 +144,7 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): # httplib2 was used to get the content, so prepare a # proper object to pass to feedparser. f = StringIO(content) - setattr(f, 'url', feed) + setattr(f, 'url', resp_headers.get('-location', feed)) if resp_headers: if resp_headers.has_key('content-encoding'): del resp_headers['content-encoding'] From 45f0f92110e7fb40a863923c36a63743148a2a35 Mon Sep 17 00:00:00 2001 From: Joe Gregorio Date: Tue, 7 Nov 2006 22:39:35 -0500 Subject: [PATCH 13/39] Switched to standard socket timeouts. http://mail.python.org/pipermail/python-list/2005-May/281697.html --- planet/__init__.py | 21 -- planet/spider.py | 4 +- planet/timeoutsocket.py | 424 ---------------------------------------- 3 files changed, 2 insertions(+), 447 deletions(-) delete mode 100644 planet/timeoutsocket.py diff --git a/planet/__init__.py b/planet/__init__.py index 6be34ed..166acff 100644 --- a/planet/__init__.py +++ b/planet/__init__.py @@ -30,25 +30,4 @@ def getLogger(level, format): return logger -def setTimeout(timeout): - """ time out rather than hang forever on ultra-slow servers.""" - if timeout: - try: - timeout = float(timeout) - except: - logger.warning("Timeout set to invalid value '%s', skipping", timeout) - timeout = None - if timeout: - try: - from planet import timeoutsocket - timeoutsocket.setDefaultSocketTimeout(timeout) - logger.info("Socket timeout set to %d seconds", timeout) - except ImportError: - import socket - if hasattr(socket, 'setdefaulttimeout'): - logger.debug("timeoutsocket not found, using python function") - socket.setdefaulttimeout(timeout) - logger.info("Socket timeout set to %d seconds", timeout) - else: - logger.error("Unable to set timeout to %d seconds", timeout) diff --git a/planet/spider.py b/planet/spider.py index f7a0d28..f4badf2 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory. import time, calendar, re, os, urlparse from xml.dom import minidom # Planet modules -import planet, config, feedparser, reconstitute, shell +import planet, config, feedparser, reconstitute, shell, socket from StringIO import StringIO # Regular expressions to sanitise cache filenames @@ -338,6 +338,7 @@ def spiderPlanet(only_if_new = False): global index index = True + socket.setdefaulttimeout(float(config.feed_timeout())) if int(config.spider_threads()): from Queue import Queue, Empty @@ -414,7 +415,6 @@ def spiderPlanet(only_if_new = False): log.info("Finished threaded part of processing.") - planet.setTimeout(config.feed_timeout()) # Process non-HTTP uris if we are threading, otherwise process *all* uris here. 
unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)] for feed in unthreaded_work_queue: diff --git a/planet/timeoutsocket.py b/planet/timeoutsocket.py deleted file mode 100644 index b698df0..0000000 --- a/planet/timeoutsocket.py +++ /dev/null @@ -1,424 +0,0 @@ - -#### -# Copyright 2000,2001 by Timothy O'Malley -# -# All Rights Reserved -# -# Permission to use, copy, modify, and distribute this software -# and its documentation for any purpose and without fee is hereby -# granted, provided that the above copyright notice appear in all -# copies and that both that copyright notice and this permission -# notice appear in supporting documentation, and that the name of -# Timothy O'Malley not be used in advertising or publicity -# pertaining to distribution of the software without specific, written -# prior permission. -# -# Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS -# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY -# AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR -# ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. -# -#### - -"""Timeout Socket - -This module enables a timeout mechanism on all TCP connections. It -does this by inserting a shim into the socket module. After this module -has been imported, all socket creation goes through this shim. As a -result, every TCP connection will support a timeout. - -The beauty of this method is that it immediately and transparently -enables the entire python library to support timeouts on TCP sockets. -As an example, if you wanted to SMTP connections to have a 20 second -timeout: - - import timeoutsocket - import smtplib - timeoutsocket.setDefaultSocketTimeout(20) - - -The timeout applies to the socket functions that normally block on -execution: read, write, connect, and accept. If any of these -operations exceeds the specified timeout, the exception Timeout -will be raised. - -The default timeout value is set to None. As a result, importing -this module does not change the default behavior of a socket. The -timeout mechanism only activates when the timeout has been set to -a numeric value. (This behavior mimics the behavior of the -select.select() function.) - -This module implements two classes: TimeoutSocket and TimeoutFile. - -The TimeoutSocket class defines a socket-like object that attempts to -avoid the condition where a socket may block indefinitely. The -TimeoutSocket class raises a Timeout exception whenever the -current operation delays too long. - -The TimeoutFile class defines a file-like object that uses the TimeoutSocket -class. When the makefile() method of TimeoutSocket is called, it returns -an instance of a TimeoutFile. - -Each of these objects adds two methods to manage the timeout value: - - get_timeout() --> returns the timeout of the socket or file - set_timeout() --> sets the timeout of the socket or file - - -As an example, one might use the timeout feature to create httplib -connections that will timeout after 30 seconds: - - import timeoutsocket - import httplib - H = httplib.HTTP("www.python.org") - H.sock.set_timeout(30) - -Note: When used in this manner, the connect() routine may still -block because it happens before the timeout is set. 
To avoid -this, use the 'timeoutsocket.setDefaultSocketTimeout()' function. - -Good Luck! - -""" - -__version__ = "$Revision: 1.1.1.1 $" -__author__ = "Timothy O'Malley " - -# -# Imports -# -import select, string -import socket -if not hasattr(socket, "_no_timeoutsocket"): - _socket = socket.socket -else: - _socket = socket._no_timeoutsocket - - -# -# Set up constants to test for Connected and Blocking operations. -# We delete 'os' and 'errno' to keep our namespace clean(er). -# Thanks to Alex Martelli and G. Li for the Windows error codes. -# -import os -if os.name == "nt": - _IsConnected = ( 10022, 10056 ) - _ConnectBusy = ( 10035, ) - _AcceptBusy = ( 10035, ) -else: - import errno - _IsConnected = ( errno.EISCONN, ) - _ConnectBusy = ( errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK ) - _AcceptBusy = ( errno.EAGAIN, errno.EWOULDBLOCK ) - del errno -del os - - -# -# Default timeout value for ALL TimeoutSockets -# -_DefaultTimeout = None -def setDefaultSocketTimeout(timeout): - global _DefaultTimeout - _DefaultTimeout = timeout -def getDefaultSocketTimeout(): - return _DefaultTimeout - -# -# Exceptions for socket errors and timeouts -# -Error = socket.error -class Timeout(Exception): - pass - - -# -# Factory function -# -from socket import AF_INET, SOCK_STREAM -def timeoutsocket(family=AF_INET, type=SOCK_STREAM, proto=None): - if family != AF_INET or type != SOCK_STREAM: - if proto: - return _socket(family, type, proto) - else: - return _socket(family, type) - return TimeoutSocket( _socket(family, type), _DefaultTimeout ) -# end timeoutsocket - -# -# The TimeoutSocket class definition -# -class TimeoutSocket: - """TimeoutSocket object - Implements a socket-like object that raises Timeout whenever - an operation takes too long. - The definition of 'too long' can be changed using the - set_timeout() method. - """ - - _copies = 0 - _blocking = 1 - - def __init__(self, sock, timeout): - self._sock = sock - self._timeout = timeout - # end __init__ - - def __getattr__(self, key): - return getattr(self._sock, key) - # end __getattr__ - - def get_timeout(self): - return self._timeout - # end set_timeout - - def set_timeout(self, timeout=None): - self._timeout = timeout - # end set_timeout - - def setblocking(self, blocking): - self._blocking = blocking - return self._sock.setblocking(blocking) - # end set_timeout - - def connect_ex(self, addr): - errcode = 0 - try: - self.connect(addr) - except Error, why: - errcode = why[0] - return errcode - # end connect_ex - - def connect(self, addr, port=None, dumbhack=None): - # In case we were called as connect(host, port) - if port != None: addr = (addr, port) - - # Shortcuts - sock = self._sock - timeout = self._timeout - blocking = self._blocking - - # First, make a non-blocking call to connect - try: - sock.setblocking(0) - sock.connect(addr) - sock.setblocking(blocking) - return - except Error, why: - # Set the socket's blocking mode back - sock.setblocking(blocking) - - # If we are not blocking, re-raise - if not blocking: - raise - - # If we are already connected, then return success. - # If we got a genuine error, re-raise it. - errcode = why[0] - if dumbhack and errcode in _IsConnected: - return - elif errcode not in _ConnectBusy: - raise - - # Now, wait for the connect to happen - # ONLY if dumbhack indicates this is pass number one. - # If select raises an error, we pass it on. - # Is this the right behavior? 
- if not dumbhack: - r,w,e = select.select([], [sock], [], timeout) - if w: - return self.connect(addr, dumbhack=1) - - # If we get here, then we should raise Timeout - raise Timeout("Attempted connect to %s timed out." % str(addr) ) - # end connect - - def accept(self, dumbhack=None): - # Shortcuts - sock = self._sock - timeout = self._timeout - blocking = self._blocking - - # First, make a non-blocking call to accept - # If we get a valid result, then convert the - # accept'ed socket into a TimeoutSocket. - # Be carefult about the blocking mode of ourselves. - try: - sock.setblocking(0) - newsock, addr = sock.accept() - sock.setblocking(blocking) - timeoutnewsock = self.__class__(newsock, timeout) - timeoutnewsock.setblocking(blocking) - return (timeoutnewsock, addr) - except Error, why: - # Set the socket's blocking mode back - sock.setblocking(blocking) - - # If we are not supposed to block, then re-raise - if not blocking: - raise - - # If we got a genuine error, re-raise it. - errcode = why[0] - if errcode not in _AcceptBusy: - raise - - # Now, wait for the accept to happen - # ONLY if dumbhack indicates this is pass number one. - # If select raises an error, we pass it on. - # Is this the right behavior? - if not dumbhack: - r,w,e = select.select([sock], [], [], timeout) - if r: - return self.accept(dumbhack=1) - - # If we get here, then we should raise Timeout - raise Timeout("Attempted accept timed out.") - # end accept - - def send(self, data, flags=0): - sock = self._sock - if self._blocking: - r,w,e = select.select([],[sock],[], self._timeout) - if not w: - raise Timeout("Send timed out") - return sock.send(data, flags) - # end send - - def recv(self, bufsize, flags=0): - sock = self._sock - if self._blocking: - r,w,e = select.select([sock], [], [], self._timeout) - if not r: - raise Timeout("Recv timed out") - return sock.recv(bufsize, flags) - # end recv - - def makefile(self, flags="r", bufsize=-1): - self._copies = self._copies +1 - return TimeoutFile(self, flags, bufsize) - # end makefile - - def close(self): - if self._copies <= 0: - self._sock.close() - else: - self._copies = self._copies -1 - # end close - -# end TimeoutSocket - - -class TimeoutFile: - """TimeoutFile object - Implements a file-like object on top of TimeoutSocket. 
- """ - - def __init__(self, sock, mode="r", bufsize=4096): - self._sock = sock - self._bufsize = 4096 - if bufsize > 0: self._bufsize = bufsize - if not hasattr(sock, "_inqueue"): self._sock._inqueue = "" - - # end __init__ - - def __getattr__(self, key): - return getattr(self._sock, key) - # end __getattr__ - - def close(self): - self._sock.close() - self._sock = None - # end close - - def write(self, data): - self.send(data) - # end write - - def read(self, size=-1): - _sock = self._sock - _bufsize = self._bufsize - while 1: - datalen = len(_sock._inqueue) - if datalen >= size >= 0: - break - bufsize = _bufsize - if size > 0: - bufsize = min(bufsize, size - datalen ) - buf = self.recv(bufsize) - if not buf: - break - _sock._inqueue = _sock._inqueue + buf - data = _sock._inqueue - _sock._inqueue = "" - if size > 0 and datalen > size: - _sock._inqueue = data[size:] - data = data[:size] - return data - # end read - - def readline(self, size=-1): - _sock = self._sock - _bufsize = self._bufsize - while 1: - idx = string.find(_sock._inqueue, "\n") - if idx >= 0: - break - datalen = len(_sock._inqueue) - if datalen >= size >= 0: - break - bufsize = _bufsize - if size > 0: - bufsize = min(bufsize, size - datalen ) - buf = self.recv(bufsize) - if not buf: - break - _sock._inqueue = _sock._inqueue + buf - - data = _sock._inqueue - _sock._inqueue = "" - if idx >= 0: - idx = idx + 1 - _sock._inqueue = data[idx:] - data = data[:idx] - elif size > 0 and datalen > size: - _sock._inqueue = data[size:] - data = data[:size] - return data - # end readline - - def readlines(self, sizehint=-1): - result = [] - data = self.read() - while data: - idx = string.find(data, "\n") - if idx >= 0: - idx = idx + 1 - result.append( data[:idx] ) - data = data[idx:] - else: - result.append( data ) - data = "" - return result - # end readlines - - def flush(self): pass - -# end TimeoutFile - - -# -# Silently replace the socket() builtin function with -# our timeoutsocket() definition. -# -if not hasattr(socket, "_no_timeoutsocket"): - socket._no_timeoutsocket = socket.socket - socket.socket = timeoutsocket -del socket -socket = timeoutsocket -# Finis From 0df474c8ff6feda95506b26e5ea6f352a1b518ba Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 14 Nov 2006 10:28:40 -0500 Subject: [PATCH 14/39] Support backlevel versions of Python --- docs/installation.html | 18 +++++++++++++----- planet/__init__.py | 3 ++- planet/spider.py | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/docs/installation.html b/docs/installation.html index 9994984..6a668d1 100644 --- a/docs/installation.html +++ b/docs/installation.html @@ -69,7 +69,7 @@ right directory.

Build your own themes, templates, or filters! And share!

-

Mac OS X and Fink Instructions

+

Mac OS X and Fink Instructions

The Fink Project packages
@@ -101,12 +101,20 @@ not yet ported to the newer python so Venus will be less featureful.
may want to explicitly specify python2.4.

-

Ubuntu Linux (Edgy Eft) instructions

+

Ubuntu Linux (Edgy Eft) instructions

Before starting, issue the following command:

-
    -
    sudo apt-get install bzr python2.4-librdf
    -
+ +
sudo apt-get install bzr python2.4-librdf
+ +
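To confirm that the Redland RDF bindings installed by python2.4-librdf are importable, a minimal check along these lines should work; the sketch is illustrative only (it is not part of this patch series) and assumes the bindings expose the conventional RDF module name:

try:
    # 'RDF' is the usual module name provided by the Redland Python bindings
    # (an assumption; adjust if your packaging differs).
    import RDF
    print 'librdf Python bindings found'
except ImportError:
    print 'librdf Python bindings not found; re-run the apt-get command above'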

Python 2.2 instructions

+ +

If you are running Python 2.2, you may also need to install pyxml. If the
+following runs without error, you do not have the problem.

+
python -c "__import__('xml.dom.minidom').dom.minidom.parseString('<entry xml:lang=\"en\"/>')"
+

Installation of pyxml varies by platform. For Ubuntu Linux (Dapper Drake), issue the following command:

+ +
sudo apt-get install python2.2-xml
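For convenience, the same pyxml sanity check can be kept as a small standalone script. The sketch below is illustrative only (it is not part of the patch series) and follows the Python 2 idioms used elsewhere in this code base:

import sys
from xml.dom import minidom

try:
    # Parsing an element that carries an xml:lang attribute is exactly what
    # the one-liner above probes; broken installations raise an exception here.
    minidom.parseString('<entry xml:lang="en"/>')
    print 'xml.dom.minidom is OK; pyxml is not required'
except Exception, e:
    print 'minidom check failed (%s); install pyxml as described above' % e
    sys.exit(1)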
diff --git a/planet/__init__.py b/planet/__init__.py index 6be34ed..0902bd8 100644 --- a/planet/__init__.py +++ b/planet/__init__.py @@ -16,10 +16,11 @@ def getLogger(level, format): try: import logging + logging.basicConfig(format=format) except: import compat_logging as logging + logging.basicConfig(format=format) - logging.basicConfig(format=format) logging.getLogger().setLevel(logging.getLevelName(level)) logger = logging.getLogger("planet.runner") try: diff --git a/planet/spider.py b/planet/spider.py index aff3884..72f339c 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -260,7 +260,7 @@ def spiderFeed(feed, only_if_new=0): # apply any filters xdoc = reconstitute.reconstitute(data, entry) - output = xdoc.toxml('utf-8') + output = xdoc.toxml().encode('utf-8') xdoc.unlink() for filter in config.filters(feed): output = shell.run(filter, output, mode="filter") @@ -320,7 +320,7 @@ def spiderFeed(feed, only_if_new=0): xdoc=minidom.parseString('''\n''' % planet.xmlns) reconstitute.source(xdoc.documentElement,data.feed,data.bozo,data.version) - write(xdoc.toxml('utf-8'), filename(sources, feed)) + write(xdoc.toxml().encode('utf-8'), filename(sources, feed)) xdoc.unlink() def spiderPlanet(only_if_new = False): From ba25b691ff85ebb602c441e6fc39173971c5a81b Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Tue, 14 Nov 2006 11:05:09 -0500 Subject: [PATCH 15/39] Fix windows regression --- planet/splice.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/planet/splice.py b/planet/splice.py index e50f927..4853619 100644 --- a/planet/splice.py +++ b/planet/splice.py @@ -68,8 +68,8 @@ def splice(): # insert entry information items = 0 for mtime,file in dir: - if index: - base = file.split('/')[-1] + if index != None: + base = os.path.basename(file) if index.has_key(base) and index[base] not in sub_ids: continue try: From 167f0de4da64b1c095759c0b08febcb62ec5474e Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 15 Nov 2006 07:46:35 -0500 Subject: [PATCH 16/39] More bullet-proofing --- planet/spider.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/planet/spider.py b/planet/spider.py index a41edc8..32e03ce 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -257,9 +257,8 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): # get updated-date either from the entry or the cache (default to now) mtime = None - if not entry.has_key('updated_parsed'): - if entry.has_key('published_parsed'): - entry['updated_parsed'] = entry['published_parsed'] + if not entry.has_key('updated_parsed') or not entry['updated_parsed']: + entry['updated_parsed'] = entry.get('published_parsed',None) if not entry.has_key('updated_parsed'): try: mtime = calendar.timegm(entry.updated_parsed) @@ -270,7 +269,10 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): mtime = os.stat(cache_file).st_mtime except: if data.feed.has_key('updated_parsed'): - mtime = calendar.timegm(data.feed.updated_parsed) + try: + mtime = calendar.timegm(data.feed.updated_parsed) + except: + pass if not mtime or mtime > time.time(): mtime = time.time() entry['updated_parsed'] = time.gmtime(mtime) From ccb5aa4e39fab674cf5456f15f46b15dcbc3c010 Mon Sep 17 00:00:00 2001 From: Harry Fuecks Date: Thu, 16 Nov 2006 16:06:35 +0000 Subject: [PATCH 17/39] Add tests to check entry updated value is preserved --- tests/test_spider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_spider.py b/tests/test_spider.py index b01eebb..e6ce66f 100644 --- 
a/tests/test_spider.py +++ b/tests/test_spider.py @@ -59,6 +59,7 @@ class SpiderTest(unittest.TestCase): self.assertEqual(['application/atom+xml'], [link.type for link in data.entries[0].source.links if link.rel=='self']) self.assertEqual('one', data.entries[0].source.planet_name) + self.assertEqual('2006-01-01T00:00:00Z', data.entries[0].updated) self.assertEqual(os.stat(files[2]).st_mtime, calendar.timegm(data.entries[0].updated_parsed)) From bf0c7b736d25d32c724272f4ade2dc3300e75a1f Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 16 Nov 2006 15:51:27 -0500 Subject: [PATCH 18/39] Fix regression where entry updated was always ignored --- planet/spider.py | 2 +- tests/test_spider.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/planet/spider.py b/planet/spider.py index 32e03ce..4922d76 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -259,7 +259,7 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None): mtime = None if not entry.has_key('updated_parsed') or not entry['updated_parsed']: entry['updated_parsed'] = entry.get('published_parsed',None) - if not entry.has_key('updated_parsed'): + if entry.has_key('updated_parsed'): try: mtime = calendar.timegm(entry.updated_parsed) except: diff --git a/tests/test_spider.py b/tests/test_spider.py index e6ce66f..418364c 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -59,7 +59,7 @@ class SpiderTest(unittest.TestCase): self.assertEqual(['application/atom+xml'], [link.type for link in data.entries[0].source.links if link.rel=='self']) self.assertEqual('one', data.entries[0].source.planet_name) - self.assertEqual('2006-01-01T00:00:00Z', data.entries[0].updated) + self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated) self.assertEqual(os.stat(files[2]).st_mtime, calendar.timegm(data.entries[0].updated_parsed)) From 1ce96ca53b2859fca09f9e214e1f44df90bf3879 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 16 Nov 2006 20:18:34 -0500 Subject: [PATCH 19/39] Assign a css-id to each source --- planet/reconstitute.py | 11 +++++++++++ tests/test_spider.py | 1 + themes/asf/index.html.xslt | 3 ++- themes/asf/personalize.js | 3 ++- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/planet/reconstitute.py b/planet/reconstitute.py index 989a107..6d7f43d 100644 --- a/planet/reconstitute.py +++ b/planet/reconstitute.py @@ -50,6 +50,15 @@ def ncr2c(value): value=unichr(int(value)) return value +nonalpha=re.compile('\W+',re.UNICODE) +def cssid(name): + """ generate a css id from a name """ + try: + name = nonalpha.sub('-',name.decode('utf-8')).lower().encode('utf-8') + except: + name = nonalpha.sub('-',name).lower() + return name.strip('-') + def normalize(text, bozo): """ convert everything to well formed XML """ if text.has_key('type'): @@ -198,6 +207,8 @@ def source(xsource, source, bozo, format): if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false' # propagate planet inserted information + if source.has_key('planet_name') and not source.has_key('planet_css-id'): + source['planet_css-id'] = cssid(source['planet_name']) for key, value in source.items(): if key.startswith('planet_'): createTextElement(xsource, key.replace('_',':',1), value) diff --git a/tests/test_spider.py b/tests/test_spider.py index 418364c..2bef04a 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -91,6 +91,7 @@ class SpiderTest(unittest.TestCase): self.assertEqual(['application/rss+xml'], [link.type for link in data.entries[0].source.links if link.rel=='self']) 
         self.assertEqual('three', data.entries[0].source.author_detail.name)
+        self.assertEqual('three', data.entries[0].source['planet_css-id'])
 
     def test_spiderPlanet(self):
         config.load(configfile)
diff --git a/themes/asf/index.html.xslt b/themes/asf/index.html.xslt
index b3d2063..1c7468e 100644
--- a/themes/asf/index.html.xslt
+++ b/themes/asf/index.html.xslt
@@ -1,4 +1,5 @@
-
+
diff --git a/themes/asf/personalize.js b/themes/asf/personalize.js index 83db3a3..afcec17 100644 --- a/themes/asf/personalize.js +++ b/themes/asf/personalize.js @@ -159,7 +159,8 @@ function findEntries() { var date = localizeDate(span[i]); var parent = span[i]; - while (parent && parent.className != 'news') { + while (parent && + (!parent.className || parent.className.split(' ')[0] != 'news')) { parent = parent.parentNode; } From c337597302b228a617e312e0c2c6d1a10158ff18 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Thu, 16 Nov 2006 21:46:58 -0500 Subject: [PATCH 20/39] Cleanup --- themes/asf/index.html.xslt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/themes/asf/index.html.xslt b/themes/asf/index.html.xslt index 1c7468e..ca355fe 100644 --- a/themes/asf/index.html.xslt +++ b/themes/asf/index.html.xslt @@ -1,9 +1,9 @@ + xmlns="http://www.w3.org/1999/xhtml" + exclude-result-prefixes="atom planet xhtml"> From 20cb60df7c7e358bed22dcfb08f17eaf1b0e1d50 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 19 Nov 2006 11:56:36 -0500 Subject: [PATCH 21/39] Resync with httplib2 --- planet/httplib2/__init__.py | 138 +++++++++++++++++++++++++----------- planet/spider.py | 61 ++++++++++------ 2 files changed, 134 insertions(+), 65 deletions(-) diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index 08c87b9..3ee6b4f 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -1,3 +1,4 @@ +from __future__ import generators """ httplib2 @@ -8,21 +9,22 @@ Requires Python 2.3 or later """ -from __future__ import generators - __author__ = "Joe Gregorio (joe@bitworking.org)" __copyright__ = "Copyright 2006, Joe Gregorio" __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", "James Antill", "Xavier Verges Farrero", "Jonathan Feinberg", - "Blair Zajac"] + "Blair Zajac", + "Sam Ruby"] __license__ = "MIT" -__version__ = "$Rev: 209 $" +__version__ = "$Rev: 217 $" import re import md5 -import rfc822 +import email +import email.Utils +import email.Message import StringIO import gzip import zlib @@ -114,6 +116,49 @@ def parse_uri(uri): groups = URI.match(uri).groups() return (groups[1], groups[3], groups[4], groups[6], groups[8]) +def urlnorm(uri): + (scheme, authority, path, query, fragment) = parse_uri(uri) + authority = authority.lower() + scheme = scheme.lower() + if not path: + path = "/" + # Could do syntax based normalization of the URI before + # computing the digest. See Section 6.2.2 of Std 66. + request_uri = query and "?".join([path, query]) or path + defrag_uri = scheme + "://" + authority + request_uri + return scheme, authority, request_uri, defrag_uri + + +# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/) +re_url_scheme = re.compile(r'^\w+://') +re_slash = re.compile(r'[?/:|]+') + +def safename(filename): + """Return a filename suitable for the cache. + + Strips dangerous and common characters to create a filename we + can use to store the cache in. 
+ """ + + try: + if re_url_scheme.match(filename): + if isinstance(filename,str): + filename=filename.decode('utf-8').encode('idna') + else: + filename=filename.encode('idna') + except: + pass + if isinstance(filename,unicode): + filename=filename.encode('utf-8') + filemd5 = md5.new(filename).hexdigest() + filename = re_url_scheme.sub("", filename) + filename = re_slash.sub(",", filename) + + # limit length of filename + if len(filename)>200: + filename=filename[:200] + return ",".join((filename, filemd5)) + NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+') def _normalize_headers(headers): return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()]) @@ -211,13 +256,13 @@ def _entry_disposition(response_headers, request_headers): elif cc.has_key('only-if-cached'): retval = "FRESH" elif response_headers.has_key('date'): - date = calendar.timegm(rfc822.parsedate_tz(response_headers['date'])) + date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date'])) now = time.time() current_age = max(0, now - date) if cc_response.has_key('max-age'): freshness_lifetime = int(cc_response['max-age']) elif response_headers.has_key('expires'): - expires = rfc822.parsedate_tz(response_headers['expires']) + expires = email.Utils.parsedate_tz(response_headers['expires']) freshness_lifetime = max(0, calendar.timegm(expires) - date) else: freshness_lifetime = 0 @@ -232,12 +277,14 @@ def _entry_disposition(response_headers, request_headers): def _decompressContent(response, new_content): content = new_content try: - if response.get('content-encoding', None) == 'gzip': - content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read() - response['content-length'] = str(len(content)) - if response.get('content-encoding', None) == 'deflate': - content = zlib.decompress(content) + encoding = response.get('content-encoding', None) + if encoding in ['gzip', 'deflate']: + if encoding == 'gzip': + content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read() + if encoding == 'deflate': + content = zlib.decompress(content) response['content-length'] = str(len(content)) + del response['content-encoding'] except: content = "" raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding')) @@ -250,14 +297,23 @@ def _updateCache(request_headers, response_headers, content, cache, cachekey): if cc.has_key('no-store') or cc_response.has_key('no-store'): cache.delete(cachekey) else: - f = StringIO.StringIO("") - info = rfc822.Message(StringIO.StringIO("")) + info = email.Message.Message() for key, value in response_headers.iteritems(): - info[key] = value - f.write(str(info)) - f.write("\r\n\r\n") - f.write(content) - cache.set(cachekey, f.getvalue()) + if key not in ['status','content-encoding','transfer-encoding']: + info[key] = value + + status = response_headers.status + if status == 304: + status = 200 + + status_header = 'status: %d\r\n' % response_headers.status + + header_str = info.as_string() + + header_str = re.sub("\r(?!\n)|(?0: data.status = 200 - elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout': + elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout': data.status = 408 else: data.status = 500 @@ -380,13 +383,27 @@ def spiderPlanet(only_if_new = False): # is empty which will terminate the thread. 
uri = work_queue.get(block=False) log.info("Fetching %s via %d", uri, thread_index) + resp = feedparser.FeedParserDict({'status':'500'}) + content = None try: - (resp, content) = h.request(uri) - awaiting_parsing.put(block=True, item=(resp, content, uri)) + try: + if isinstance(uri,unicode): + idna = uri.encode('idna') + else: + idna = uri.decode('utf-8').encode('idna') + if idna != uri: log.info("IRI %s mapped to %s", uri, idna) + except: + log.info("unable to map %s to a URI", uri) + idna = uri + (resp, content) = h.request(idna) except gaierror: log.error("Fail to resolve server name %s via %d", uri, thread_index) except error, e: - log.error("HTTP Error: %s in thread-%d", str(e), thread_index) + if e.__class__.__name__.lower()=='timeout': + resp['status'] = '408' + log.warn("Timeout in thread-%d", thread_index) + else: + log.error("HTTP Error: %s in thread-%d", str(e), thread_index) except Exception, e: import sys, traceback type, value, tb = sys.exc_info() @@ -394,6 +411,7 @@ def spiderPlanet(only_if_new = False): for line in (traceback.format_exception_only(type, value) + traceback.format_tb(tb)): log.error(line.rstrip()) + awaiting_parsing.put(block=True, item=(resp, content, uri)) except Empty, e: log.info("Thread %d finished", thread_index) @@ -409,18 +427,15 @@ def spiderPlanet(only_if_new = False): # Process the results as they arrive while work_queue.qsize() or awaiting_parsing.qsize() or threads: - if awaiting_parsing.qsize() == 0 and threads: - time.sleep(1) + while awaiting_parsing.qsize() == 0 and threads: + time.sleep(0.1) while awaiting_parsing.qsize(): item = awaiting_parsing.get(False) try: (resp_headers, content, uri) = item - if not resp_headers.fromcache: - if resp_headers.status < 300: - log.info("Parsing pre-fetched %s", uri) - spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) - else: - log.error("Status code %d from %s", resp_headers.status, uri) + if resp_headers.status == 200 and resp_headers.fromcache: + resp_headers.status = 304 + spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers) except Exception, e: import sys, traceback type, value, tb = sys.exc_info() From 52716d99f7fff0c01c6be4712e817bfb5873ebaf Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Sun, 19 Nov 2006 12:57:44 -0500 Subject: [PATCH 22/39] Make subscription list collapsible --- themes/asf/default.css | 4 ++ themes/asf/index.html.xslt | 91 +++++++++++++++++++------------------- themes/asf/personalize.js | 53 +++++++++++++++++++--- 3 files changed, 98 insertions(+), 50 deletions(-) diff --git a/themes/asf/default.css b/themes/asf/default.css index c5169f0..2de9db5 100644 --- a/themes/asf/default.css +++ b/themes/asf/default.css @@ -146,6 +146,10 @@ h1 { display: inline; } +#footer img { + display: none; +} + /* ----------------------------- Body ---------------------------- */ #body { diff --git a/themes/asf/index.html.xslt b/themes/asf/index.html.xslt index ca355fe..2beb5d9 100644 --- a/themes/asf/index.html.xslt +++ b/themes/asf/index.html.xslt @@ -30,12 +30,52 @@

- -