From a5e1fde287e83779aca83a9f4511fdd5d43dbbd1 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 30 Apr 2007 09:38:09 -0400 Subject: [PATCH] MeMeme and html2xhtml plugins --- .bzrignore | 1 + docs/filters.html | 23 +- docs/index.html | 2 +- examples/opml-top100.ini | 7 + filters/html2xhtml.plugin | 6 + filters/mememe.plugin | 475 +++++++++++++++++++++ planet/config.py | 9 +- planet/html5lib/html5parser.py | 172 +++++--- planet/html5lib/inputstream.py | 15 +- planet/html5lib/tokenizer.py | 42 +- planet/html5lib/treebuilders/_base.py | 11 +- planet/html5lib/treebuilders/dom.py | 43 +- planet/html5lib/treebuilders/etreefull.py | 8 + planet/html5lib/treebuilders/simpletree.py | 47 +- planet/shell/__init__.py | 1 + planet/shell/plugin.py | 64 +++ planet/spider.py | 1 - runtests.py | 25 +- tests/data/apply/config-mememe.ini | 29 ++ tests/test_apply.py | 16 +- 20 files changed, 878 insertions(+), 119 deletions(-) create mode 100644 filters/html2xhtml.plugin create mode 100644 filters/mememe.plugin create mode 100644 planet/shell/plugin.py create mode 100644 tests/data/apply/config-mememe.ini diff --git a/.bzrignore b/.bzrignore index 1d1886c..a8f0629 100644 --- a/.bzrignore +++ b/.bzrignore @@ -1,3 +1,4 @@ *.tmplc .DS_Store cache +*.pluginc diff --git a/docs/filters.html b/docs/filters.html index 58eb6fe..228f323 100644 --- a/docs/filters.html +++ b/docs/filters.html @@ -8,12 +8,13 @@ Venus Filters -

Filters

-

Filters are simple Unix pipes. Input comes in stdin, -parameters come from the config file, and output goes to stdout. -Anything written to stderr is logged as an ERROR message. If no -stdout is produced, the entry is not written to the cache or -processed further; in fact, if the entry had previously been written to the cache, it will be removed.

+

Filters and Plugins

+

Filters and plugins are simple Unix pipes. Input comes in +stdin, parameters come from the config file, and output goes to +stdout. Anything written to stderr is logged as an +ERROR message. If no stdout is produced, the entry is not written +to the cache or processed further; in fact, if the entry had previously been +written to the cache, it will be removed.
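For illustration only (not part of this patch), a complete filter can be as
small as the following; the file name strip-enclosures.py and the behaviour it
implements are hypothetical:

  import sys

  # the entry arrives on stdin as a normalized Atom document
  data = sys.stdin.read()

  # suppress entries that carry enclosures; producing no stdout
  # causes the entry to be dropped (and purged from the cache)
  if 'rel="enclosure"' not in data:
      sys.stdout.write(data)

Anything such a script wrote to stderr would show up in the planet log as an
ERROR message.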

There are two types of filters supported by Venus: input and template.

Input to an input filter is an aggressively @@ -89,6 +90,16 @@ an HTML output stream from one source.

  • Templates written using htmltmpl or django currently only have access to a fixed set of fields, whereas XSLT and genshi templates have access to everything.
  • + +
  • Plugins differ from filters in that while filters are forked, plugins are +imported. This means that plugins are limited to Python and are run in-process. Plugins therefore have direct access to planet internals like configuration and logging facilities, as well as to the bundled libraries like the Universal Feed Parser and html5lib; but it also means that calls like os.abort() can't be recovered from. A minimal plugin is sketched below.
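As a sketch of the in-process model (the file name stamp.plugin and the
footnote option are invented for this example; the config and logging calls
are the same ones mememe.plugin uses):

  import sys
  import planet
  from planet import config

  # plugins run inside the planet process, so they can use its
  # configuration and logging facilities directly
  log = planet.getLogger(config.log_level(), config.log_format())
  options = config.filter_options(sys.argv[0])   # the [stamp.plugin] section

  data = sys.stdin.read()
  log.debug("annotating %d bytes of output", len(data))
  note = '<p>%s</p></body>' % options.get('footnote', 'Generated by Venus')
  sys.stdout.write(data.replace('</body>', note, 1))

As with filters, whatever the plugin writes to stdout becomes the output; the
difference is that an uncaught exception is trapped by the plugin runner and
logged, rather than terminating a separate process.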
  • diff --git a/docs/index.html b/docs/index.html index ebdd234..c461d7f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -21,7 +21,7 @@
  • Other diff --git a/examples/opml-top100.ini b/examples/opml-top100.ini index 0522472..01b210d 100644 --- a/examples/opml-top100.ini +++ b/examples/opml-top100.ini @@ -36,6 +36,13 @@ filters = excerpt.py omit = img p br width = 500 +# add memes to output +[index.html.tmpl] +filters = mememe.plugin + +[mememe.plugin] +sidebar = //*[@id="footer"] + # subscription list [http://share.opml.org/opml/top100.opml] content_type = opml diff --git a/filters/html2xhtml.plugin b/filters/html2xhtml.plugin new file mode 100644 index 0000000..456df48 --- /dev/null +++ b/filters/html2xhtml.plugin @@ -0,0 +1,6 @@ +import sys +from planet import html5lib +tree=html5lib.treebuilders.dom.TreeBuilder +parser = html5lib.html5parser.HTMLParser(tree=tree) +document = parser.parse(sys.stdin) +sys.stdout.write(document.toxml("utf-8")) diff --git a/filters/mememe.plugin b/filters/mememe.plugin new file mode 100644 index 0000000..2ce3b30 --- /dev/null +++ b/filters/mememe.plugin @@ -0,0 +1,475 @@ +# +# This Venus output filter will annotate an XHTML page with a list of +# "memes" (or most popular linked destinations, based on the last week +# of entries from the cache) and will update the subscription list with +# links to recent entries from each subscription. +# +# Templates that don't produce XHTML natively will need their output passed +# through html2xhtml.plugin first. +# +# Typical configuration (based on classic_fancy): +# +# [index.html.tmpl] +# filters: +# html2xhtml.plugin +# mememe.plugin +# +# [mememe.plugin] +# sidebar = @class='sidebar' +# + +import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5 +from xml.sax.saxutils import escape +from htmlentitydefs import entitydefs + +import planet +from planet import config, feedparser +from planet.spider import filename +log = planet.getLogger(config.log_level(),config.log_format()) +options = config.filter_options(sys.argv[0]) + +MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom') + +now = time.time() +week = 7 * 86400 +week_ago = now - week + +cache = config.cache_directory() +meme_cache = os.path.join(cache, 'memes') +if not os.path.exists(meme_cache): os.makedirs(meme_cache) + +all_links = {} +feed_links = {} + +def check_cache(url): + try: + file = open(filename(meme_cache, url)) + headers = eval(file.read()) + file.close() + return headers or {} + except: + return {} + +def cache_meme(url, headers): + json = [] + for key,value in headers.items(): + json.append(' %s: %s' % (toj(key), toj(value))) + file = open(filename(meme_cache, url),'w') + file.write('{\n' + ',\n'.join(json) + '\n}\n') + file.close() + +urlmap = {} +def canonicalize(url): + url = urlmap.get(url,url) + parts = list(urlparse.urlparse(url)) + + parts[0] = parts[0].lower() + parts[1] = parts[1].lower() + if parts[1].startswith('www.'): parts[1]=parts[1][4:] + if not parts[2]: parts[2] = '/' + parts[-1] = '' + return urlparse.urlunparse(parts) + +log.debug("Loading cached data") +for name in glob.glob(os.path.join(cache, '*')): + # ensure that this is within the past week + if os.path.isdir(name): continue + mtime = os.stat(name).st_mtime + if mtime < week_ago: continue + + # parse the file + try: + doc = libxml2.parseFile(name) + except: + continue + xp = doc.xpathNewContext() + xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom") + xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/") + + # determine the entry + entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']") + if not entry: continue + entry = canonicalize(entry[0].prop("href")) + 
+ # determine the title + title = xp.xpathEval("/atom:entry/atom:title") + if title: + if title[0].prop('type') == 'html': + title = re.sub('<.*?>','',title[0].content) + else: + title = title[0].content + title = str(title or '') + + # determine the feed id + feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup") + if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id") + if not feed: continue + feed = feed[0].content + + # determine the author + author = xp.xpathEval("/atom:entry/atom:source/planet:name") + if author: + author = author[0].content + else: + author = '' + + # track the feed_links + if author: + if not feed_links.has_key(author): feed_links[author] = list() + feed_links[author].append([mtime, entry, title]) + + # identify the unique links + entry_links = [] + for node in doc.xpathEval("//*[@href and not(@rel='source')]"): + parent = node.parent + while parent: + if parent.name == 'source': break + parent = parent.parent + else: + link = canonicalize(node.prop('href')) + if not link in entry_links: + entry_links.append(link) + if node.hasProp('title') and node.prop('title').startswith('http'): + link = canonicalize(node.prop('title')) + if not link in entry_links: + entry_links.append(link) + + # add the votes + weight = 1.0 - (now - mtime)**2 / week**2 + vote = [(weight, str(entry), str(feed), title, author, mtime)] + for link in entry_links: + all_links[link] = all_links.get(link,list()) + vote + + # free the entry + doc.freeDoc() + +# tally the votes +weighted_links = [] +for link, votes in all_links.items(): + site = {} + updated = 0 + for weight, entry, feed, title, author, mtime in votes: + site[feed] = max(site.get(feed,0), weight) + if mtime > updated: updated=mtime + weighted_links.append((sum(site.values()), link, updated)) +weighted_links.sort() +weighted_links.reverse() + +cp1252 = { + 128: 8364, # euro sign + 130: 8218, # single low-9 quotation mark + 131: 402, # latin small letter f with hook + 132: 8222, # double low-9 quotation mark + 133: 8230, # horizontal ellipsis + 134: 8224, # dagger + 135: 8225, # double dagger + 136: 710, # modifier letter circumflex accent + 137: 8240, # per mille sign + 138: 352, # latin capital letter s with caron + 139: 8249, # single left-pointing angle quotation mark + 140: 338, # latin capital ligature oe + 142: 381, # latin capital letter z with caron + 145: 8216, # left single quotation mark + 146: 8217, # right single quotation mark + 147: 8220, # left double quotation mark + 148: 8221, # right double quotation mark + 149: 8226, # bullet + 150: 8211, # en dash + 151: 8212, # em dash + 152: 732, # small tilde + 153: 8482, # trade mark sign + 154: 353, # latin small letter s with caron + 155: 8250, # single right-pointing angle quotation mark + 156: 339, # latin small ligature oe + 158: 382, # latin small letter z with caron + 159: 376} # latin capital letter y with diaeresis + +# determine the title for a given url +class html(sgmllib.SGMLParser): + def __init__(self, url): + sgmllib.SGMLParser.__init__(self) + self.title = "" + self.feedurl = "" + self.intitle = False + + headers = check_cache(url) + + try: + # fetch the page + request = urllib2.Request(url) + request.add_header('User-Agent', 'Venus/MeMeme') + if headers.has_key('etag'): + request.add_header('If-None-Match', headers['etag']) + if headers.has_key('last_modified'): + request.add_header('If-Modified-Since', headers['last-modified']) + response = urllib2.urlopen(request) + self.feed(response.read()) + + # ensure the data is in utf-8 + try: + 
self.title = self.title.decode('utf-8') + except: + self.title = ''.join([unichr(cp1252.get(ord(c),ord(c))) + for c in self.title.decode('iso-8859-1')]) + + # cache the results + headers = {} + if self.feedurl: headers['feedurl'] = self.feedurl + if self.title: headers['title'] = self.title + headers.update(response.headers) + cache_meme(url, headers) + except: + self.feedurl = headers.get('feedurl') + if headers.has_key('title'): + if isinstance(headers['title'],str): + self.title=eval('u'+repr(headers['title']).replace('\\\\','\\')) + else: + self.title=headers['title'] + + # if there is a feed, look for an entry that matches, and take that title + if self.feedurl and not self.title: + headers = check_cache(self.feedurl) + data = feedparser.parse(self.feedurl, etag=headers.get('etag'), + modified=headers.get('last-modified')) + + if data.has_key('headers') and data.has_key('status') and \ + data.status in [200, 301, 302]: + + titles = {} + for entry in data.entries: + if entry.has_key('title_detail') and entry.has_key('link'): + titles[entry.link] = entry.title_detail.value + if entry.title_detail.type == 'text/plain': + titles[entry.link] = escape(titles[entry.link]) + + if titles.has_key(url): self.title = titles[url] + + data.headers.update(titles) + cache_meme(self.feedurl, data.headers) + else: + if headers.has_key(url): + if isinstance(headers[url],str): + self.title=eval('u'+repr(headers[url]).replace('\\\\','\\')) + else: + self.title=headers[url] + + # fallback is the basename of the URI + if not self.title: + self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0]) + + # parse out the first autodiscovery link + def start_link(self, attrs): + if self.feedurl: return + attrs = dict(map(lambda (k,v): (k.lower(),v), attrs)) + if not 'rel' in attrs: return + rels = attrs['rel'].split(' ') + if 'alternate' not in rels: return + if not 'type' in attrs or not attrs['type'].endswith('xml'): return + if 'href' in attrs: + self.feedurl = attrs['href'] + + # parse the page title + def start_title(self, attributes): + if not self.title: self.intitle = True + def end_title(self): + self.intitle = False + def handle_data(self, text): + if self.intitle: self.title += escape(text) + +# convert unicode string to a json string +def toj(value): + result = repr(value).replace(r'\x',r'\u00') + if result[:1] == 'u': result=result[1:] + if result.startswith("'"): + result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1] + return result + +seenit = [] +count = 0 + +# construct an empty feed +feed_doc = libxml2.newDoc("1.0") +meme_feed = feed_doc.newChild(None, "feed", None) +meme_feed.newNs('http://www.w3.org/2005/Atom', None) +meme_feed.newTextChild(None, 'title', config.name() + ': Memes') +author = meme_feed.newChild(None, 'author', None) +author.newTextChild(None, 'name', config.owner_name()) +if config.owner_email: author.newTextChild(None, 'email', config.owner_email()) +meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom')) +link = meme_feed.newChild(None, 'link', None) +link.setProp('href', os.path.join(config.link(), 'memes.atom')) +link.setProp('rel', 'self') +meme_feed.newTextChild(None, 'updated', + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())) + +# parse the input +log.debug("Parse input") +doc=libxml2.parseDoc(sys.stdin.read()) + +# find the sidebar/footer +sidebar = options.get('sidebar','//*[@class="sidebar"]') +footer = doc.xpathEval(sidebar) +if not hasattr(footer,'__len__') or len(footer) == 0: + raise Exception(sidebar + ' not 
found') +if len(footer) > 1: + log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar)) +footer = footer[0] + +# add up to 10 entry links to each subscription +subs_ul = footer.children +while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next +child = subs_ul.children +while child: + if child.name == 'li': + if child.lastChild().name == 'ul': child.lastChild().unlinkNode() + link = child.lastChild() + while link.isText(): link=link.prev + author = link.getContent() + state = 'inactive' + if feed_links.has_key(author): + ul2 = child.newChild(None, 'ul', None) + feed_links[author].sort() + feed_links[author].reverse() + link_count = 0 + for mtime, entry, title in feed_links[author]: + if not title: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a', title) + a.setProp('href', entry) + link_count = link_count + 1 + if link_count >= 10: break + if link_count > 0: state = None + if state: + link.setProp('class',((link.prop('class') or '') + ' ' + state).strip()) + child=child.next + +# create a h2 and ul for the memes list +footer_top = footer.children +memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes ')) +memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None)) + +# create a header for the memes list +a = memes.newChild(None, 'a', None) +a.setProp('href', 'memes.atom') +img = a.newChild(None, 'img', None) +img.setProp('src', 'images/feed-icon-10x10.png') + +# collect the results +log.debug("Fetch titles and collect the results") +from urllib import quote_plus +for i in range(0,len(weighted_links)): + weight, link, updated = weighted_links[i] + + # ensure that somebody new points to this entry. This guards against + # groups of related links which several posts point to all. + novel = False + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry not in seenit: + seenit.append(entry) + novel = True + if not novel: continue + + all_links[link].sort() + all_links[link].reverse() + cache_file = filename(cache, link) + title = None + + # when possible, take the title from the cache + if os.path.exists(cache_file): + entry = feedparser.parse(cache_file).entries[0] + if entry.has_key('title_detail'): + title = entry.title_detail.value + if entry.title_detail.type == 'text/plain': title = escape(title) + + # otherwise, parse the html + if not title: + title = html(link).title + + # dehtmlize + title = re.sub('&(\w+);', + lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title) + title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title) + title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title) + + # title too long? 
Insert zero width spaces where appropriate + if max(map(len,title.split())) > 30: + title=re.sub('(\W+)',u'\\1\u200b',title) + + # save the entry title (it is used later) + entry_title = title.strip() + + # add to the memes list + memes_ul.addContent('\n') + li = memes_ul.newChild(None, 'li', None) + memes_ul.addContent('\n') + + # technorati link + a = li.newChild(None, 'a', None) + tlink = 'http://technorati.com/cosmos/search.html?url=' + if link.startswith('http://'): + a.setProp('href',tlink + quote_plus(link[7:])) + else: + a.setProp('href',tlink + quote_plus(link)) + a.setProp('title','cosmos') + img = a.newChild(None, 'img', None) + img.setProp('src','tcosm11.gif') + + # main link + a = li.newTextChild(None, 'a', title.strip().encode('utf-8')) + a.setProp('href',link) + if (((i==0) or (updated>=weighted_links[i-1][2])) and + (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))): + rank = 0 + for j in range(0,len(weighted_links)): + if updated < weighted_links[j][2]: rank = rank + 1 + if rank < len(weighted_links)/2: + a.setProp('class','rising') + + # voters + ul2 = li.newChild(None, 'ul', None) + voters = [] + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry in voters: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a' , author) + a.setProp('href',entry) + if title: a.setProp('title',title) + voters.append(entry) + + # add to the meme feed + if len(all_links[link]) > 2: + meme_feed.addContent('\n') + entry = meme_feed.newChild(None, 'entry', None) + meme_feed.addContent('\n') + + # entry + tagbase = config.link().split('/') + if not tagbase[-1]: tagbase = tagbase[:-1] + tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:])) + entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest()) + entry.newTextChild(None, 'title', entry_title.encode('utf-8')) + meme_link = entry.newTextChild(None, 'link', None) + meme_link.setProp('href', link) + entry.newTextChild(None, 'updated', + time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated))) + + # voters + content = entry.newChild(None, 'content', None) + content.setProp('type', 'xhtml') + div = content.newTextChild(None, 'div', 'Spotted by:') + div.newNs('http://www.w3.org/1999/xhtml', None) + content_ul = div.newChild(None, 'ul', None) + for weight, entry, feed, title, author, mtime in all_links[link]: + li2 = content_ul.newTextChild(None, 'li', author + ": ") + a = li2.newTextChild(None, 'a' , title or 'untitled') + a.setProp('href',entry) + + count = count + 1 + if count >= 10: break + +log.info("Writing " + MEMES_ATOM) +output=open(MEMES_ATOM,'w') +output.write(feed_doc.serialize('utf-8')) +output.close() + +sys.stdout.write(doc.serialize('utf-8')) diff --git a/planet/config.py b/planet/config.py index d2b84e6..fb436e8 100644 --- a/planet/config.py +++ b/planet/config.py @@ -352,14 +352,15 @@ def filters(section=None): filters = [] if parser.has_option('Planet', 'filters'): filters += parser.get('Planet', 'filters').split() - if section and parser.has_option(section, 'filters'): - filters += parser.get(section, 'filters').split() if filter(section): filters.append('regexp_sifter.py?require=' + urllib.quote(filter(section))) if exclude(section): filters.append('regexp_sifter.py?exclude=' + urllib.quote(exclude(section))) + for section in section and [section] or template_files(): + if parser.has_option(section, 'filters'): + filters += parser.get(section, 'filters').split() return filters def planet_options(): @@ -382,6 +383,10 @@ def 
template_options(section): """ dictionary of template specific options""" return feed_options(section) +def filter_options(section): + """ dictionary of filter specific options""" + return feed_options(section) + def write(file=sys.stdout): """ write out an updated template """ print parser.write(file) diff --git a/planet/html5lib/html5parser.py b/planet/html5lib/html5parser.py index a007616..898ec9f 100644 --- a/planet/html5lib/html5parser.py +++ b/planet/html5lib/html5parser.py @@ -71,35 +71,40 @@ class HTMLParser(object): "trailingEnd": TrailingEndPhase(self, self.tree) } - def parse(self, stream, encoding=None, innerHTML=False): - """Parse a HTML document into a well-formed tree - - stream - a filelike object or string containing the HTML to be parsed - - innerHTML - Are we parsing in innerHTML mode (note innerHTML=True - is not yet supported) - - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) - """ - + def _parse(self, stream, innerHTML=False, container="div", + encoding=None): + self.tree.reset() self.firstStartTag = False self.errors = [] - self.phase = self.phases["initial"] + self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding, + parseMeta=innerHTML) + + if innerHTML: + self.innerHTML = container.lower() + + if self.innerHTML in ('title', 'textarea'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] + elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"] + elif self.innerHTML == 'plaintext': + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"] + else: + # contentModelFlag already is PCDATA + #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"] + pass + self.phase = self.phases["rootElement"] + self.phase.insertHtmlElement() + self.resetInsertionMode() + else: + self.innerHTML = False + self.phase = self.phases["initial"] + # We only seem to have InBodyPhase testcases where the following is # relevant ... need others too self.lastPhase = None - # We don't actually support innerHTML yet but this should allow - # assertations - self.innerHTML = innerHTML - - self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding) - # XXX This is temporary for the moment so there isn't any other # changes needed for the parser to work with the iterable tokenizer for token in self.tokenizer: @@ -118,7 +123,34 @@ class HTMLParser(object): # When the loop finishes it's EOF self.phase.processEOF() + def parse(self, stream, encoding=None): + """Parse a HTML document into a well-formed tree + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, innerHTML=False, encoding=encoding) return self.tree.getDocument() + + def parseFragment(self, stream, container="div", encoding=None): + """Parse a HTML fragment into a well-formed tree fragment + + container - name of the element we're setting the innerHTML property + if set to None, default to 'div' + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. 
If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, True, container=container, encoding=encoding) + return self.tree.getFragment() def parseError(self, data="XXX ERROR MESSAGE NEEDED"): # XXX The idea is to make data mandatory. @@ -187,28 +219,29 @@ class HTMLParser(object): "frameset":"inFrameset" } for node in self.tree.openElements[::-1]: + nodeName = node.name if node == self.tree.openElements[0]: last = True - if node.name not in ['td', 'th']: + if nodeName not in ['td', 'th']: # XXX assert self.innerHTML - raise NotImplementedError + nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case - if node.name in ("select", "colgroup", "head", "frameset"): + if nodeName in ("select", "colgroup", "head", "frameset"): # XXX assert self.innerHTML - if node.name in newModes: - self.phase = self.phases[newModes[node.name]] + if nodeName in newModes: + self.phase = self.phases[newModes[nodeName]] break - elif node.name == "html": + elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] else: self.phase = self.phases["afterHead"] break elif last: - self.phase = self.phases["body"] + self.phase = self.phases["inBody"] break class Phase(object): @@ -434,9 +467,7 @@ class InHeadPhase(Phase): self.parser.phase.processCharacters(data) def startTagHead(self, name, attributes): - self.tree.insertElement(name, attributes) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored")) def startTagTitle(self, name, attributes): element = self.tree.createElement(name, attributes) @@ -455,10 +486,11 @@ class InHeadPhase(Phase): self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] def startTagScript(self, name, attributes): + #XXX Inner HTML case may be wrong element = self.tree.createElement(name, attributes) element._flags.append("parser-inserted") - if self.tree.headPointer is not None and\ - self.parser.phase == self.parser.phases["inHead"]: + if (self.tree.headPointer is not None and + self.parser.phase == self.parser.phases["inHead"]): self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) @@ -653,8 +685,8 @@ class InBodyPhase(Phase): def startTagBody(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (body).")) - if len(self.tree.openElements) == 1 \ - or self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 + or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: for attr, value in attributes.iteritems(): @@ -1179,6 +1211,7 @@ class InTablePhase(Phase): self.parser.resetInsertionMode() else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagIgnore(self, name): @@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", True) + def processCharacters(self, data): self.parser.phases["inBody"].processCharacters(data) def startTagTableElement(self, name, attributes): self.parser.parseError() + #XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX how do we know the tag is _always_ ignored in the innerHTML - # case and therefore 
shouldn't be processed again? I'm not sure this - # strategy makes sense... - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inBody"].processStartTag(name, attributes) def endTagCaption(self, name): - if self.tree.elementInScope(name, True): + if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": @@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase): self.parser.phase = self.parser.phases["inTable"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX ... - if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + if not ignoreEndTag: + self.parser.phase.processEndTag(name) def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ @@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" + def processCharacters(self, data): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processCharacters(data) def startTagCol(self, name ,attributes): @@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() def startTagOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def endTagColgroup(self, name): - if self.tree.openElements[-1].name == "html": + if self.ignoreEndTagColgroup(): # innerHTML case + assert self.parser.innerHTML self.parser.parseError() else: self.tree.openElements.pop() @@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase): u"col has no end tag.")) def endTagOther(self, name): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) @@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase): def startTagTableOther(self, name, attributes): # XXX AT Any ideas on how to share this with endTagTable? - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processStartTag(name, attributes) @@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase): ") in the table body phase. 
Ignored.")) def endTagTable(self, name): - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processEndTag(name) @@ -1428,6 +1468,9 @@ class InRowPhase(Phase): self.tree.openElements[-1].name + u") in the row phase.")) self.tree.openElements.pop() + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", tableVariant=True) + # the rest def processCharacters(self, data): self.parser.phases["inTable"].processCharacters(data) @@ -1439,28 +1482,31 @@ class InRowPhase(Phase): self.tree.activeFormattingElements.append(Marker) def startTagTableOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inTable"].processStartTag(name, attributes) def endTagTr(self, name): - if self.tree.elementInScope("tr", True): + if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTableBody"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) def endTagTableRowGroup(self, name): @@ -1628,7 +1674,7 @@ class InSelectPhase(Phase): u"select phase. Ignored.")) def endTagSelect(self, name): - if self.tree.elementInScope(name, True): + if self.tree.elementInScope("select", True): node = self.tree.openElements.pop() while node.name != "select": node = self.tree.openElements.pop() @@ -1641,7 +1687,7 @@ class InSelectPhase(Phase): self.parser.parseError(_(u"Unexpected table end tag (" + name +\ ") in the select phase.")) if self.tree.elementInScope(name, True): - self.endTagSelect() + self.endTagSelect("select") self.parser.phase.processEndTag(name) def endTagOther(self, name): @@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase): u"in the frameset phase (innerHTML).")) else: self.tree.openElements.pop() - if not self.parser.innerHTML and\ - self.tree.openElements[-1].name != "frameset": + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): # If we're not in innerHTML mode and the the current node is not a # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] diff --git a/planet/html5lib/inputstream.py b/planet/html5lib/inputstream.py index 9140456..e197415 100644 --- a/planet/html5lib/inputstream.py +++ b/planet/html5lib/inputstream.py @@ -14,7 +14,7 @@ class HTMLInputStream(object): """ - def __init__(self, source, encoding=None, chardet=True): + def __init__(self, source, encoding=None, parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -26,6 +26,8 @@ class HTMLInputStream(object): the encoding. 
If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) + + parseMeta - Look for a element containing encoding information """ # List of where new lines occur @@ -41,12 +43,9 @@ class HTMLInputStream(object): #Encoding to use if no other information can be found self.defaultEncoding = "windows-1252" - #Autodetect encoding if no other information can be found? - self.chardet = chardet - #Detect encoding iff no explicit "transport level" encoding is supplied if encoding is None or not isValidEncoding(encoding): - encoding = self.detectEncoding() + encoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = encoding # Read bytes from stream decoding them into Unicode @@ -79,17 +78,17 @@ class HTMLInputStream(object): stream = cStringIO.StringIO(str(source)) return stream - def detectEncoding(self): + def detectEncoding(self, parseMeta=True, chardet=True): #First look for a BOM #This will also read past the BOM if present encoding = self.detectBOM() #If there is no BOM need to look for meta elements with encoding #information - if encoding is None: + if encoding is None and parseMeta: encoding = self.detectEncodingMeta() #Guess with chardet, if avaliable - if encoding is None and self.chardet: + if encoding is None and chardet: try: import chardet buffer = self.rawStream.read() diff --git a/planet/html5lib/tokenizer.py b/planet/html5lib/tokenizer.py index 3f4db08..584b268 100644 --- a/planet/html5lib/tokenizer.py +++ b/planet/html5lib/tokenizer.py @@ -32,8 +32,8 @@ class HTMLTokenizer(object): # XXX need to fix documentation - def __init__(self, stream, encoding=None): - self.stream = HTMLInputStream(stream, encoding) + def __init__(self, stream, encoding=None, parseMeta=True): + self.stream = HTMLInputStream(stream, encoding, parseMeta) self.states = { "data":self.dataState, @@ -338,31 +338,33 @@ class HTMLTokenizer(object): self.state = self.states["closeTagOpen"] else: self.tokenQueue.append({"type": "Characters", "data": u"<"}) - self.stream.queue.append(data) + self.stream.queue.insert(0, data) self.state = self.states["data"] return True def closeTagOpenState(self): - if self.contentModelFlag in (contentModelFlags["RCDATA"],\ - contentModelFlags["CDATA"]): - charStack = [] + if (self.contentModelFlag in (contentModelFlags["RCDATA"], + contentModelFlags["CDATA"])): + if self.currentToken: + charStack = [] - # So far we know that "", u"/", u"<", EOF))): # Because the characters are correct we can safely switch to diff --git a/planet/html5lib/treebuilders/_base.py b/planet/html5lib/treebuilders/_base.py index 2502466..6c7bb0b 100755 --- a/planet/html5lib/treebuilders/_base.py +++ b/planet/html5lib/treebuilders/_base.py @@ -108,6 +108,9 @@ class TreeBuilder(object): #The class to use for creating doctypes doctypeClass = None + + #Fragment class + fragmentClass = None def __init__(self): self.reset() @@ -294,7 +297,6 @@ class TreeBuilder(object): fosterParent = self.openElements[ self.openElements.index(lastTable) - 1] else: - assert self.innerHTML fosterParent = self.openElements[0] return fosterParent, insertBefore @@ -310,6 +312,13 @@ class TreeBuilder(object): def getDocument(self): "Return the final tree" return self.document + + def getFragment(self): + "Return the final fragment" + #assert self.innerHTML + fragment = self.fragmentClass() + self.openElements[0].reparentChildren(fragment) + return fragment def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests diff --git 
a/planet/html5lib/treebuilders/dom.py b/planet/html5lib/treebuilders/dom.py index 8b52d6a..bfaa880 100755 --- a/planet/html5lib/treebuilders/dom.py +++ b/planet/html5lib/treebuilders/dom.py @@ -1,6 +1,8 @@ import _base from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE import new +from xml.sax.saxutils import escape +from constants import voidElements import re illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") @@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder): def commentClass(self, data): return NodeBuilder(self.dom.createComment(data)) + + def fragmentClass(self): + return NodeBuilder(self.dom.createDocumentFragment()) def appendChild(self, node): self.dom.appendChild(node.element) @@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder): def getDocument(self): return self.dom + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element def insertText(self, data, parent=None): data=illegal_xml_chars.sub(u'\uFFFD',data) @@ -118,7 +126,9 @@ def testSerializer(element): if element.nodeType == Node.DOCUMENT_TYPE_NODE: rv.append("|%s"%(' '*indent, element.name)) elif element.nodeType == Node.DOCUMENT_NODE: - rv.append("#document") + rv.append("#document") + elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + rv.append("#document-fragment") elif element.nodeType == Node.COMMENT_NODE: rv.append("|%s"%(' '*indent, element.nodeValue)) elif element.nodeType == Node.TEXT_NODE: @@ -135,6 +145,32 @@ def testSerializer(element): return "\n".join(rv) +class HTMLSerializer(object): + def serialize(self, node): + rv = self.serializeNode(node) + for child in node.childNodes: + rv += self.serialize(child) + if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements: + rv += "\n"%node.nodeName + return rv + + def serializeNode(self, node): + if node.nodeType == Node.TEXT_NODE: + rv = node.nodeValue + elif node.nodeType == Node.ELEMENT_NODE: + rv = "<%s"%node.nodeName + if node.hasAttributes(): + rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in + node.attributes.items()]) + rv += ">" + elif node.nodeType == Node.COMMENT_NODE: + rv = "" % escape(node.nodeValue) + elif node.nodeType == Node.DOCUMENT_TYPE_NODE: + rv = "" % node.name + else: + rv = "" + return rv + def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): if node.nodeType == Node.ELEMENT_NODE: if not nsmap: @@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): elif node.nodeType == Node.DOCUMENT_NODE: handler.startDocument() for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endDocument() + handler.endDocument() + + elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: dom2sax(child, handler, nsmap) else: # ATTRIBUTE_NODE diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py index acead55..2629664 100644 --- a/planet/html5lib/treebuilders/etreefull.py +++ b/planet/html5lib/treebuilders/etreefull.py @@ -129,6 +129,10 @@ class Document(Element): def __init__(self): Element.__init__(self, Document) +class DocumentFragment(Element): + def __init__(self): + Element.__init__(self, DocumentFragment) + def testSerializer(element): rv = [] finalText = None @@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder): doctypeClass = DocumentType elementClass = Element commentClass = Comment + fragmentClass = DocumentFragment def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.document._element + + def 
getFragment(self): + return _base.TreeBuilder.getFragment(self)._element diff --git a/planet/html5lib/treebuilders/simpletree.py b/planet/html5lib/treebuilders/simpletree.py index 6b2f09e..05dc0c0 100755 --- a/planet/html5lib/treebuilders/simpletree.py +++ b/planet/html5lib/treebuilders/simpletree.py @@ -4,6 +4,7 @@ from xml.sax.saxutils import escape # Really crappy basic implementation of a DOM-core like thing class Node(_base.Node): + type = -1 def __init__(self, name): self.name = name self.parent = None @@ -11,15 +12,18 @@ class Node(_base.Node): self.childNodes = [] self._flags = [] + def __iter__(self): + for node in self.childNodes: + yield node + for item in node: + yield item + def __unicode__(self): return self.name def toxml(self): raise NotImplementedError - def __repr__(self): - return "<%s %s>" % (self.__class__, self.name) - def printTree(self, indent=0): tree = '\n|%s%s' % (' '* indent, unicode(self)) for child in self.childNodes: @@ -69,6 +73,7 @@ class Node(_base.Node): return bool(self.childNodes) class Document(Node): + type = 1 def __init__(self): Node.__init__(self, None) @@ -93,7 +98,13 @@ class Document(Node): tree += child.printTree(2) return tree +class DocumentFragment(Document): + type = 2 + def __unicode__(self): + return "#document-fragment" + class DocumentType(Node): + type = 3 def __init__(self, name): Node.__init__(self, name) @@ -106,6 +117,7 @@ class DocumentType(Node): return '<!DOCTYPE %s>' % self.name class TextNode(Node): + type = 4 def __init__(self, value): Node.__init__(self, None) self.value = value @@ -119,6 +131,7 @@ class TextNode(Node): hilite = toxml class Element(Node): + type = 5 def __init__(self, name): Node.__init__(self, name) self.attributes = {} @@ -164,6 +177,7 @@ class Element(Node): return tree class CommentNode(Node): + type = 6 def __init__(self, data): Node.__init__(self, None) self.data = data @@ -177,11 +191,38 @@ class CommentNode(Node): def hilite(self): return '<!--%s-->' % escape(self.data) +class HTMLSerializer(object): + def serialize(self, node): + rv = self.serializeNode(node) + for child in node.childNodes: + rv += self.serialize(child) + if node.type == Element.type and node.name not in voidElements: + rv += "\n"%node.name + return rv + + def serializeNode(self, node): + if node.type == TextNode.type: + rv = node.value + elif node.type == Element.type: + rv = "<%s"%node.name + if node.attributes: + rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in + node.attributes.iteritems()]) + rv += ">" + elif node.type == CommentNode.type: + rv = "" % escape(node.data) + elif node.type == DocumentType.type: + rv = "" % node.name + else: + rv = "" + return rv + class TreeBuilder(_base.TreeBuilder): documentClass = Document doctypeClass = DocumentType elementClass = Element commentClass = CommentNode + fragmentClass = DocumentFragment def testSerializer(self, node): return node.printTree() diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py index 8d28045..dd2abd3 100644 --- a/planet/shell/__init__.py +++ b/planet/shell/__init__.py @@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'): # Execute the shell module options = planet.config.template_options(template_file) + if module_name == 'plugin': options['__file__'] = template_file options.update(extra_options) log.debug("Processing %s %s using %s", mode, os.path.realpath(template_resolved), module_name) diff --git a/planet/shell/plugin.py b/planet/shell/plugin.py new file mode 100644 index 0000000..dd94380 --- /dev/null +++ 
b/planet/shell/plugin.py @@ -0,0 +1,64 @@ +import os, sys, imp +from StringIO import StringIO + +def run(script, doc, output_file=None, options={}): + """ process an Python script using imp """ + save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv) + plugin_stdout = StringIO() + plugin_stderr = StringIO() + + try: + # redirect stdin + sys.stdin = StringIO(doc) + + # redirect stdout + if output_file: + sys.stdout = open(output_file, 'w') + else: + sys.stdout = plugin_stdout + + # redirect stderr + sys.stderr = plugin_stderr + + # determine __file__ value + if options.has_key("__file__"): + plugin_file = options["__file__"] + del options["__file__"] + else: + plugin_file = script + + # set sys.argv + options = sum([['--'+key, value] for key,value in options.items()], []) + sys.argv = [plugin_file] + options + + # import script + handle = open(script, 'r') + cwd = os.getcwd() + try: + try: + try: + description=('.plugin', 'rb', imp.PY_SOURCE) + imp.load_module('__main__',handle,plugin_file,description) + except SystemExit,e: + if e.code: log.error('%s exit rc=%d',(plugin_file,e.code)) + except Exception, e: + import traceback + type, value, tb = sys.exc_info() + plugin_stderr.write(''.join( + traceback.format_exception_only(type,value) + + traceback.format_tb(tb))) + finally: + handle.close() + if cwd != os.getcwd(): os.chdir(cwd) + + finally: + # restore system state + sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys + + # log anything sent to stderr + if plugin_stderr.getvalue(): + import planet + planet.logger.error(plugin_stderr.getvalue()) + + # return stdout + return plugin_stdout.getvalue() diff --git a/planet/spider.py b/planet/spider.py index 11fe94a..b18a787 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log): def spiderPlanet(only_if_new = False): """ Spider (fetch) an entire planet """ - # log = planet.getLogger(config.log_level(),config.log_format()) log = planet.getLogger(config.log_level(),config.log_format()) global index diff --git a/runtests.py b/runtests.py index d14058d..7783d14 100755 --- a/runtests.py +++ b/runtests.py @@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'): if sys.path[0]: os.chdir(sys.path[0]) sys.path[0] = os.getcwd() -# find all of the planet test modules -modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py'))) +# determine verbosity +verbosity = 1 +for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)): + if arg in sys.argv: + verbosity = value + sys.argv.remove(arg) -# enable warnings +# find all of the planet test modules +modules = [] +for pattern in sys.argv[1:] or ['test_*.py']: + modules += map(fullmodname, glob.glob(os.path.join('tests', pattern))) + +# enable logging import planet -planet.getLogger("WARNING",None) +if verbosity == 0: planet.getLogger("FATAL",None) +if verbosity == 1: planet.getLogger("WARNING",None) +if verbosity == 2: planet.getLogger("DEBUG",None) # load all of the tests into a suite try: @@ -33,11 +44,5 @@ except Exception, exception: for module in modules: __import__(module) raise -verbosity = 1 -if "-q" in sys.argv or '--quiet' in sys.argv: - verbosity = 0 -if "-v" in sys.argv or '--verbose' in sys.argv: - verbosity = 2 - # run test suite unittest.TextTestRunner(verbosity=verbosity).run(suite) diff --git a/tests/data/apply/config-mememe.ini b/tests/data/apply/config-mememe.ini new file mode 100644 index 0000000..c6ca9bd --- /dev/null +++ b/tests/data/apply/config-mememe.ini @@ -0,0 +1,29 
@@ +[Planet] +output_theme = classic_fancy +output_dir = tests/work/apply +name = test planet +cache_directory = tests/work/spider/cache + +bill_of_materials: + images/#{face} + +[index.html.tmpl] +filters: + html2xhtml.plugin + mememe.plugin + +[mememe.plugin] +sidebar = //*[@class='sidebar'] + +[tests/data/spider/testfeed0.atom] +name = not found + +[tests/data/spider/testfeed1b.atom] +name = one +face = jdub.png + +[tests/data/spider/testfeed2.atom] +name = two + +[tests/data/spider/testfeed3.rss] +name = three diff --git a/tests/test_apply.py b/tests/test_apply.py index ec5a8e5..dafa37a 100644 --- a/tests/test_apply.py +++ b/tests/test_apply.py @@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase): os.makedirs(workdir) def tearDown(self): - shutil.rmtree(workdir) - os.removedirs(os.path.split(workdir)[0]) + shutil.rmtree(os.path.split(workdir)[0]) def test_apply_asf(self): config.load(configfile % 'asf') @@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase): output = open(os.path.join(workdir, 'index.html4')).read() self.assertTrue(output.find('/>')<0) + def test_apply_filter_mememe(self): + config.load(configfile % 'mememe') + self.apply_fancy() + + output = open(os.path.join(workdir, 'index.html')).read() + self.assertTrue(output.find('