MeMeme and html2xhtml plugins

This commit is contained in:
Sam Ruby 2007-04-30 09:38:09 -04:00
parent ddf15fc689
commit a5e1fde287
20 changed files with 878 additions and 119 deletions

View File

@ -1,3 +1,4 @@
*.tmplc
.DS_Store
cache
*.pluginc

View File

@ -8,12 +8,13 @@
<title>Venus Filters</title>
</head>
<body>
<h2>Filters</h2>
<p>Filters are simple Unix pipes. Input comes in <code>stdin</code>,
parameters come from the config file, and output goes to <code>stdout</code>.
Anything written to <code>stderr</code> is logged as an ERROR message. If no
<code>stdout</code> is produced, the entry is not written to the cache or
processed further; in fact, if the entry had previously been written to the cache, it will be removed.</p>
<h2>Filters and Plugins</h2>
<p>Filters and plugins are simple Unix pipes. Input comes in
<code>stdin</code>, parameters come from the config file, and output goes to
<code>stdout</code>. Anything written to <code>stderr</code> is logged as an
ERROR message. If no <code>stdout</code> is produced, the entry is not written
to the cache or processed further; in fact, if the entry had previously been
written to the cache, it will be removed.</p>
<p>There are two types of filters supported by Venus, input and template.</p>
<p>Input to an input filter is a aggressively
@ -89,6 +90,16 @@ an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to
everything.</li>
<li>Plugins differ from filters in that while filters are forked, plugins are
<a href="http://docs.python.org/lib/module-imp.html">imported</a>. This
means that plugins are limited to Python and are run in-process. Plugins
therefore have direct access to planet internals like configuration and
logging facilities, as well as access to the bundled libraries like the
<a href="http://feedparser.org/docs/">Universal Feed Parser</a> and
<a href="http://code.google.com/p/html5lib/">html5lib</a>; but it also
means that functions like <code>os.abort()</code> can't be recovered
from.</li>
</ul>
</body>
</html>
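The pipe contract described in the documentation above can be made concrete with a minimal sketch: a hypothetical pass-through filter (illustrative only, not part of this commit) that reads the normalized entry from stdin and writes it back to stdout unchanged.

import sys
entry = sys.stdin.read()
# a real filter would transform the entry here; emitting nothing on stdout
# tells Venus to drop the entry (and remove it from the cache if present).
sys.stdout.write(entry)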

View File

@ -21,7 +21,7 @@
<ul>
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
<li><a href="filters.html">Filters</a></li>
<li><a href="filters.html">Filters and Plugins</a></li>
</ul>
</li>
<li>Other

View File

@ -36,6 +36,13 @@ filters = excerpt.py
omit = img p br
width = 500
# add memes to output
[index.html.tmpl]
filters = mememe.plugin
[mememe.plugin]
sidebar = //*[@id="footer"]
# subscription list
[http://share.opml.org/opml/top100.opml]
content_type = opml

View File

@ -0,0 +1,6 @@
import sys
from planet import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)
sys.stdout.write(document.toxml("utf-8"))
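The six-line plugin above reparses whatever HTML arrives on stdin with the bundled html5lib and emits well-formed XHTML on stdout. A hedged sketch of exercising it in-process, assuming a Venus checkout as the working directory and using the run() helper that planet/shell/plugin.py introduces later in this commit:

from planet.shell import plugin
broken = '<p>an <b>unterminated paragraph'
# run() redirects stdin/stdout, imports the script as __main__, and returns
# whatever the plugin wrote to stdout
xhtml = plugin.run('filters/html2xhtml.plugin', broken)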

475
filters/mememe.plugin Normal file
View File

@ -0,0 +1,475 @@
#
# This Venus output filter will annotate an XHTML page with a list of
# "memes" (or most popular linked destinations, based on the last week
# of entries from the cache) and will update the subscription list with
# links to recent entries from each subscription.
#
# Templates that don't produce XHTML natively will need their output passed
# through html2xhtml.plugin first.
#
# Typical configuration (based on classic_fancy):
#
# [index.html.tmpl]
# filters:
# html2xhtml.plugin
# mememe.plugin
#
# [mememe.plugin]
# sidebar = @class='sidebar'
#
import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
from planet import config, feedparser
from planet.spider import filename
log = planet.getLogger(config.log_level(),config.log_format())
options = config.filter_options(sys.argv[0])
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
now = time.time()
week = 7 * 86400
week_ago = now - week
cache = config.cache_directory()
meme_cache = os.path.join(cache, 'memes')
if not os.path.exists(meme_cache): os.makedirs(meme_cache)
all_links = {}
feed_links = {}
def check_cache(url):
try:
file = open(filename(meme_cache, url))
headers = eval(file.read())
file.close()
return headers or {}
except:
return {}
def cache_meme(url, headers):
json = []
for key,value in headers.items():
json.append(' %s: %s' % (toj(key), toj(value)))
file = open(filename(meme_cache, url),'w')
file.write('{\n' + ',\n'.join(json) + '\n}\n')
file.close()
urlmap = {}
def canonicalize(url):
url = urlmap.get(url,url)
parts = list(urlparse.urlparse(url))
parts[0] = parts[0].lower()
parts[1] = parts[1].lower()
if parts[1].startswith('www.'): parts[1]=parts[1][4:]
if not parts[2]: parts[2] = '/'
parts[-1] = ''
return urlparse.urlunparse(parts)
log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
# ensure that this is within the past week
if os.path.isdir(name): continue
mtime = os.stat(name).st_mtime
if mtime < week_ago: continue
# parse the file
try:
doc = libxml2.parseFile(name)
except:
continue
xp = doc.xpathNewContext()
xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
# determine the entry
entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
if not entry: continue
entry = canonicalize(entry[0].prop("href"))
# determine the title
title = xp.xpathEval("/atom:entry/atom:title")
if title:
if title[0].prop('type') == 'html':
title = re.sub('<.*?>','',title[0].content)
else:
title = title[0].content
title = str(title or '')
# determine the feed id
feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
if not feed: continue
feed = feed[0].content
# determine the author
author = xp.xpathEval("/atom:entry/atom:source/planet:name")
if author:
author = author[0].content
else:
author = ''
# track the feed_links
if author:
if not feed_links.has_key(author): feed_links[author] = list()
feed_links[author].append([mtime, entry, title])
# identify the unique links
entry_links = []
for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
parent = node.parent
while parent:
if parent.name == 'source': break
parent = parent.parent
else:
link = canonicalize(node.prop('href'))
if not link in entry_links:
entry_links.append(link)
if node.hasProp('title') and node.prop('title').startswith('http'):
link = canonicalize(node.prop('title'))
if not link in entry_links:
entry_links.append(link)
# add the votes
weight = 1.0 - (now - mtime)**2 / week**2
vote = [(weight, str(entry), str(feed), title, author, mtime)]
for link in entry_links:
all_links[link] = all_links.get(link,list()) + vote
# free the entry
doc.freeDoc()
# tally the votes
weighted_links = []
for link, votes in all_links.items():
site = {}
updated = 0
for weight, entry, feed, title, author, mtime in votes:
site[feed] = max(site.get(feed,0), weight)
if mtime > updated: updated=mtime
weighted_links.append((sum(site.values()), link, updated))
weighted_links.sort()
weighted_links.reverse()
cp1252 = {
128: 8364, # euro sign
130: 8218, # single low-9 quotation mark
131: 402, # latin small letter f with hook
132: 8222, # double low-9 quotation mark
133: 8230, # horizontal ellipsis
134: 8224, # dagger
135: 8225, # double dagger
136: 710, # modifier letter circumflex accent
137: 8240, # per mille sign
138: 352, # latin capital letter s with caron
139: 8249, # single left-pointing angle quotation mark
140: 338, # latin capital ligature oe
142: 381, # latin capital letter z with caron
145: 8216, # left single quotation mark
146: 8217, # right single quotation mark
147: 8220, # left double quotation mark
148: 8221, # right double quotation mark
149: 8226, # bullet
150: 8211, # en dash
151: 8212, # em dash
152: 732, # small tilde
153: 8482, # trade mark sign
154: 353, # latin small letter s with caron
155: 8250, # single right-pointing angle quotation mark
156: 339, # latin small ligature oe
158: 382, # latin small letter z with caron
159: 376} # latin capital letter y with diaeresis
# determine the title for a given url
class html(sgmllib.SGMLParser):
def __init__(self, url):
sgmllib.SGMLParser.__init__(self)
self.title = ""
self.feedurl = ""
self.intitle = False
headers = check_cache(url)
try:
# fetch the page
request = urllib2.Request(url)
request.add_header('User-Agent', 'Venus/MeMeme')
if headers.has_key('etag'):
request.add_header('If-None-Match', headers['etag'])
if headers.has_key('last-modified'):
request.add_header('If-Modified-Since', headers['last-modified'])
response = urllib2.urlopen(request)
self.feed(response.read())
# ensure the data is in utf-8
try:
self.title = self.title.decode('utf-8')
except:
self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
for c in self.title.decode('iso-8859-1')])
# cache the results
headers = {}
if self.feedurl: headers['feedurl'] = self.feedurl
if self.title: headers['title'] = self.title
headers.update(response.headers)
cache_meme(url, headers)
except:
self.feedurl = headers.get('feedurl')
if headers.has_key('title'):
if isinstance(headers['title'],str):
self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
else:
self.title=headers['title']
# if there is a feed, look for an entry that matches, and take that title
if self.feedurl and not self.title:
headers = check_cache(self.feedurl)
data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
modified=headers.get('last-modified'))
if data.has_key('headers') and data.has_key('status') and \
data.status in [200, 301, 302]:
titles = {}
for entry in data.entries:
if entry.has_key('title_detail') and entry.has_key('link'):
titles[entry.link] = entry.title_detail.value
if entry.title_detail.type == 'text/plain':
titles[entry.link] = escape(titles[entry.link])
if titles.has_key(url): self.title = titles[url]
data.headers.update(titles)
cache_meme(self.feedurl, data.headers)
else:
if headers.has_key(url):
if isinstance(headers[url],str):
self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
else:
self.title=headers[url]
# fallback is the basename of the URI
if not self.title:
self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
# parse out the first autodiscovery link
def start_link(self, attrs):
if self.feedurl: return
attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
if not 'rel' in attrs: return
rels = attrs['rel'].split(' ')
if 'alternate' not in rels: return
if not 'type' in attrs or not attrs['type'].endswith('xml'): return
if 'href' in attrs:
self.feedurl = attrs['href']
# parse the page title
def start_title(self, attributes):
if not self.title: self.intitle = True
def end_title(self):
self.intitle = False
def handle_data(self, text):
if self.intitle: self.title += escape(text)
# convert unicode string to a json string
def toj(value):
result = repr(value).replace(r'\x',r'\u00')
if result[:1] == 'u': result=result[1:]
if result.startswith("'"):
result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
return result
seenit = []
count = 0
# construct an empty feed
feed_doc = libxml2.newDoc("1.0")
meme_feed = feed_doc.newChild(None, "feed", None)
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
author = meme_feed.newChild(None, 'author', None)
author.newTextChild(None, 'name', config.owner_name())
if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
link = meme_feed.newChild(None, 'link', None)
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
link.setProp('rel', 'self')
meme_feed.newTextChild(None, 'updated',
time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
# parse the input
log.debug("Parse input")
doc=libxml2.parseDoc(sys.stdin.read())
# find the sidebar/footer
sidebar = options.get('sidebar','//*[@class="sidebar"]')
footer = doc.xpathEval(sidebar)
if not hasattr(footer,'__len__') or len(footer) == 0:
raise Exception(sidebar + ' not found')
if len(footer) > 1:
log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar))
footer = footer[0]
# add up to 10 entry links to each subscription
subs_ul = footer.children
while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
child = subs_ul.children
while child:
if child.name == 'li':
if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
link = child.lastChild()
while link.isText(): link=link.prev
author = link.getContent()
state = 'inactive'
if feed_links.has_key(author):
ul2 = child.newChild(None, 'ul', None)
feed_links[author].sort()
feed_links[author].reverse()
link_count = 0
for mtime, entry, title in feed_links[author]:
if not title: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a', title)
a.setProp('href', entry)
link_count = link_count + 1
if link_count >= 10: break
if link_count > 0: state = None
if state:
link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
child=child.next
# create a h2 and ul for the memes list
footer_top = footer.children
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
# create a header for the memes list
a = memes.newChild(None, 'a', None)
a.setProp('href', 'memes.atom')
img = a.newChild(None, 'img', None)
img.setProp('src', 'images/feed-icon-10x10.png')
# collect the results
log.debug("Fetch titles and collect the results")
from urllib import quote_plus
for i in range(0,len(weighted_links)):
weight, link, updated = weighted_links[i]
# ensure that somebody new points to this entry. This guards against
# groups of related links which several posts all point to.
novel = False
for weight, entry, feed, title, author, mtime in all_links[link]:
if entry not in seenit:
seenit.append(entry)
novel = True
if not novel: continue
all_links[link].sort()
all_links[link].reverse()
cache_file = filename(cache, link)
title = None
# when possible, take the title from the cache
if os.path.exists(cache_file):
entry = feedparser.parse(cache_file).entries[0]
if entry.has_key('title_detail'):
title = entry.title_detail.value
if entry.title_detail.type == 'text/plain': title = escape(title)
# otherwise, parse the html
if not title:
title = html(link).title
# dehtmlize
title = re.sub('&(\w+);',
lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
# title too long? Insert zero width spaces where appropriate
if max(map(len,title.split())) > 30:
title=re.sub('(\W+)',u'\\1\u200b',title)
# save the entry title (it is used later)
entry_title = title.strip()
# add to the memes list
memes_ul.addContent('\n')
li = memes_ul.newChild(None, 'li', None)
memes_ul.addContent('\n')
# technorati link
a = li.newChild(None, 'a', None)
tlink = 'http://technorati.com/cosmos/search.html?url='
if link.startswith('http://'):
a.setProp('href',tlink + quote_plus(link[7:]))
else:
a.setProp('href',tlink + quote_plus(link))
a.setProp('title','cosmos')
img = a.newChild(None, 'img', None)
img.setProp('src','tcosm11.gif')
# main link
a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
a.setProp('href',link)
if (((i==0) or (updated>=weighted_links[i-1][2])) and
(i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
rank = 0
for j in range(0,len(weighted_links)):
if updated < weighted_links[j][2]: rank = rank + 1
if rank < len(weighted_links)/2:
a.setProp('class','rising')
# voters
ul2 = li.newChild(None, 'ul', None)
voters = []
for weight, entry, feed, title, author, mtime in all_links[link]:
if entry in voters: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a' , author)
a.setProp('href',entry)
if title: a.setProp('title',title)
voters.append(entry)
# add to the meme feed
if len(all_links[link]) > 2:
meme_feed.addContent('\n')
entry = meme_feed.newChild(None, 'entry', None)
meme_feed.addContent('\n')
# entry
tagbase = config.link().split('/')
if not tagbase[-1]: tagbase = tagbase[:-1]
tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
meme_link = entry.newTextChild(None, 'link', None)
meme_link.setProp('href', link)
entry.newTextChild(None, 'updated',
time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))
# voters
content = entry.newChild(None, 'content', None)
content.setProp('type', 'xhtml')
div = content.newTextChild(None, 'div', 'Spotted by:')
div.newNs('http://www.w3.org/1999/xhtml', None)
content_ul = div.newChild(None, 'ul', None)
for weight, entry, feed, title, author, mtime in all_links[link]:
li2 = content_ul.newTextChild(None, 'li', author + ": ")
a = li2.newTextChild(None, 'a' , title or 'untitled')
a.setProp('href',entry)
count = count + 1
if count >= 10: break
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
output.close()
sys.stdout.write(doc.serialize('utf-8'))

View File

@ -352,14 +352,15 @@ def filters(section=None):
filters = []
if parser.has_option('Planet', 'filters'):
filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(exclude(section)))
for section in section and [section] or template_files():
if parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
return filters
def planet_options():
@ -382,6 +383,10 @@ def template_options(section):
""" dictionary of template specific options"""
return feed_options(section)
def filter_options(section):
""" dictionary of filter specific options"""
return feed_options(section)
def write(file=sys.stdout):
""" write out an updated template """
print parser.write(file)

View File

@ -71,35 +71,40 @@ class HTMLParser(object):
"trailingEnd": TrailingEndPhase(self, self.tree)
}
def parse(self, stream, encoding=None, innerHTML=False):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
is not yet supported)
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
def _parse(self, stream, innerHTML=False, container="div",
encoding=None):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.phase = self.phases["initial"]
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
parseMeta=innerHTML)
if innerHTML:
self.innerHTML = container.lower()
if self.innerHTML in ('title', 'textarea'):
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
else:
# contentModelFlag already is PCDATA
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
pass
self.phase = self.phases["rootElement"]
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
self.innerHTML = False
self.phase = self.phases["initial"]
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
self.lastPhase = None
# We don't actually support innerHTML yet but this should allow
# assertions
self.innerHTML = innerHTML
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
for token in self.tokenizer:
@ -118,7 +123,34 @@ class HTMLParser(object):
# When the loop finishes it's EOF
self.phase.processEOF()
def parse(self, stream, encoding=None):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
self._parse(stream, innerHTML=False, encoding=encoding)
return self.tree.getDocument()
def parseFragment(self, stream, container="div", encoding=None):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
if set to None, default to 'div'
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
self._parse(stream, True, container=container, encoding=encoding)
return self.tree.getFragment()
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
@ -187,28 +219,29 @@ class HTMLParser(object):
"frameset":"inFrameset"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
if node == self.tree.openElements[0]:
last = True
if node.name not in ['td', 'th']:
if nodeName not in ['td', 'th']:
# XXX
assert self.innerHTML
raise NotImplementedError
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if node.name in ("select", "colgroup", "head", "frameset"):
if nodeName in ("select", "colgroup", "head", "frameset"):
# XXX
assert self.innerHTML
if node.name in newModes:
self.phase = self.phases[newModes[node.name]]
if nodeName in newModes:
self.phase = self.phases[newModes[nodeName]]
break
elif node.name == "html":
elif nodeName == "html":
if self.tree.headPointer is None:
self.phase = self.phases["beforeHead"]
else:
self.phase = self.phases["afterHead"]
break
elif last:
self.phase = self.phases["body"]
self.phase = self.phases["inBody"]
break
class Phase(object):
@ -434,9 +467,7 @@ class InHeadPhase(Phase):
self.parser.phase.processCharacters(data)
def startTagHead(self, name, attributes):
self.tree.insertElement(name, attributes)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))
def startTagTitle(self, name, attributes):
element = self.tree.createElement(name, attributes)
@ -455,10 +486,11 @@ class InHeadPhase(Phase):
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagScript(self, name, attributes):
#XXX Inner HTML case may be wrong
element = self.tree.createElement(name, attributes)
element._flags.append("parser-inserted")
if self.tree.headPointer is not None and\
self.parser.phase == self.parser.phases["inHead"]:
if (self.tree.headPointer is not None and
self.parser.phase == self.parser.phases["inHead"]):
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
@ -653,8 +685,8 @@ class InBodyPhase(Phase):
def startTagBody(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag (body)."))
if len(self.tree.openElements) == 1 \
or self.tree.openElements[1].name != "body":
if (len(self.tree.openElements) == 1
or self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
for attr, value in attributes.iteritems():
@ -1179,6 +1211,7 @@ class InTablePhase(Phase):
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, name):
@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase):
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", True)
def processCharacters(self, data):
self.parser.phases["inBody"].processCharacters(data)
def startTagTableElement(self, name, attributes):
self.parser.parseError()
#XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
# XXX how do we know the tag is _always_ ignored in the innerHTML
# case and therefore shouldn't be processed again? I'm not sure this
# strategy makes sense...
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inBody"].processStartTag(name, attributes)
def endTagCaption(self, name):
if self.tree.elementInScope(name, True):
if not self.ignoreEndTagCaption():
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "caption":
@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase):
self.parser.phase = self.parser.phases["inTable"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
self.parser.parseError()
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
# XXX ...
if not self.parser.innerHTML:
self.parser.phase.processStartTag(name, attributes)
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase):
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html"
def processCharacters(self, data):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processCharacters(data)
def startTagCol(self, name ,attributes):
@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase):
self.tree.openElements.pop()
def startTagOther(self, name, attributes):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX how can be sure it's always ignored?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def endTagColgroup(self, name):
if self.tree.openElements[-1].name == "html":
if self.ignoreEndTagColgroup():
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
else:
self.tree.openElements.pop()
@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase):
u"col has no end tag."))
def endTagOther(self, name):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX how can be sure it's always ignored?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase):
def startTagTableOther(self, name, attributes):
# XXX AT Any ideas on how to share this with endTagTable?
if self.tree.elementInScope("tbody", True) or \
self.tree.elementInScope("thead", True) or \
self.tree.elementInScope("tfoot", True):
if (self.tree.elementInScope("tbody", True) or
self.tree.elementInScope("thead", True) or
self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processStartTag(name, attributes)
@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase):
") in the table body phase. Ignored."))
def endTagTable(self, name):
if self.tree.elementInScope("tbody", True) or \
self.tree.elementInScope("thead", True) or \
self.tree.elementInScope("tfoot", True):
if (self.tree.elementInScope("tbody", True) or
self.tree.elementInScope("thead", True) or
self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processEndTag(name)
@ -1428,6 +1468,9 @@ class InRowPhase(Phase):
self.tree.openElements[-1].name + u") in the row phase."))
self.tree.openElements.pop()
def ignoreEndTagTr(self):
return not self.tree.elementInScope("tr", tableVariant=True)
# the rest
def processCharacters(self, data):
self.parser.phases["inTable"].processCharacters(data)
@ -1439,28 +1482,31 @@ class InRowPhase(Phase):
self.tree.activeFormattingElements.append(Marker)
def startTagTableOther(self, name, attributes):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# XXX how are we sure it's always ignored in the innerHTML case?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inTable"].processStartTag(name, attributes)
def endTagTr(self, name):
if self.tree.elementInScope("tr", True):
if not self.ignoreEndTagTr():
self.clearStackToTableRowContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTableBody"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
def endTagTableRowGroup(self, name):
@ -1628,7 +1674,7 @@ class InSelectPhase(Phase):
u"select phase. Ignored."))
def endTagSelect(self, name):
if self.tree.elementInScope(name, True):
if self.tree.elementInScope("select", True):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
@ -1641,7 +1687,7 @@ class InSelectPhase(Phase):
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
") in the select phase."))
if self.tree.elementInScope(name, True):
self.endTagSelect()
self.endTagSelect("select")
self.parser.phase.processEndTag(name)
def endTagOther(self, name):
@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase):
u"in the frameset phase (innerHTML)."))
else:
self.tree.openElements.pop()
if not self.parser.innerHTML and\
self.tree.openElements[-1].name != "frameset":
if (not self.parser.innerHTML and
self.tree.openElements[-1].name != "frameset"):
# If we're not in innerHTML mode and the current node is not a
# "frameset" element (anymore) then switch.
self.parser.phase = self.parser.phases["afterFrameset"]

View File

@ -14,7 +14,7 @@ class HTMLInputStream(object):
"""
def __init__(self, source, encoding=None, chardet=True):
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -26,6 +26,8 @@ class HTMLInputStream(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# List of where new lines occur
@ -41,12 +43,9 @@ class HTMLInputStream(object):
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
#Autodetect encoding if no other information can be found?
self.chardet = chardet
#Detect encoding iff no explicit "transport level" encoding is supplied
if encoding is None or not isValidEncoding(encoding):
encoding = self.detectEncoding()
encoding = self.detectEncoding(parseMeta, chardet)
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
@ -79,17 +78,17 @@ class HTMLInputStream(object):
stream = cStringIO.StringIO(str(source))
return stream
def detectEncoding(self):
def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None:
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
#Guess with chardet, if available
if encoding is None and self.chardet:
if encoding is None and chardet:
try:
import chardet
buffer = self.rawStream.read()

View File

@ -32,8 +32,8 @@ class HTMLTokenizer(object):
# XXX need to fix documentation
def __init__(self, stream, encoding=None):
self.stream = HTMLInputStream(stream, encoding)
def __init__(self, stream, encoding=None, parseMeta=True):
self.stream = HTMLInputStream(stream, encoding, parseMeta)
self.states = {
"data":self.dataState,
@ -338,31 +338,33 @@ class HTMLTokenizer(object):
self.state = self.states["closeTagOpen"]
else:
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data)
self.stream.queue.insert(0, data)
self.state = self.states["data"]
return True
def closeTagOpenState(self):
if self.contentModelFlag in (contentModelFlags["RCDATA"],\
contentModelFlags["CDATA"]):
charStack = []
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
contentModelFlags["CDATA"])):
if self.currentToken:
charStack = []
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken. We also need
# to have the character directly after the characters that could
# match the start tag name.
for x in xrange(len(self.currentToken["name"]) + 1):
charStack.append(self.stream.char())
# Make sure we don't get hit by EOF
if charStack[-1] == EOF:
break
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken. We also need
# to have the character directly after the characters that could
# match the start tag name.
for x in xrange(len(self.currentToken["name"]) + 1):
charStack.append(self.stream.char())
# Make sure we don't get hit by EOF
if charStack[-1] == EOF:
break
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
if self.currentToken \
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
and charStack[-1] in (spaceCharacters |
frozenset((u">", u"/", u"<", EOF))):
# Because the characters are correct we can safely switch to

View File

@ -108,6 +108,9 @@ class TreeBuilder(object):
#The class to use for creating doctypes
doctypeClass = None
#Fragment class
fragmentClass = None
def __init__(self):
self.reset()
@ -294,7 +297,6 @@ class TreeBuilder(object):
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
assert self.innerHTML
fosterParent = self.openElements[0]
return fosterParent, insertBefore
@ -310,6 +312,13 @@ class TreeBuilder(object):
def getDocument(self):
"Return the final tree"
return self.document
def getFragment(self):
"Return the final fragment"
#assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests

View File

@ -1,6 +1,8 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
from xml.sax.saxutils import escape
from constants import voidElements
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder):
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder):
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
@ -118,7 +126,9 @@ def testSerializer(element):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
@ -135,6 +145,32 @@ def testSerializer(element):
return "\n".join(rv)
class HTMLSerializer(object):
def serialize(self, node):
rv = self.serializeNode(node)
for child in node.childNodes:
rv += self.serialize(child)
if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
rv += "</%s>\n"%node.nodeName
return rv
def serializeNode(self, node):
if node.nodeType == Node.TEXT_NODE:
rv = node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
rv = "<%s"%node.nodeName
if node.hasAttributes():
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
node.attributes.items()])
rv += ">"
elif node.nodeType == Node.COMMENT_NODE:
rv = "<!-- %s -->" % escape(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
rv = "<!DOCTYPE %s>" % node.name
else:
rv = ""
return rv
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE

View File

@ -129,6 +129,10 @@ class Document(Element):
def __init__(self):
Element.__init__(self, Document)
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, DocumentFragment)
def testSerializer(element):
rv = []
finalText = None
@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder):
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element

View File

@ -4,6 +4,7 @@ from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
type = -1
def __init__(self, name):
self.name = name
self.parent = None
@ -11,15 +12,18 @@ class Node(_base.Node):
self.childNodes = []
self._flags = []
def __iter__(self):
for node in self.childNodes:
yield node
for item in node:
yield item
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
@ -69,6 +73,7 @@ class Node(_base.Node):
return bool(self.childNodes)
class Document(Node):
type = 1
def __init__(self):
Node.__init__(self, None)
@ -93,7 +98,13 @@ class Document(Node):
tree += child.printTree(2)
return tree
class DocumentFragment(Document):
type = 2
def __unicode__(self):
return "#document-fragment"
class DocumentType(Node):
type = 3
def __init__(self, name):
Node.__init__(self, name)
@ -106,6 +117,7 @@ class DocumentType(Node):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
class TextNode(Node):
type = 4
def __init__(self, value):
Node.__init__(self, None)
self.value = value
@ -119,6 +131,7 @@ class TextNode(Node):
hilite = toxml
class Element(Node):
type = 5
def __init__(self, name):
Node.__init__(self, name)
self.attributes = {}
@ -164,6 +177,7 @@ class Element(Node):
return tree
class CommentNode(Node):
type = 6
def __init__(self, data):
Node.__init__(self, None)
self.data = data
@ -177,11 +191,38 @@ class CommentNode(Node):
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
class HTMLSerializer(object):
def serialize(self, node):
rv = self.serializeNode(node)
for child in node.childNodes:
rv += self.serialize(child)
if node.type == Element.type and node.name not in voidElements:
rv += "</%s>\n"%node.name
return rv
def serializeNode(self, node):
if node.type == TextNode.type:
rv = node.value
elif node.type == Element.type:
rv = "<%s"%node.name
if node.attributes:
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
node.attributes.iteritems()])
rv += ">"
elif node.type == CommentNode.type:
rv = "<!-- %s -->" % escape(node.data)
elif node.type == DocumentType.type:
rv = "<!DOCTYPE %s>" % node.name
else:
rv = ""
return rv
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = CommentNode
fragmentClass = DocumentFragment
def testSerializer(self, node):
return node.printTree()

View File

@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'):
# Execute the shell module
options = planet.config.template_options(template_file)
if module_name == 'plugin': options['__file__'] = template_file
options.update(extra_options)
log.debug("Processing %s %s using %s", mode,
os.path.realpath(template_resolved), module_name)

64
planet/shell/plugin.py Normal file
View File

@ -0,0 +1,64 @@
import os, sys, imp
import planet
from StringIO import StringIO
def run(script, doc, output_file=None, options={}):
""" process an Python script using imp """
save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv)
plugin_stdout = StringIO()
plugin_stderr = StringIO()
try:
# redirect stdin
sys.stdin = StringIO(doc)
# redirect stdout
if output_file:
sys.stdout = open(output_file, 'w')
else:
sys.stdout = plugin_stdout
# redirect stderr
sys.stderr = plugin_stderr
# determine __file__ value
if options.has_key("__file__"):
plugin_file = options["__file__"]
del options["__file__"]
else:
plugin_file = script
# set sys.argv
options = sum([['--'+key, value] for key,value in options.items()], [])
sys.argv = [plugin_file] + options
# import script
handle = open(script, 'r')
cwd = os.getcwd()
try:
try:
try:
description=('.plugin', 'rb', imp.PY_SOURCE)
imp.load_module('__main__',handle,plugin_file,description)
except SystemExit,e:
if e.code: planet.logger.error('%s exit rc=%d', plugin_file, e.code)
except Exception, e:
import traceback
type, value, tb = sys.exc_info()
plugin_stderr.write(''.join(
traceback.format_exception_only(type,value) +
traceback.format_tb(tb)))
finally:
handle.close()
if cwd != os.getcwd(): os.chdir(cwd)
finally:
# restore system state
sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
# log anything sent to stderr
if plugin_stderr.getvalue():
import planet
planet.logger.error(plugin_stderr.getvalue())
# return stdout
return plugin_stdout.getvalue()
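Because run() flattens the filter's configuration section into '--key value' pairs on sys.argv before importing the script, a plugin could also read its parameters straight off the command line instead of going through config.filter_options(). A minimal, hypothetical sketch of that pattern (not code from this commit):

import sys
# e.g. sys.argv == ['mememe.plugin', '--sidebar', "//*[@id='footer']"]
args = sys.argv[1:]
params = dict(zip(args[0::2], args[1::2]))
sidebar = params.get('--sidebar', '//*[@class="sidebar"]')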

View File

@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log):
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
# log = planet.getLogger(config.log_level(),config.log_format())
log = planet.getLogger(config.log_level(),config.log_format())
global index

View File

@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'):
if sys.path[0]: os.chdir(sys.path[0])
sys.path[0] = os.getcwd()
# find all of the planet test modules
modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
# determine verbosity
verbosity = 1
for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
if arg in sys.argv:
verbosity = value
sys.argv.remove(arg)
# enable warnings
# find all of the planet test modules
modules = []
for pattern in sys.argv[1:] or ['test_*.py']:
modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))
# enable logging
import planet
planet.getLogger("WARNING",None)
if verbosity == 0: planet.getLogger("FATAL",None)
if verbosity == 1: planet.getLogger("WARNING",None)
if verbosity == 2: planet.getLogger("DEBUG",None)
# load all of the tests into a suite
try:
@ -33,11 +44,5 @@ except Exception, exception:
for module in modules: __import__(module)
raise
verbosity = 1
if "-q" in sys.argv or '--quiet' in sys.argv:
verbosity = 0
if "-v" in sys.argv or '--verbose' in sys.argv:
verbosity = 2
# run test suite
unittest.TextTestRunner(verbosity=verbosity).run(suite)

View File

@ -0,0 +1,29 @@
[Planet]
output_theme = classic_fancy
output_dir = tests/work/apply
name = test planet
cache_directory = tests/work/spider/cache
bill_of_materials:
images/#{face}
[index.html.tmpl]
filters:
html2xhtml.plugin
mememe.plugin
[mememe.plugin]
sidebar = //*[@class='sidebar']
[tests/data/spider/testfeed0.atom]
name = not found
[tests/data/spider/testfeed1b.atom]
name = one
face = jdub.png
[tests/data/spider/testfeed2.atom]
name = two
[tests/data/spider/testfeed3.rss]
name = three

View File

@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase):
os.makedirs(workdir)
def tearDown(self):
shutil.rmtree(workdir)
os.removedirs(os.path.split(workdir)[0])
shutil.rmtree(os.path.split(workdir)[0])
def test_apply_asf(self):
config.load(configfile % 'asf')
@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase):
output = open(os.path.join(workdir, 'index.html4')).read()
self.assertTrue(output.find('/>')<0)
def test_apply_filter_mememe(self):
config.load(configfile % 'mememe')
self.apply_fancy()
output = open(os.path.join(workdir, 'index.html')).read()
self.assertTrue(output.find('<div class="sidebar"><h2>Memes <a href="memes.atom">')>=0)
def apply_fancy(self):
# drop slow templates unrelated to test at hand
templates = config.parser.get('Planet','template_files').split()
templates.remove('rss10.xml.tmpl')
templates.remove('rss20.xml.tmpl')
config.parser.set('Planet','template_files',' '.join(templates))
splice.apply(self.feeddata)
# verify that selected files are there