From a5e1fde287e83779aca83a9f4511fdd5d43dbbd1 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 30 Apr 2007 09:38:09 -0400 Subject: [PATCH] MeMeme and html2xhtml plugins --- .bzrignore | 1 + docs/filters.html | 23 +- docs/index.html | 2 +- examples/opml-top100.ini | 7 + filters/html2xhtml.plugin | 6 + filters/mememe.plugin | 475 +++++++++++++++++++++ planet/config.py | 9 +- planet/html5lib/html5parser.py | 172 +++++--- planet/html5lib/inputstream.py | 15 +- planet/html5lib/tokenizer.py | 42 +- planet/html5lib/treebuilders/_base.py | 11 +- planet/html5lib/treebuilders/dom.py | 43 +- planet/html5lib/treebuilders/etreefull.py | 8 + planet/html5lib/treebuilders/simpletree.py | 47 +- planet/shell/__init__.py | 1 + planet/shell/plugin.py | 64 +++ planet/spider.py | 1 - runtests.py | 25 +- tests/data/apply/config-mememe.ini | 29 ++ tests/test_apply.py | 16 +- 20 files changed, 878 insertions(+), 119 deletions(-) create mode 100644 filters/html2xhtml.plugin create mode 100644 filters/mememe.plugin create mode 100644 planet/shell/plugin.py create mode 100644 tests/data/apply/config-mememe.ini diff --git a/.bzrignore b/.bzrignore index 1d1886c..a8f0629 100644 --- a/.bzrignore +++ b/.bzrignore @@ -1,3 +1,4 @@ *.tmplc .DS_Store cache +*.pluginc diff --git a/docs/filters.html b/docs/filters.html index 58eb6fe..228f323 100644 --- a/docs/filters.html +++ b/docs/filters.html @@ -8,12 +8,13 @@ Venus Filters -

Filters

-

Filters are simple Unix pipes. Input comes in stdin, -parameters come from the config file, and output goes to stdout. -Anything written to stderr is logged as an ERROR message. If no -stdout is produced, the entry is not written to the cache or -processed further; in fact, if the entry had previously been written to the cache, it will be removed.

+

Filters and Plugins

+

Filters and plugins are simple Unix pipes. Input comes in +stdin, parameters come from the config file, and output goes to +stdout. Anything written to stderr is logged as an +ERROR message. If no stdout is produced, the entry is not written +to the cache or processed further; in fact, if the entry had previously been +written to the cache, it will be removed.
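For illustration only (not part of this patch), a complete filter can be as
small as the following; the file name strip-enclosures.py and the behaviour it
implements are hypothetical:

  import sys

  # the entry arrives on stdin as a normalized Atom document
  data = sys.stdin.read()

  # suppress entries that carry enclosures; producing no stdout
  # causes the entry to be dropped (and purged from the cache)
  if 'rel="enclosure"' not in data:
      sys.stdout.write(data)

Anything such a script wrote to stderr would show up in the planet log as an
ERROR message.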

There are two types of filters supported by Venus: input and template.

Input to an input filter is an aggressively @@ -89,6 +90,16 @@ an HTML output stream from one source.

  • Templates written using htmltmpl or django currently only have access to a fixed set of fields, whereas XSLT and genshi templates have access to everything.
  • + +
  • Plugins differ from filters in that while filters are forked, plugins are +imported. This means that plugins are limited to Python and are run in-process. Plugins therefore have direct access to planet internals like configuration and logging facilities, as well as to the bundled libraries like the Universal Feed Parser and html5lib; but it also means that calls like os.abort() can't be recovered from. A minimal plugin is sketched below.
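As a sketch of the in-process model (the file name stamp.plugin and the
footnote option are invented for this example; the config and logging calls
are the same ones mememe.plugin uses):

  import sys
  import planet
  from planet import config

  # plugins run inside the planet process, so they can use its
  # configuration and logging facilities directly
  log = planet.getLogger(config.log_level(), config.log_format())
  options = config.filter_options(sys.argv[0])   # the [stamp.plugin] section

  data = sys.stdin.read()
  log.debug("annotating %d bytes of output", len(data))
  note = '<p>%s</p></body>' % options.get('footnote', 'Generated by Venus')
  sys.stdout.write(data.replace('</body>', note, 1))

As with filters, whatever the plugin writes to stdout becomes the output; the
difference is that an uncaught exception is trapped by the plugin runner and
logged, rather than terminating a separate process.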
  • diff --git a/docs/index.html b/docs/index.html index ebdd234..c461d7f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -21,7 +21,7 @@
  • Other diff --git a/examples/opml-top100.ini b/examples/opml-top100.ini index 0522472..01b210d 100644 --- a/examples/opml-top100.ini +++ b/examples/opml-top100.ini @@ -36,6 +36,13 @@ filters = excerpt.py omit = img p br width = 500 +# add memes to output +[index.html.tmpl] +filters = mememe.plugin + +[mememe.plugin] +sidebar = //*[@id="footer"] + # subscription list [http://share.opml.org/opml/top100.opml] content_type = opml diff --git a/filters/html2xhtml.plugin b/filters/html2xhtml.plugin new file mode 100644 index 0000000..456df48 --- /dev/null +++ b/filters/html2xhtml.plugin @@ -0,0 +1,6 @@ +import sys +from planet import html5lib +tree=html5lib.treebuilders.dom.TreeBuilder +parser = html5lib.html5parser.HTMLParser(tree=tree) +document = parser.parse(sys.stdin) +sys.stdout.write(document.toxml("utf-8")) diff --git a/filters/mememe.plugin b/filters/mememe.plugin new file mode 100644 index 0000000..2ce3b30 --- /dev/null +++ b/filters/mememe.plugin @@ -0,0 +1,475 @@ +# +# This Venus output filter will annotate an XHTML page with a list of +# "memes" (or most popular linked destinations, based on the last week +# of entries from the cache) and will update the subscription list with +# links to recent entries from each subscription. +# +# Templates that don't produce XHTML natively will need their output passed +# through html2xhtml.plugin first. +# +# Typical configuration (based on classic_fancy): +# +# [index.html.tmpl] +# filters: +# html2xhtml.plugin +# mememe.plugin +# +# [mememe.plugin] +# sidebar = @class='sidebar' +# + +import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5 +from xml.sax.saxutils import escape +from htmlentitydefs import entitydefs + +import planet +from planet import config, feedparser +from planet.spider import filename +log = planet.getLogger(config.log_level(),config.log_format()) +options = config.filter_options(sys.argv[0]) + +MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom') + +now = time.time() +week = 7 * 86400 +week_ago = now - week + +cache = config.cache_directory() +meme_cache = os.path.join(cache, 'memes') +if not os.path.exists(meme_cache): os.makedirs(meme_cache) + +all_links = {} +feed_links = {} + +def check_cache(url): + try: + file = open(filename(meme_cache, url)) + headers = eval(file.read()) + file.close() + return headers or {} + except: + return {} + +def cache_meme(url, headers): + json = [] + for key,value in headers.items(): + json.append(' %s: %s' % (toj(key), toj(value))) + file = open(filename(meme_cache, url),'w') + file.write('{\n' + ',\n'.join(json) + '\n}\n') + file.close() + +urlmap = {} +def canonicalize(url): + url = urlmap.get(url,url) + parts = list(urlparse.urlparse(url)) + + parts[0] = parts[0].lower() + parts[1] = parts[1].lower() + if parts[1].startswith('www.'): parts[1]=parts[1][4:] + if not parts[2]: parts[2] = '/' + parts[-1] = '' + return urlparse.urlunparse(parts) + +log.debug("Loading cached data") +for name in glob.glob(os.path.join(cache, '*')): + # ensure that this is within the past week + if os.path.isdir(name): continue + mtime = os.stat(name).st_mtime + if mtime < week_ago: continue + + # parse the file + try: + doc = libxml2.parseFile(name) + except: + continue + xp = doc.xpathNewContext() + xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom") + xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/") + + # determine the entry + entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']") + if not entry: continue + entry = canonicalize(entry[0].prop("href")) + 
+ # determine the title + title = xp.xpathEval("/atom:entry/atom:title") + if title: + if title[0].prop('type') == 'html': + title = re.sub('<.*?>','',title[0].content) + else: + title = title[0].content + title = str(title or '') + + # determine the feed id + feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup") + if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id") + if not feed: continue + feed = feed[0].content + + # determine the author + author = xp.xpathEval("/atom:entry/atom:source/planet:name") + if author: + author = author[0].content + else: + author = '' + + # track the feed_links + if author: + if not feed_links.has_key(author): feed_links[author] = list() + feed_links[author].append([mtime, entry, title]) + + # identify the unique links + entry_links = [] + for node in doc.xpathEval("//*[@href and not(@rel='source')]"): + parent = node.parent + while parent: + if parent.name == 'source': break + parent = parent.parent + else: + link = canonicalize(node.prop('href')) + if not link in entry_links: + entry_links.append(link) + if node.hasProp('title') and node.prop('title').startswith('http'): + link = canonicalize(node.prop('title')) + if not link in entry_links: + entry_links.append(link) + + # add the votes + weight = 1.0 - (now - mtime)**2 / week**2 + vote = [(weight, str(entry), str(feed), title, author, mtime)] + for link in entry_links: + all_links[link] = all_links.get(link,list()) + vote + + # free the entry + doc.freeDoc() + +# tally the votes +weighted_links = [] +for link, votes in all_links.items(): + site = {} + updated = 0 + for weight, entry, feed, title, author, mtime in votes: + site[feed] = max(site.get(feed,0), weight) + if mtime > updated: updated=mtime + weighted_links.append((sum(site.values()), link, updated)) +weighted_links.sort() +weighted_links.reverse() + +cp1252 = { + 128: 8364, # euro sign + 130: 8218, # single low-9 quotation mark + 131: 402, # latin small letter f with hook + 132: 8222, # double low-9 quotation mark + 133: 8230, # horizontal ellipsis + 134: 8224, # dagger + 135: 8225, # double dagger + 136: 710, # modifier letter circumflex accent + 137: 8240, # per mille sign + 138: 352, # latin capital letter s with caron + 139: 8249, # single left-pointing angle quotation mark + 140: 338, # latin capital ligature oe + 142: 381, # latin capital letter z with caron + 145: 8216, # left single quotation mark + 146: 8217, # right single quotation mark + 147: 8220, # left double quotation mark + 148: 8221, # right double quotation mark + 149: 8226, # bullet + 150: 8211, # en dash + 151: 8212, # em dash + 152: 732, # small tilde + 153: 8482, # trade mark sign + 154: 353, # latin small letter s with caron + 155: 8250, # single right-pointing angle quotation mark + 156: 339, # latin small ligature oe + 158: 382, # latin small letter z with caron + 159: 376} # latin capital letter y with diaeresis + +# determine the title for a given url +class html(sgmllib.SGMLParser): + def __init__(self, url): + sgmllib.SGMLParser.__init__(self) + self.title = "" + self.feedurl = "" + self.intitle = False + + headers = check_cache(url) + + try: + # fetch the page + request = urllib2.Request(url) + request.add_header('User-Agent', 'Venus/MeMeme') + if headers.has_key('etag'): + request.add_header('If-None-Match', headers['etag']) + if headers.has_key('last_modified'): + request.add_header('If-Modified-Since', headers['last-modified']) + response = urllib2.urlopen(request) + self.feed(response.read()) + + # ensure the data is in utf-8 + try: + 
self.title = self.title.decode('utf-8') + except: + self.title = ''.join([unichr(cp1252.get(ord(c),ord(c))) + for c in self.title.decode('iso-8859-1')]) + + # cache the results + headers = {} + if self.feedurl: headers['feedurl'] = self.feedurl + if self.title: headers['title'] = self.title + headers.update(response.headers) + cache_meme(url, headers) + except: + self.feedurl = headers.get('feedurl') + if headers.has_key('title'): + if isinstance(headers['title'],str): + self.title=eval('u'+repr(headers['title']).replace('\\\\','\\')) + else: + self.title=headers['title'] + + # if there is a feed, look for an entry that matches, and take that title + if self.feedurl and not self.title: + headers = check_cache(self.feedurl) + data = feedparser.parse(self.feedurl, etag=headers.get('etag'), + modified=headers.get('last-modified')) + + if data.has_key('headers') and data.has_key('status') and \ + data.status in [200, 301, 302]: + + titles = {} + for entry in data.entries: + if entry.has_key('title_detail') and entry.has_key('link'): + titles[entry.link] = entry.title_detail.value + if entry.title_detail.type == 'text/plain': + titles[entry.link] = escape(titles[entry.link]) + + if titles.has_key(url): self.title = titles[url] + + data.headers.update(titles) + cache_meme(self.feedurl, data.headers) + else: + if headers.has_key(url): + if isinstance(headers[url],str): + self.title=eval('u'+repr(headers[url]).replace('\\\\','\\')) + else: + self.title=headers[url] + + # fallback is the basename of the URI + if not self.title: + self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0]) + + # parse out the first autodiscovery link + def start_link(self, attrs): + if self.feedurl: return + attrs = dict(map(lambda (k,v): (k.lower(),v), attrs)) + if not 'rel' in attrs: return + rels = attrs['rel'].split(' ') + if 'alternate' not in rels: return + if not 'type' in attrs or not attrs['type'].endswith('xml'): return + if 'href' in attrs: + self.feedurl = attrs['href'] + + # parse the page title + def start_title(self, attributes): + if not self.title: self.intitle = True + def end_title(self): + self.intitle = False + def handle_data(self, text): + if self.intitle: self.title += escape(text) + +# convert unicode string to a json string +def toj(value): + result = repr(value).replace(r'\x',r'\u00') + if result[:1] == 'u': result=result[1:] + if result.startswith("'"): + result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1] + return result + +seenit = [] +count = 0 + +# construct an empty feed +feed_doc = libxml2.newDoc("1.0") +meme_feed = feed_doc.newChild(None, "feed", None) +meme_feed.newNs('http://www.w3.org/2005/Atom', None) +meme_feed.newTextChild(None, 'title', config.name() + ': Memes') +author = meme_feed.newChild(None, 'author', None) +author.newTextChild(None, 'name', config.owner_name()) +if config.owner_email: author.newTextChild(None, 'email', config.owner_email()) +meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom')) +link = meme_feed.newChild(None, 'link', None) +link.setProp('href', os.path.join(config.link(), 'memes.atom')) +link.setProp('rel', 'self') +meme_feed.newTextChild(None, 'updated', + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())) + +# parse the input +log.debug("Parse input") +doc=libxml2.parseDoc(sys.stdin.read()) + +# find the sidebar/footer +sidebar = options.get('sidebar','//*[@class="sidebar"]') +footer = doc.xpathEval(sidebar) +if not hasattr(footer,'__len__') or len(footer) == 0: + raise Exception(sidebar + ' not 
found') +if len(footer) > 1: + log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar)) +footer = footer[0] + +# add up to 10 entry links to each subscription +subs_ul = footer.children +while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next +child = subs_ul.children +while child: + if child.name == 'li': + if child.lastChild().name == 'ul': child.lastChild().unlinkNode() + link = child.lastChild() + while link.isText(): link=link.prev + author = link.getContent() + state = 'inactive' + if feed_links.has_key(author): + ul2 = child.newChild(None, 'ul', None) + feed_links[author].sort() + feed_links[author].reverse() + link_count = 0 + for mtime, entry, title in feed_links[author]: + if not title: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a', title) + a.setProp('href', entry) + link_count = link_count + 1 + if link_count >= 10: break + if link_count > 0: state = None + if state: + link.setProp('class',((link.prop('class') or '') + ' ' + state).strip()) + child=child.next + +# create a h2 and ul for the memes list +footer_top = footer.children +memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes ')) +memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None)) + +# create a header for the memes list +a = memes.newChild(None, 'a', None) +a.setProp('href', 'memes.atom') +img = a.newChild(None, 'img', None) +img.setProp('src', 'images/feed-icon-10x10.png') + +# collect the results +log.debug("Fetch titles and collect the results") +from urllib import quote_plus +for i in range(0,len(weighted_links)): + weight, link, updated = weighted_links[i] + + # ensure that somebody new points to this entry. This guards against + # groups of related links which several posts point to all. + novel = False + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry not in seenit: + seenit.append(entry) + novel = True + if not novel: continue + + all_links[link].sort() + all_links[link].reverse() + cache_file = filename(cache, link) + title = None + + # when possible, take the title from the cache + if os.path.exists(cache_file): + entry = feedparser.parse(cache_file).entries[0] + if entry.has_key('title_detail'): + title = entry.title_detail.value + if entry.title_detail.type == 'text/plain': title = escape(title) + + # otherwise, parse the html + if not title: + title = html(link).title + + # dehtmlize + title = re.sub('&(\w+);', + lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title) + title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title) + title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title) + + # title too long? 
Insert zero width spaces where appropriate + if max(map(len,title.split())) > 30: + title=re.sub('(\W+)',u'\\1\u200b',title) + + # save the entry title (it is used later) + entry_title = title.strip() + + # add to the memes list + memes_ul.addContent('\n') + li = memes_ul.newChild(None, 'li', None) + memes_ul.addContent('\n') + + # technorati link + a = li.newChild(None, 'a', None) + tlink = 'http://technorati.com/cosmos/search.html?url=' + if link.startswith('http://'): + a.setProp('href',tlink + quote_plus(link[7:])) + else: + a.setProp('href',tlink + quote_plus(link)) + a.setProp('title','cosmos') + img = a.newChild(None, 'img', None) + img.setProp('src','tcosm11.gif') + + # main link + a = li.newTextChild(None, 'a', title.strip().encode('utf-8')) + a.setProp('href',link) + if (((i==0) or (updated>=weighted_links[i-1][2])) and + (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))): + rank = 0 + for j in range(0,len(weighted_links)): + if updated < weighted_links[j][2]: rank = rank + 1 + if rank < len(weighted_links)/2: + a.setProp('class','rising') + + # voters + ul2 = li.newChild(None, 'ul', None) + voters = [] + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry in voters: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a' , author) + a.setProp('href',entry) + if title: a.setProp('title',title) + voters.append(entry) + + # add to the meme feed + if len(all_links[link]) > 2: + meme_feed.addContent('\n') + entry = meme_feed.newChild(None, 'entry', None) + meme_feed.addContent('\n') + + # entry + tagbase = config.link().split('/') + if not tagbase[-1]: tagbase = tagbase[:-1] + tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:])) + entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest()) + entry.newTextChild(None, 'title', entry_title.encode('utf-8')) + meme_link = entry.newTextChild(None, 'link', None) + meme_link.setProp('href', link) + entry.newTextChild(None, 'updated', + time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated))) + + # voters + content = entry.newChild(None, 'content', None) + content.setProp('type', 'xhtml') + div = content.newTextChild(None, 'div', 'Spotted by:') + div.newNs('http://www.w3.org/1999/xhtml', None) + content_ul = div.newChild(None, 'ul', None) + for weight, entry, feed, title, author, mtime in all_links[link]: + li2 = content_ul.newTextChild(None, 'li', author + ": ") + a = li2.newTextChild(None, 'a' , title or 'untitled') + a.setProp('href',entry) + + count = count + 1 + if count >= 10: break + +log.info("Writing " + MEMES_ATOM) +output=open(MEMES_ATOM,'w') +output.write(feed_doc.serialize('utf-8')) +output.close() + +sys.stdout.write(doc.serialize('utf-8')) diff --git a/planet/config.py b/planet/config.py index d2b84e6..fb436e8 100644 --- a/planet/config.py +++ b/planet/config.py @@ -352,14 +352,15 @@ def filters(section=None): filters = [] if parser.has_option('Planet', 'filters'): filters += parser.get('Planet', 'filters').split() - if section and parser.has_option(section, 'filters'): - filters += parser.get(section, 'filters').split() if filter(section): filters.append('regexp_sifter.py?require=' + urllib.quote(filter(section))) if exclude(section): filters.append('regexp_sifter.py?exclude=' + urllib.quote(exclude(section))) + for section in section and [section] or template_files(): + if parser.has_option(section, 'filters'): + filters += parser.get(section, 'filters').split() return filters def planet_options(): @@ -382,6 +383,10 @@ def 
template_options(section): """ dictionary of template specific options""" return feed_options(section) +def filter_options(section): + """ dictionary of filter specific options""" + return feed_options(section) + def write(file=sys.stdout): """ write out an updated template """ print parser.write(file) diff --git a/planet/html5lib/html5parser.py b/planet/html5lib/html5parser.py index a007616..898ec9f 100644 --- a/planet/html5lib/html5parser.py +++ b/planet/html5lib/html5parser.py @@ -71,35 +71,40 @@ class HTMLParser(object): "trailingEnd": TrailingEndPhase(self, self.tree) } - def parse(self, stream, encoding=None, innerHTML=False): - """Parse a HTML document into a well-formed tree - - stream - a filelike object or string containing the HTML to be parsed - - innerHTML - Are we parsing in innerHTML mode (note innerHTML=True - is not yet supported) - - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) - """ - + def _parse(self, stream, innerHTML=False, container="div", + encoding=None): + self.tree.reset() self.firstStartTag = False self.errors = [] - self.phase = self.phases["initial"] + self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding, + parseMeta=innerHTML) + + if innerHTML: + self.innerHTML = container.lower() + + if self.innerHTML in ('title', 'textarea'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] + elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"] + elif self.innerHTML == 'plaintext': + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"] + else: + # contentModelFlag already is PCDATA + #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"] + pass + self.phase = self.phases["rootElement"] + self.phase.insertHtmlElement() + self.resetInsertionMode() + else: + self.innerHTML = False + self.phase = self.phases["initial"] + # We only seem to have InBodyPhase testcases where the following is # relevant ... need others too self.lastPhase = None - # We don't actually support innerHTML yet but this should allow - # assertations - self.innerHTML = innerHTML - - self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding) - # XXX This is temporary for the moment so there isn't any other # changes needed for the parser to work with the iterable tokenizer for token in self.tokenizer: @@ -118,7 +123,34 @@ class HTMLParser(object): # When the loop finishes it's EOF self.phase.processEOF() + def parse(self, stream, encoding=None): + """Parse a HTML document into a well-formed tree + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, innerHTML=False, encoding=encoding) return self.tree.getDocument() + + def parseFragment(self, stream, container="div", encoding=None): + """Parse a HTML fragment into a well-formed tree fragment + + container - name of the element we're setting the innerHTML property + if set to None, default to 'div' + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. 
If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, True, container=container, encoding=encoding) + return self.tree.getFragment() def parseError(self, data="XXX ERROR MESSAGE NEEDED"): # XXX The idea is to make data mandatory. @@ -187,28 +219,29 @@ class HTMLParser(object): "frameset":"inFrameset" } for node in self.tree.openElements[::-1]: + nodeName = node.name if node == self.tree.openElements[0]: last = True - if node.name not in ['td', 'th']: + if nodeName not in ['td', 'th']: # XXX assert self.innerHTML - raise NotImplementedError + nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case - if node.name in ("select", "colgroup", "head", "frameset"): + if nodeName in ("select", "colgroup", "head", "frameset"): # XXX assert self.innerHTML - if node.name in newModes: - self.phase = self.phases[newModes[node.name]] + if nodeName in newModes: + self.phase = self.phases[newModes[nodeName]] break - elif node.name == "html": + elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] else: self.phase = self.phases["afterHead"] break elif last: - self.phase = self.phases["body"] + self.phase = self.phases["inBody"] break class Phase(object): @@ -434,9 +467,7 @@ class InHeadPhase(Phase): self.parser.phase.processCharacters(data) def startTagHead(self, name, attributes): - self.tree.insertElement(name, attributes) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored")) def startTagTitle(self, name, attributes): element = self.tree.createElement(name, attributes) @@ -455,10 +486,11 @@ class InHeadPhase(Phase): self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] def startTagScript(self, name, attributes): + #XXX Inner HTML case may be wrong element = self.tree.createElement(name, attributes) element._flags.append("parser-inserted") - if self.tree.headPointer is not None and\ - self.parser.phase == self.parser.phases["inHead"]: + if (self.tree.headPointer is not None and + self.parser.phase == self.parser.phases["inHead"]): self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) @@ -653,8 +685,8 @@ class InBodyPhase(Phase): def startTagBody(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (body).")) - if len(self.tree.openElements) == 1 \ - or self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 + or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: for attr, value in attributes.iteritems(): @@ -1179,6 +1211,7 @@ class InTablePhase(Phase): self.parser.resetInsertionMode() else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagIgnore(self, name): @@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", True) + def processCharacters(self, data): self.parser.phases["inBody"].processCharacters(data) def startTagTableElement(self, name, attributes): self.parser.parseError() + #XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX how do we know the tag is _always_ ignored in the innerHTML - # case and therefore 
shouldn't be processed again? I'm not sure this - # strategy makes sense... - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inBody"].processStartTag(name, attributes) def endTagCaption(self, name): - if self.tree.elementInScope(name, True): + if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": @@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase): self.parser.phase = self.parser.phases["inTable"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX ... - if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + if not ignoreEndTag: + self.parser.phase.processEndTag(name) def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ @@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" + def processCharacters(self, data): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processCharacters(data) def startTagCol(self, name ,attributes): @@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() def startTagOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def endTagColgroup(self, name): - if self.tree.openElements[-1].name == "html": + if self.ignoreEndTagColgroup(): # innerHTML case + assert self.parser.innerHTML self.parser.parseError() else: self.tree.openElements.pop() @@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase): u"col has no end tag.")) def endTagOther(self, name): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) @@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase): def startTagTableOther(self, name, attributes): # XXX AT Any ideas on how to share this with endTagTable? - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processStartTag(name, attributes) @@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase): ") in the table body phase. 
Ignored.")) def endTagTable(self, name): - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processEndTag(name) @@ -1428,6 +1468,9 @@ class InRowPhase(Phase): self.tree.openElements[-1].name + u") in the row phase.")) self.tree.openElements.pop() + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", tableVariant=True) + # the rest def processCharacters(self, data): self.parser.phases["inTable"].processCharacters(data) @@ -1439,28 +1482,31 @@ class InRowPhase(Phase): self.tree.activeFormattingElements.append(Marker) def startTagTableOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inTable"].processStartTag(name, attributes) def endTagTr(self, name): - if self.tree.elementInScope("tr", True): + if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTableBody"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) def endTagTableRowGroup(self, name): @@ -1628,7 +1674,7 @@ class InSelectPhase(Phase): u"select phase. Ignored.")) def endTagSelect(self, name): - if self.tree.elementInScope(name, True): + if self.tree.elementInScope("select", True): node = self.tree.openElements.pop() while node.name != "select": node = self.tree.openElements.pop() @@ -1641,7 +1687,7 @@ class InSelectPhase(Phase): self.parser.parseError(_(u"Unexpected table end tag (" + name +\ ") in the select phase.")) if self.tree.elementInScope(name, True): - self.endTagSelect() + self.endTagSelect("select") self.parser.phase.processEndTag(name) def endTagOther(self, name): @@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase): u"in the frameset phase (innerHTML).")) else: self.tree.openElements.pop() - if not self.parser.innerHTML and\ - self.tree.openElements[-1].name != "frameset": + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): # If we're not in innerHTML mode and the the current node is not a # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] diff --git a/planet/html5lib/inputstream.py b/planet/html5lib/inputstream.py index 9140456..e197415 100644 --- a/planet/html5lib/inputstream.py +++ b/planet/html5lib/inputstream.py @@ -14,7 +14,7 @@ class HTMLInputStream(object): """ - def __init__(self, source, encoding=None, chardet=True): + def __init__(self, source, encoding=None, parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -26,6 +26,8 @@ class HTMLInputStream(object): the encoding. 
If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) + + parseMeta - Look for a element containing encoding information """ # List of where new lines occur @@ -41,12 +43,9 @@ class HTMLInputStream(object): #Encoding to use if no other information can be found self.defaultEncoding = "windows-1252" - #Autodetect encoding if no other information can be found? - self.chardet = chardet - #Detect encoding iff no explicit "transport level" encoding is supplied if encoding is None or not isValidEncoding(encoding): - encoding = self.detectEncoding() + encoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = encoding # Read bytes from stream decoding them into Unicode @@ -79,17 +78,17 @@ class HTMLInputStream(object): stream = cStringIO.StringIO(str(source)) return stream - def detectEncoding(self): + def detectEncoding(self, parseMeta=True, chardet=True): #First look for a BOM #This will also read past the BOM if present encoding = self.detectBOM() #If there is no BOM need to look for meta elements with encoding #information - if encoding is None: + if encoding is None and parseMeta: encoding = self.detectEncodingMeta() #Guess with chardet, if avaliable - if encoding is None and self.chardet: + if encoding is None and chardet: try: import chardet buffer = self.rawStream.read() diff --git a/planet/html5lib/tokenizer.py b/planet/html5lib/tokenizer.py index 3f4db08..584b268 100644 --- a/planet/html5lib/tokenizer.py +++ b/planet/html5lib/tokenizer.py @@ -32,8 +32,8 @@ class HTMLTokenizer(object): # XXX need to fix documentation - def __init__(self, stream, encoding=None): - self.stream = HTMLInputStream(stream, encoding) + def __init__(self, stream, encoding=None, parseMeta=True): + self.stream = HTMLInputStream(stream, encoding, parseMeta) self.states = { "data":self.dataState, @@ -338,31 +338,33 @@ class HTMLTokenizer(object): self.state = self.states["closeTagOpen"] else: self.tokenQueue.append({"type": "Characters", "data": u"<"}) - self.stream.queue.append(data) + self.stream.queue.insert(0, data) self.state = self.states["data"] return True def closeTagOpenState(self): - if self.contentModelFlag in (contentModelFlags["RCDATA"],\ - contentModelFlags["CDATA"]): - charStack = [] + if (self.contentModelFlag in (contentModelFlags["RCDATA"], + contentModelFlags["CDATA"])): + if self.currentToken: + charStack = [] - # So far we know that "", u"/", u"<", EOF))): # Because the characters are correct we can safely switch to diff --git a/planet/html5lib/treebuilders/_base.py b/planet/html5lib/treebuilders/_base.py index 2502466..6c7bb0b 100755 --- a/planet/html5lib/treebuilders/_base.py +++ b/planet/html5lib/treebuilders/_base.py @@ -108,6 +108,9 @@ class TreeBuilder(object): #The class to use for creating doctypes doctypeClass = None + + #Fragment class + fragmentClass = None def __init__(self): self.reset() @@ -294,7 +297,6 @@ class TreeBuilder(object): fosterParent = self.openElements[ self.openElements.index(lastTable) - 1] else: - assert self.innerHTML fosterParent = self.openElements[0] return fosterParent, insertBefore @@ -310,6 +312,13 @@ class TreeBuilder(object): def getDocument(self): "Return the final tree" return self.document + + def getFragment(self): + "Return the final fragment" + #assert self.innerHTML + fragment = self.fragmentClass() + self.openElements[0].reparentChildren(fragment) + return fragment def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests diff --git 
a/planet/html5lib/treebuilders/dom.py b/planet/html5lib/treebuilders/dom.py index 8b52d6a..bfaa880 100755 --- a/planet/html5lib/treebuilders/dom.py +++ b/planet/html5lib/treebuilders/dom.py @@ -1,6 +1,8 @@ import _base from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE import new +from xml.sax.saxutils import escape +from constants import voidElements import re illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") @@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder): def commentClass(self, data): return NodeBuilder(self.dom.createComment(data)) + + def fragmentClass(self): + return NodeBuilder(self.dom.createDocumentFragment()) def appendChild(self, node): self.dom.appendChild(node.element) @@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder): def getDocument(self): return self.dom + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element def insertText(self, data, parent=None): data=illegal_xml_chars.sub(u'\uFFFD',data) @@ -118,7 +126,9 @@ def testSerializer(element): if element.nodeType == Node.DOCUMENT_TYPE_NODE: rv.append("|%s"%(' '*indent, element.name)) elif element.nodeType == Node.DOCUMENT_NODE: - rv.append("#document") + rv.append("#document") + elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + rv.append("#document-fragment") elif element.nodeType == Node.COMMENT_NODE: rv.append("|%s"%(' '*indent, element.nodeValue)) elif element.nodeType == Node.TEXT_NODE: @@ -135,6 +145,32 @@ def testSerializer(element): return "\n".join(rv) +class HTMLSerializer(object): + def serialize(self, node): + rv = self.serializeNode(node) + for child in node.childNodes: + rv += self.serialize(child) + if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements: + rv += "\n"%node.nodeName + return rv + + def serializeNode(self, node): + if node.nodeType == Node.TEXT_NODE: + rv = node.nodeValue + elif node.nodeType == Node.ELEMENT_NODE: + rv = "<%s"%node.nodeName + if node.hasAttributes(): + rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in + node.attributes.items()]) + rv += ">" + elif node.nodeType == Node.COMMENT_NODE: + rv = "" % escape(node.nodeValue) + elif node.nodeType == Node.DOCUMENT_TYPE_NODE: + rv = "" % node.name + else: + rv = "" + return rv + def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): if node.nodeType == Node.ELEMENT_NODE: if not nsmap: @@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): elif node.nodeType == Node.DOCUMENT_NODE: handler.startDocument() for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endDocument() + handler.endDocument() + + elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: dom2sax(child, handler, nsmap) else: # ATTRIBUTE_NODE diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py index acead55..2629664 100644 --- a/planet/html5lib/treebuilders/etreefull.py +++ b/planet/html5lib/treebuilders/etreefull.py @@ -129,6 +129,10 @@ class Document(Element): def __init__(self): Element.__init__(self, Document) +class DocumentFragment(Element): + def __init__(self): + Element.__init__(self, DocumentFragment) + def testSerializer(element): rv = [] finalText = None @@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder): doctypeClass = DocumentType elementClass = Element commentClass = Comment + fragmentClass = DocumentFragment def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.document._element + + def 
getFragment(self): + return _base.TreeBuilder.getFragment(self)._element diff --git a/planet/html5lib/treebuilders/simpletree.py b/planet/html5lib/treebuilders/simpletree.py index 6b2f09e..05dc0c0 100755 --- a/planet/html5lib/treebuilders/simpletree.py +++ b/planet/html5lib/treebuilders/simpletree.py @@ -4,6 +4,7 @@ from xml.sax.saxutils import escape # Really crappy basic implementation of a DOM-core like thing class Node(_base.Node): + type = -1 def __init__(self, name): self.name = name self.parent = None @@ -11,15 +12,18 @@ class Node(_base.Node): self.childNodes = [] self._flags = [] + def __iter__(self): + for node in self.childNodes: + yield node + for item in node: + yield item + def __unicode__(self): return self.name def toxml(self): raise NotImplementedError - def __repr__(self): - return "<%s %s>" % (self.__class__, self.name) - def printTree(self, indent=0): tree = '\n|%s%s' % (' '* indent, unicode(self)) for child in self.childNodes: @@ -69,6 +73,7 @@ class Node(_base.Node): return bool(self.childNodes) class Document(Node): + type = 1 def __init__(self): Node.__init__(self, None) @@ -93,7 +98,13 @@ class Document(Node): tree += child.printTree(2) return tree +class DocumentFragment(Document): + type = 2 + def __unicode__(self): + return "#document-fragment" + class DocumentType(Node): + type = 3 def __init__(self, name): Node.__init__(self, name) @@ -106,6 +117,7 @@ class DocumentType(Node): return '<!DOCTYPE %s>' % self.name class TextNode(Node): + type = 4 def __init__(self, value): Node.__init__(self, None) self.value = value @@ -119,6 +131,7 @@ class TextNode(Node): hilite = toxml class Element(Node): + type = 5 def __init__(self, name): Node.__init__(self, name) self.attributes = {} @@ -164,6 +177,7 @@ class Element(Node): return tree class CommentNode(Node): + type = 6 def __init__(self, data): Node.__init__(self, None) self.data = data @@ -177,11 +191,38 @@ class CommentNode(Node): def hilite(self): return '<!--%s-->' % escape(self.data) +class HTMLSerializer(object): + def serialize(self, node): + rv = self.serializeNode(node) + for child in node.childNodes: + rv += self.serialize(child) + if node.type == Element.type and node.name not in voidElements: + rv += "\n"%node.name + return rv + + def serializeNode(self, node): + if node.type == TextNode.type: + rv = node.value + elif node.type == Element.type: + rv = "<%s"%node.name + if node.attributes: + rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in + node.attributes.iteritems()]) + rv += ">" + elif node.type == CommentNode.type: + rv = "" % escape(node.data) + elif node.type == DocumentType.type: + rv = "" % node.name + else: + rv = "" + return rv + class TreeBuilder(_base.TreeBuilder): documentClass = Document doctypeClass = DocumentType elementClass = Element commentClass = CommentNode + fragmentClass = DocumentFragment def testSerializer(self, node): return node.printTree() diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py index 8d28045..dd2abd3 100644 --- a/planet/shell/__init__.py +++ b/planet/shell/__init__.py @@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'): # Execute the shell module options = planet.config.template_options(template_file) + if module_name == 'plugin': options['__file__'] = template_file options.update(extra_options) log.debug("Processing %s %s using %s", mode, os.path.realpath(template_resolved), module_name) diff --git a/planet/shell/plugin.py b/planet/shell/plugin.py new file mode 100644 index 0000000..dd94380 --- /dev/null +++ 
b/planet/shell/plugin.py @@ -0,0 +1,64 @@ +import os, sys, imp +from StringIO import StringIO + +def run(script, doc, output_file=None, options={}): + """ process an Python script using imp """ + save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv) + plugin_stdout = StringIO() + plugin_stderr = StringIO() + + try: + # redirect stdin + sys.stdin = StringIO(doc) + + # redirect stdout + if output_file: + sys.stdout = open(output_file, 'w') + else: + sys.stdout = plugin_stdout + + # redirect stderr + sys.stderr = plugin_stderr + + # determine __file__ value + if options.has_key("__file__"): + plugin_file = options["__file__"] + del options["__file__"] + else: + plugin_file = script + + # set sys.argv + options = sum([['--'+key, value] for key,value in options.items()], []) + sys.argv = [plugin_file] + options + + # import script + handle = open(script, 'r') + cwd = os.getcwd() + try: + try: + try: + description=('.plugin', 'rb', imp.PY_SOURCE) + imp.load_module('__main__',handle,plugin_file,description) + except SystemExit,e: + if e.code: log.error('%s exit rc=%d',(plugin_file,e.code)) + except Exception, e: + import traceback + type, value, tb = sys.exc_info() + plugin_stderr.write(''.join( + traceback.format_exception_only(type,value) + + traceback.format_tb(tb))) + finally: + handle.close() + if cwd != os.getcwd(): os.chdir(cwd) + + finally: + # restore system state + sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys + + # log anything sent to stderr + if plugin_stderr.getvalue(): + import planet + planet.logger.error(plugin_stderr.getvalue()) + + # return stdout + return plugin_stdout.getvalue() diff --git a/planet/spider.py b/planet/spider.py index 11fe94a..b18a787 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log): def spiderPlanet(only_if_new = False): """ Spider (fetch) an entire planet """ - # log = planet.getLogger(config.log_level(),config.log_format()) log = planet.getLogger(config.log_level(),config.log_format()) global index diff --git a/runtests.py b/runtests.py index d14058d..7783d14 100755 --- a/runtests.py +++ b/runtests.py @@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'): if sys.path[0]: os.chdir(sys.path[0]) sys.path[0] = os.getcwd() -# find all of the planet test modules -modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py'))) +# determine verbosity +verbosity = 1 +for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)): + if arg in sys.argv: + verbosity = value + sys.argv.remove(arg) -# enable warnings +# find all of the planet test modules +modules = [] +for pattern in sys.argv[1:] or ['test_*.py']: + modules += map(fullmodname, glob.glob(os.path.join('tests', pattern))) + +# enable logging import planet -planet.getLogger("WARNING",None) +if verbosity == 0: planet.getLogger("FATAL",None) +if verbosity == 1: planet.getLogger("WARNING",None) +if verbosity == 2: planet.getLogger("DEBUG",None) # load all of the tests into a suite try: @@ -33,11 +44,5 @@ except Exception, exception: for module in modules: __import__(module) raise -verbosity = 1 -if "-q" in sys.argv or '--quiet' in sys.argv: - verbosity = 0 -if "-v" in sys.argv or '--verbose' in sys.argv: - verbosity = 2 - # run test suite unittest.TextTestRunner(verbosity=verbosity).run(suite) diff --git a/tests/data/apply/config-mememe.ini b/tests/data/apply/config-mememe.ini new file mode 100644 index 0000000..c6ca9bd --- /dev/null +++ b/tests/data/apply/config-mememe.ini @@ -0,0 +1,29 
@@ +[Planet] +output_theme = classic_fancy +output_dir = tests/work/apply +name = test planet +cache_directory = tests/work/spider/cache + +bill_of_materials: + images/#{face} + +[index.html.tmpl] +filters: + html2xhtml.plugin + mememe.plugin + +[mememe.plugin] +sidebar = //*[@class='sidebar'] + +[tests/data/spider/testfeed0.atom] +name = not found + +[tests/data/spider/testfeed1b.atom] +name = one +face = jdub.png + +[tests/data/spider/testfeed2.atom] +name = two + +[tests/data/spider/testfeed3.rss] +name = three diff --git a/tests/test_apply.py b/tests/test_apply.py index ec5a8e5..dafa37a 100644 --- a/tests/test_apply.py +++ b/tests/test_apply.py @@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase): os.makedirs(workdir) def tearDown(self): - shutil.rmtree(workdir) - os.removedirs(os.path.split(workdir)[0]) + shutil.rmtree(os.path.split(workdir)[0]) def test_apply_asf(self): config.load(configfile % 'asf') @@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase): output = open(os.path.join(workdir, 'index.html4')).read() self.assertTrue(output.find('/>')<0) + def test_apply_filter_mememe(self): + config.load(configfile % 'mememe') + self.apply_fancy() + + output = open(os.path.join(workdir, 'index.html')).read() + self.assertTrue(output.find('