MeMeme and html2xhtml plugins

This commit is contained in:
Sam Ruby 2007-04-30 09:38:09 -04:00
parent ddf15fc689
commit a5e1fde287
20 changed files with 878 additions and 119 deletions

View File

@ -1,3 +1,4 @@
*.tmplc
.DS_Store
cache
*.pluginc

View File

@ -8,12 +8,13 @@
<title>Venus Filters</title>
</head>
<body>
<h2>Filters</h2>
<p>Filters are simple Unix pipes. Input comes in <code>stdin</code>,
parameters come from the config file, and output goes to <code>stdout</code>.
Anything written to <code>stderr</code> is logged as an ERROR message. If no
<code>stdout</code> is produced, the entry is not written to the cache or
processed further; in fact, if the entry had previously been written to the cache, it will be removed.</p>
<h2>Filters and Plugins</h2>
<p>Filters and plugins are simple Unix pipes. Input comes in
<code>stdin</code>, parameters come from the config file, and output goes to
<code>stdout</code>. Anything written to <code>stderr</code> is logged as an
ERROR message. If no <code>stdout</code> is produced, the entry is not written
to the cache or processed further; in fact, if the entry had previously been
written to the cache, it will be removed.</p>
<p>There are two types of filters supported by Venus, input and template.</p>
<p>Input to an input filter is a aggressively
@ -89,6 +90,16 @@ an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to
everything.</li>
<li>Plugins differ from filters in that while filters are forked, plugins are
<a href="http://docs.python.org/lib/module-imp.html">imported</a>. This
means that plugins are limited to Python and are run in-process. Plugins
therefore have direct access to planet internals like configuration and
logging facilities, as well as access to the bundled libraries like the
<a href="http://feedparser.org/docs/">Universal Feed Parser</a> and
<a href="http://code.google.com/p/html5lib/">html5lib</a>; but it also
means that functions like <code>os.abort()</code> can't be recovered
from.</li>
</ul>
</body>
</html>
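The pipe contract described in the documentation above can be made concrete with a minimal sketch: a hypothetical pass-through filter (illustrative only, not part of this commit) that reads the normalized entry from stdin and writes it back to stdout unchanged.

import sys
entry = sys.stdin.read()
# a real filter would transform the entry here; emitting nothing on stdout
# tells Venus to drop the entry (and remove it from the cache if present).
sys.stdout.write(entry)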

View File

@ -21,7 +21,7 @@
<ul>
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
<li><a href="filters.html">Filters</a></li>
<li><a href="filters.html">Filters and Plugins</a></li>
</ul>
</li>
<li>Other

View File

@ -36,6 +36,13 @@ filters = excerpt.py
omit = img p br
width = 500
# add memes to output
[index.html.tmpl]
filters = mememe.plugin
[mememe.plugin]
sidebar = //*[@id="footer"]
# subscription list
[http://share.opml.org/opml/top100.opml]
content_type = opml

View File

@ -0,0 +1,6 @@
import sys
from planet import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)
sys.stdout.write(document.toxml("utf-8"))
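The six-line plugin above reparses whatever HTML arrives on stdin with the bundled html5lib and emits well-formed XHTML on stdout. A hedged sketch of exercising it in-process, assuming a Venus checkout as the working directory and using the run() helper that planet/shell/plugin.py introduces later in this commit:

from planet.shell import plugin
broken = '<p>an <b>unterminated paragraph'
# run() redirects stdin/stdout, imports the script as __main__, and returns
# whatever the plugin wrote to stdout
xhtml = plugin.run('filters/html2xhtml.plugin', broken)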

475
filters/mememe.plugin Normal file
View File

@ -0,0 +1,475 @@
#
# This Venus output filter will annotate an XHTML page with a list of
# "memes" (or most popular linked destinations, based on the last week
# of entries from the cache) and will update the subscription list with
# links to recent entries from each subscription.
#
# Templates that don't produce XHTML natively will need their output passed
# through html2xhtml.plugin first.
#
# Typical configuration (based on classic_fancy):
#
# [index.html.tmpl]
# filters:
# html2xhtml.plugin
# mememe.plugin
#
# [mememe.plugin]
# sidebar = @class='sidebar'
#
import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
from planet import config, feedparser
from planet.spider import filename
log = planet.getLogger(config.log_level(),config.log_format())
options = config.filter_options(sys.argv[0])
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
now = time.time()
week = 7 * 86400
week_ago = now - week
cache = config.cache_directory()
meme_cache = os.path.join(cache, 'memes')
if not os.path.exists(meme_cache): os.makedirs(meme_cache)
all_links = {}
feed_links = {}
def check_cache(url):
try:
file = open(filename(meme_cache, url))
headers = eval(file.read())
file.close()
return headers or {}
except:
return {}
def cache_meme(url, headers):
json = []
for key,value in headers.items():
json.append(' %s: %s' % (toj(key), toj(value)))
file = open(filename(meme_cache, url),'w')
file.write('{\n' + ',\n'.join(json) + '\n}\n')
file.close()
urlmap = {}
def canonicalize(url):
url = urlmap.get(url,url)
parts = list(urlparse.urlparse(url))
parts[0] = parts[0].lower()
parts[1] = parts[1].lower()
if parts[1].startswith('www.'): parts[1]=parts[1][4:]
if not parts[2]: parts[2] = '/'
parts[-1] = ''
return urlparse.urlunparse(parts)
log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
# ensure that this is within the past week
if os.path.isdir(name): continue
mtime = os.stat(name).st_mtime
if mtime < week_ago: continue
# parse the file
try:
doc = libxml2.parseFile(name)
except:
continue
xp = doc.xpathNewContext()
xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
# determine the entry
entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
if not entry: continue
entry = canonicalize(entry[0].prop("href"))
# determine the title
title = xp.xpathEval("/atom:entry/atom:title")
if title:
if title[0].prop('type') == 'html':
title = re.sub('<.*?>','',title[0].content)
else:
title = title[0].content
title = str(title or '')
# determine the feed id
feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
if not feed: continue
feed = feed[0].content
# determine the author
author = xp.xpathEval("/atom:entry/atom:source/planet:name")
if author:
author = author[0].content
else:
author = ''
# track the feed_links
if author:
if not feed_links.has_key(author): feed_links[author] = list()
feed_links[author].append([mtime, entry, title])
# identify the unique links
entry_links = []
for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
parent = node.parent
while parent:
if parent.name == 'source': break
parent = parent.parent
else:
link = canonicalize(node.prop('href'))
if not link in entry_links:
entry_links.append(link)
if node.hasProp('title') and node.prop('title').startswith('http'):
link = canonicalize(node.prop('title'))
if not link in entry_links:
entry_links.append(link)
# add the votes
weight = 1.0 - (now - mtime)**2 / week**2
vote = [(weight, str(entry), str(feed), title, author, mtime)]
for link in entry_links:
all_links[link] = all_links.get(link,list()) + vote
# free the entry
doc.freeDoc()
# tally the votes
weighted_links = []
for link, votes in all_links.items():
site = {}
updated = 0
for weight, entry, feed, title, author, mtime in votes:
site[feed] = max(site.get(feed,0), weight)
if mtime > updated: updated=mtime
weighted_links.append((sum(site.values()), link, updated))
weighted_links.sort()
weighted_links.reverse()
cp1252 = {
128: 8364, # euro sign
130: 8218, # single low-9 quotation mark
131: 402, # latin small letter f with hook
132: 8222, # double low-9 quotation mark
133: 8230, # horizontal ellipsis
134: 8224, # dagger
135: 8225, # double dagger
136: 710, # modifier letter circumflex accent
137: 8240, # per mille sign
138: 352, # latin capital letter s with caron
139: 8249, # single left-pointing angle quotation mark
140: 338, # latin capital ligature oe
142: 381, # latin capital letter z with caron
145: 8216, # left single quotation mark
146: 8217, # right single quotation mark
147: 8220, # left double quotation mark
148: 8221, # right double quotation mark
149: 8226, # bullet
150: 8211, # en dash
151: 8212, # em dash
152: 732, # small tilde
153: 8482, # trade mark sign
154: 353, # latin small letter s with caron
155: 8250, # single right-pointing angle quotation mark
156: 339, # latin small ligature oe
158: 382, # latin small letter z with caron
159: 376} # latin capital letter y with diaeresis
# determine the title for a given url
class html(sgmllib.SGMLParser):
def __init__(self, url):
sgmllib.SGMLParser.__init__(self)
self.title = ""
self.feedurl = ""
self.intitle = False
headers = check_cache(url)
try:
# fetch the page
request = urllib2.Request(url)
request.add_header('User-Agent', 'Venus/MeMeme')
if headers.has_key('etag'):
request.add_header('If-None-Match', headers['etag'])
if headers.has_key('last-modified'):
request.add_header('If-Modified-Since', headers['last-modified'])
response = urllib2.urlopen(request)
self.feed(response.read())
# ensure the data is in utf-8
try:
self.title = self.title.decode('utf-8')
except:
self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
for c in self.title.decode('iso-8859-1')])
# cache the results
headers = {}
if self.feedurl: headers['feedurl'] = self.feedurl
if self.title: headers['title'] = self.title
headers.update(response.headers)
cache_meme(url, headers)
except:
self.feedurl = headers.get('feedurl')
if headers.has_key('title'):
if isinstance(headers['title'],str):
self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
else:
self.title=headers['title']
# if there is a feed, look for an entry that matches, and take that title
if self.feedurl and not self.title:
headers = check_cache(self.feedurl)
data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
modified=headers.get('last-modified'))
if data.has_key('headers') and data.has_key('status') and \
data.status in [200, 301, 302]:
titles = {}
for entry in data.entries:
if entry.has_key('title_detail') and entry.has_key('link'):
titles[entry.link] = entry.title_detail.value
if entry.title_detail.type == 'text/plain':
titles[entry.link] = escape(titles[entry.link])
if titles.has_key(url): self.title = titles[url]
data.headers.update(titles)
cache_meme(self.feedurl, data.headers)
else:
if headers.has_key(url):
if isinstance(headers[url],str):
self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
else:
self.title=headers[url]
# fallback is the basename of the URI
if not self.title:
self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
# parse out the first autodiscovery link
def start_link(self, attrs):
if self.feedurl: return
attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
if not 'rel' in attrs: return
rels = attrs['rel'].split(' ')
if 'alternate' not in rels: return
if not 'type' in attrs or not attrs['type'].endswith('xml'): return
if 'href' in attrs:
self.feedurl = attrs['href']
# parse the page title
def start_title(self, attributes):
if not self.title: self.intitle = True
def end_title(self):
self.intitle = False
def handle_data(self, text):
if self.intitle: self.title += escape(text)
# convert unicode string to a json string
def toj(value):
result = repr(value).replace(r'\x',r'\u00')
if result[:1] == 'u': result=result[1:]
if result.startswith("'"):
result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
return result
seenit = []
count = 0
# construct an empty feed
feed_doc = libxml2.newDoc("1.0")
meme_feed = feed_doc.newChild(None, "feed", None)
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
author = meme_feed.newChild(None, 'author', None)
author.newTextChild(None, 'name', config.owner_name())
if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
link = meme_feed.newChild(None, 'link', None)
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
link.setProp('rel', 'self')
meme_feed.newTextChild(None, 'updated',
time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
# parse the input
log.debug("Parse input")
doc=libxml2.parseDoc(sys.stdin.read())
# find the sidebar/footer
sidebar = options.get('sidebar','//*[@class="sidebar"]')
footer = doc.xpathEval(sidebar)
if not hasattr(footer,'__len__') or len(footer) == 0:
raise Exception(sidebar + ' not found')
if len(footer) > 1:
log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar))
footer = footer[0]
# add up to 10 entry links to each subscription
subs_ul = footer.children
while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
child = subs_ul.children
while child:
if child.name == 'li':
if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
link = child.lastChild()
while link.isText(): link=link.prev
author = link.getContent()
state = 'inactive'
if feed_links.has_key(author):
ul2 = child.newChild(None, 'ul', None)
feed_links[author].sort()
feed_links[author].reverse()
link_count = 0
for mtime, entry, title in feed_links[author]:
if not title: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a', title)
a.setProp('href', entry)
link_count = link_count + 1
if link_count >= 10: break
if link_count > 0: state = None
if state:
link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
child=child.next
# create a h2 and ul for the memes list
footer_top = footer.children
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
# create a header for the memes list
a = memes.newChild(None, 'a', None)
a.setProp('href', 'memes.atom')
img = a.newChild(None, 'img', None)
img.setProp('src', 'images/feed-icon-10x10.png')
# collect the results
log.debug("Fetch titles and collect the results")
from urllib import quote_plus
for i in range(0,len(weighted_links)):
weight, link, updated = weighted_links[i]
# ensure that somebody new points to this entry. This guards against
# groups of related links which several posts all point to.
novel = False
for weight, entry, feed, title, author, mtime in all_links[link]:
if entry not in seenit:
seenit.append(entry)
novel = True
if not novel: continue
all_links[link].sort()
all_links[link].reverse()
cache_file = filename(cache, link)
title = None
# when possible, take the title from the cache
if os.path.exists(cache_file):
entry = feedparser.parse(cache_file).entries[0]
if entry.has_key('title_detail'):
title = entry.title_detail.value
if entry.title_detail.type == 'text/plain': title = escape(title)
# otherwise, parse the html
if not title:
title = html(link).title
# dehtmlize
title = re.sub('&(\w+);',
lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
# title too long? Insert zero width spaces where appropriate
if max(map(len,title.split())) > 30:
title=re.sub('(\W+)',u'\\1\u200b',title)
# save the entry title (it is used later)
entry_title = title.strip()
# add to the memes list
memes_ul.addContent('\n')
li = memes_ul.newChild(None, 'li', None)
memes_ul.addContent('\n')
# technorati link
a = li.newChild(None, 'a', None)
tlink = 'http://technorati.com/cosmos/search.html?url='
if link.startswith('http://'):
a.setProp('href',tlink + quote_plus(link[7:]))
else:
a.setProp('href',tlink + quote_plus(link))
a.setProp('title','cosmos')
img = a.newChild(None, 'img', None)
img.setProp('src','tcosm11.gif')
# main link
a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
a.setProp('href',link)
if (((i==0) or (updated>=weighted_links[i-1][2])) and
(i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
rank = 0
for j in range(0,len(weighted_links)):
if updated < weighted_links[j][2]: rank = rank + 1
if rank < len(weighted_links)/2:
a.setProp('class','rising')
# voters
ul2 = li.newChild(None, 'ul', None)
voters = []
for weight, entry, feed, title, author, mtime in all_links[link]:
if entry in voters: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a' , author)
a.setProp('href',entry)
if title: a.setProp('title',title)
voters.append(entry)
# add to the meme feed
if len(all_links[link]) > 2:
meme_feed.addContent('\n')
entry = meme_feed.newChild(None, 'entry', None)
meme_feed.addContent('\n')
# entry
tagbase = config.link().split('/')
if not tagbase[-1]: tagbase = tagbase[:-1]
tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
meme_link = entry.newTextChild(None, 'link', None)
meme_link.setProp('href', link)
entry.newTextChild(None, 'updated',
time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))
# voters
content = entry.newChild(None, 'content', None)
content.setProp('type', 'xhtml')
div = content.newTextChild(None, 'div', 'Spotted by:')
div.newNs('http://www.w3.org/1999/xhtml', None)
content_ul = div.newChild(None, 'ul', None)
for weight, entry, feed, title, author, mtime in all_links[link]:
li2 = content_ul.newTextChild(None, 'li', author + ": ")
a = li2.newTextChild(None, 'a' , title or 'untitled')
a.setProp('href',entry)
count = count + 1
if count >= 10: break
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
output.close()
sys.stdout.write(doc.serialize('utf-8'))

View File

@ -352,14 +352,15 @@ def filters(section=None):
filters = []
if parser.has_option('Planet', 'filters'):
filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(exclude(section)))
for section in section and [section] or template_files():
if parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
return filters
def planet_options():
@ -382,6 +383,10 @@ def template_options(section):
""" dictionary of template specific options"""
return feed_options(section)
def filter_options(section):
""" dictionary of filter specific options"""
return feed_options(section)
def write(file=sys.stdout):
""" write out an updated template """
print parser.write(file)

View File

@ -71,35 +71,40 @@ class HTMLParser(object):
"trailingEnd": TrailingEndPhase(self, self.tree)
}
def parse(self, stream, encoding=None, innerHTML=False):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
is not yet supported)
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
def _parse(self, stream, innerHTML=False, container="div",
encoding=None):
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.phase = self.phases["initial"]
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
parseMeta=innerHTML)
if innerHTML:
self.innerHTML = container.lower()
if self.innerHTML in ('title', 'textarea'):
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
else:
# contentModelFlag already is PCDATA
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
pass
self.phase = self.phases["rootElement"]
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
self.innerHTML = False
self.phase = self.phases["initial"]
# We only seem to have InBodyPhase testcases where the following is
# relevant ... need others too
self.lastPhase = None
# We don't actually support innerHTML yet but this should allow
# assertions
self.innerHTML = innerHTML
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
# XXX This is temporary for the moment so there isn't any other
# changes needed for the parser to work with the iterable tokenizer
for token in self.tokenizer:
@ -118,7 +123,34 @@ class HTMLParser(object):
# When the loop finishes it's EOF
self.phase.processEOF()
def parse(self, stream, encoding=None):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
self._parse(stream, innerHTML=False, encoding=encoding)
return self.tree.getDocument()
def parseFragment(self, stream, container="div", encoding=None):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
if set to None, default to 'div'
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
"""
self._parse(stream, True, container=container, encoding=encoding)
return self.tree.getFragment()
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
@ -187,28 +219,29 @@ class HTMLParser(object):
"frameset":"inFrameset"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
if node == self.tree.openElements[0]:
last = True
if node.name not in ['td', 'th']:
if nodeName not in ['td', 'th']:
# XXX
assert self.innerHTML
raise NotImplementedError
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if node.name in ("select", "colgroup", "head", "frameset"):
if nodeName in ("select", "colgroup", "head", "frameset"):
# XXX
assert self.innerHTML
if node.name in newModes:
self.phase = self.phases[newModes[node.name]]
if nodeName in newModes:
self.phase = self.phases[newModes[nodeName]]
break
elif node.name == "html":
elif nodeName == "html":
if self.tree.headPointer is None:
self.phase = self.phases["beforeHead"]
else:
self.phase = self.phases["afterHead"]
break
elif last:
self.phase = self.phases["body"]
self.phase = self.phases["inBody"]
break
class Phase(object):
@ -434,9 +467,7 @@ class InHeadPhase(Phase):
self.parser.phase.processCharacters(data)
def startTagHead(self, name, attributes):
self.tree.insertElement(name, attributes)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))
def startTagTitle(self, name, attributes):
element = self.tree.createElement(name, attributes)
@ -455,10 +486,11 @@ class InHeadPhase(Phase):
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagScript(self, name, attributes):
#XXX Inner HTML case may be wrong
element = self.tree.createElement(name, attributes)
element._flags.append("parser-inserted")
if self.tree.headPointer is not None and\
self.parser.phase == self.parser.phases["inHead"]:
if (self.tree.headPointer is not None and
self.parser.phase == self.parser.phases["inHead"]):
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
@ -653,8 +685,8 @@ class InBodyPhase(Phase):
def startTagBody(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag (body)."))
if len(self.tree.openElements) == 1 \
or self.tree.openElements[1].name != "body":
if (len(self.tree.openElements) == 1
or self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
for attr, value in attributes.iteritems():
@ -1179,6 +1211,7 @@ class InTablePhase(Phase):
self.parser.resetInsertionMode()
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagIgnore(self, name):
@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase):
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", True)
def processCharacters(self, data):
self.parser.phases["inBody"].processCharacters(data)
def startTagTableElement(self, name, attributes):
self.parser.parseError()
#XXX Have to duplicate logic here to find out if the tag is ignored
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
# XXX how do we know the tag is _always_ ignored in the innerHTML
# case and therefore shouldn't be processed again? I'm not sure this
# strategy makes sense...
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inBody"].processStartTag(name, attributes)
def endTagCaption(self, name):
if self.tree.elementInScope(name, True):
if not self.ignoreEndTagCaption():
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "caption":
@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase):
self.parser.phase = self.parser.phases["inTable"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
self.parser.parseError()
ignoreEndTag = self.ignoreEndTagCaption()
self.parser.phase.processEndTag("caption")
# XXX ...
if not self.parser.innerHTML:
self.parser.phase.processStartTag(name, attributes)
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase):
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html"
def processCharacters(self, data):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processCharacters(data)
def startTagCol(self, name ,attributes):
@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase):
self.tree.openElements.pop()
def startTagOther(self, name, attributes):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX how can be sure it's always ignored?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def endTagColgroup(self, name):
if self.tree.openElements[-1].name == "html":
if self.ignoreEndTagColgroup():
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
else:
self.tree.openElements.pop()
@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase):
u"col has no end tag."))
def endTagOther(self, name):
ignoreEndTag = self.ignoreEndTagColgroup()
self.endTagColgroup("colgroup")
# XXX how can be sure it's always ignored?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase):
def startTagTableOther(self, name, attributes):
# XXX AT Any ideas on how to share this with endTagTable?
if self.tree.elementInScope("tbody", True) or \
self.tree.elementInScope("thead", True) or \
self.tree.elementInScope("tfoot", True):
if (self.tree.elementInScope("tbody", True) or
self.tree.elementInScope("thead", True) or
self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processStartTag(name, attributes)
@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase):
") in the table body phase. Ignored."))
def endTagTable(self, name):
if self.tree.elementInScope("tbody", True) or \
self.tree.elementInScope("thead", True) or \
self.tree.elementInScope("tfoot", True):
if (self.tree.elementInScope("tbody", True) or
self.tree.elementInScope("thead", True) or
self.tree.elementInScope("tfoot", True)):
self.clearStackToTableBodyContext()
self.endTagTableRowGroup(self.tree.openElements[-1].name)
self.parser.phase.processEndTag(name)
@ -1428,6 +1468,9 @@ class InRowPhase(Phase):
self.tree.openElements[-1].name + u") in the row phase."))
self.tree.openElements.pop()
def ignoreEndTagTr(self):
return not self.tree.elementInScope("tr", tableVariant=True)
# the rest
def processCharacters(self, data):
self.parser.phases["inTable"].processCharacters(data)
@ -1439,28 +1482,31 @@ class InRowPhase(Phase):
self.tree.activeFormattingElements.append(Marker)
def startTagTableOther(self, name, attributes):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# XXX how are we sure it's always ignored in the innerHTML case?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.phases["inTable"].processStartTag(name, attributes)
def endTagTr(self, name):
if self.tree.elementInScope("tr", True):
if not self.ignoreEndTagTr():
self.clearStackToTableRowContext()
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTableBody"]
else:
# innerHTML case
assert self.parser.innerHTML
self.parser.parseError()
def endTagTable(self, name):
ignoreEndTag = self.ignoreEndTagTr()
self.endTagTr("tr")
# Reprocess the current tag if the tr end tag was not ignored
# XXX how are we sure it's always ignored in the innerHTML case?
if not self.parser.innerHTML:
if not ignoreEndTag:
self.parser.phase.processEndTag(name)
def endTagTableRowGroup(self, name):
@ -1628,7 +1674,7 @@ class InSelectPhase(Phase):
u"select phase. Ignored."))
def endTagSelect(self, name):
if self.tree.elementInScope(name, True):
if self.tree.elementInScope("select", True):
node = self.tree.openElements.pop()
while node.name != "select":
node = self.tree.openElements.pop()
@ -1641,7 +1687,7 @@ class InSelectPhase(Phase):
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
") in the select phase."))
if self.tree.elementInScope(name, True):
self.endTagSelect()
self.endTagSelect("select")
self.parser.phase.processEndTag(name)
def endTagOther(self, name):
@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase):
u"in the frameset phase (innerHTML)."))
else:
self.tree.openElements.pop()
if not self.parser.innerHTML and\
self.tree.openElements[-1].name != "frameset":
if (not self.parser.innerHTML and
self.tree.openElements[-1].name != "frameset"):
# If we're not in innerHTML mode and the current node is not a
# "frameset" element (anymore) then switch.
self.parser.phase = self.parser.phases["afterFrameset"]

View File

@ -14,7 +14,7 @@ class HTMLInputStream(object):
"""
def __init__(self, source, encoding=None, chardet=True):
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -26,6 +26,8 @@ class HTMLInputStream(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# List of where new lines occur
@ -41,12 +43,9 @@ class HTMLInputStream(object):
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
#Autodetect encoding if no other information can be found?
self.chardet = chardet
#Detect encoding iff no explicit "transport level" encoding is supplied
if encoding is None or not isValidEncoding(encoding):
encoding = self.detectEncoding()
encoding = self.detectEncoding(parseMeta, chardet)
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
@ -79,17 +78,17 @@ class HTMLInputStream(object):
stream = cStringIO.StringIO(str(source))
return stream
def detectEncoding(self):
def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None:
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
#Guess with chardet, if available
if encoding is None and self.chardet:
if encoding is None and chardet:
try:
import chardet
buffer = self.rawStream.read()

View File

@ -32,8 +32,8 @@ class HTMLTokenizer(object):
# XXX need to fix documentation
def __init__(self, stream, encoding=None):
self.stream = HTMLInputStream(stream, encoding)
def __init__(self, stream, encoding=None, parseMeta=True):
self.stream = HTMLInputStream(stream, encoding, parseMeta)
self.states = {
"data":self.dataState,
@ -338,31 +338,33 @@ class HTMLTokenizer(object):
self.state = self.states["closeTagOpen"]
else:
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data)
self.stream.queue.insert(0, data)
self.state = self.states["data"]
return True
def closeTagOpenState(self):
if self.contentModelFlag in (contentModelFlags["RCDATA"],\
contentModelFlags["CDATA"]):
charStack = []
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
contentModelFlags["CDATA"])):
if self.currentToken:
charStack = []
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken. We also need
# to have the character directly after the characters that could
# match the start tag name.
for x in xrange(len(self.currentToken["name"]) + 1):
charStack.append(self.stream.char())
# Make sure we don't get hit by EOF
if charStack[-1] == EOF:
break
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken. We also need
# to have the character directly after the characters that could
# match the start tag name.
for x in xrange(len(self.currentToken["name"]) + 1):
charStack.append(self.stream.char())
# Make sure we don't get hit by EOF
if charStack[-1] == EOF:
break
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
if self.currentToken \
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
and charStack[-1] in (spaceCharacters |
frozenset((u">", u"/", u"<", EOF))):
# Because the characters are correct we can safely switch to

View File

@ -108,6 +108,9 @@ class TreeBuilder(object):
#The class to use for creating doctypes
doctypeClass = None
#Fragment class
fragmentClass = None
def __init__(self):
self.reset()
@ -294,7 +297,6 @@ class TreeBuilder(object):
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
assert self.innerHTML
fosterParent = self.openElements[0]
return fosterParent, insertBefore
@ -310,6 +312,13 @@ class TreeBuilder(object):
def getDocument(self):
"Return the final tree"
return self.document
def getFragment(self):
"Return the final fragment"
#assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests

View File

@ -1,6 +1,8 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
from xml.sax.saxutils import escape
from constants import voidElements
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder):
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder):
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
@ -118,7 +126,9 @@ def testSerializer(element):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
@ -135,6 +145,32 @@ def testSerializer(element):
return "\n".join(rv)
class HTMLSerializer(object):
def serialize(self, node):
rv = self.serializeNode(node)
for child in node.childNodes:
rv += self.serialize(child)
if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
rv += "</%s>\n"%node.nodeName
return rv
def serializeNode(self, node):
if node.nodeType == Node.TEXT_NODE:
rv = node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
rv = "<%s"%node.nodeName
if node.hasAttributes():
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
node.attributes.items()])
rv += ">"
elif node.nodeType == Node.COMMENT_NODE:
rv = "<!-- %s -->" % escape(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
rv = "<!DOCTYPE %s>" % node.name
else:
rv = ""
return rv
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE

View File

@ -129,6 +129,10 @@ class Document(Element):
def __init__(self):
Element.__init__(self, Document)
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, DocumentFragment)
def testSerializer(element):
rv = []
finalText = None
@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder):
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element

View File

@ -4,6 +4,7 @@ from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
type = -1
def __init__(self, name):
self.name = name
self.parent = None
@ -11,15 +12,18 @@ class Node(_base.Node):
self.childNodes = []
self._flags = []
def __iter__(self):
for node in self.childNodes:
yield node
for item in node:
yield item
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
@ -69,6 +73,7 @@ class Node(_base.Node):
return bool(self.childNodes)
class Document(Node):
type = 1
def __init__(self):
Node.__init__(self, None)
@ -93,7 +98,13 @@ class Document(Node):
tree += child.printTree(2)
return tree
class DocumentFragment(Document):
type = 2
def __unicode__(self):
return "#document-fragment"
class DocumentType(Node):
type = 3
def __init__(self, name):
Node.__init__(self, name)
@ -106,6 +117,7 @@ class DocumentType(Node):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
class TextNode(Node):
type = 4
def __init__(self, value):
Node.__init__(self, None)
self.value = value
@ -119,6 +131,7 @@ class TextNode(Node):
hilite = toxml
class Element(Node):
type = 5
def __init__(self, name):
Node.__init__(self, name)
self.attributes = {}
@ -164,6 +177,7 @@ class Element(Node):
return tree
class CommentNode(Node):
type = 6
def __init__(self, data):
Node.__init__(self, None)
self.data = data
@ -177,11 +191,38 @@ class CommentNode(Node):
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
class HTMLSerializer(object):
def serialize(self, node):
rv = self.serializeNode(node)
for child in node.childNodes:
rv += self.serialize(child)
if node.type == Element.type and node.name not in voidElements:
rv += "</%s>\n"%node.name
return rv
def serializeNode(self, node):
if node.type == TextNode.type:
rv = node.value
elif node.type == Element.type:
rv = "<%s"%node.name
if node.attributes:
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
node.attributes.iteritems()])
rv += ">"
elif node.type == CommentNode.type:
rv = "<!-- %s -->" % escape(node.data)
elif node.type == DocumentType.type:
rv = "<!DOCTYPE %s>" % node.name
else:
rv = ""
return rv
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = CommentNode
fragmentClass = DocumentFragment
def testSerializer(self, node):
return node.printTree()

View File

@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'):
# Execute the shell module
options = planet.config.template_options(template_file)
if module_name == 'plugin': options['__file__'] = template_file
options.update(extra_options)
log.debug("Processing %s %s using %s", mode,
os.path.realpath(template_resolved), module_name)

64
planet/shell/plugin.py Normal file
View File

@ -0,0 +1,64 @@
import os, sys, imp
import planet
from StringIO import StringIO
def run(script, doc, output_file=None, options={}):
""" process an Python script using imp """
save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv)
plugin_stdout = StringIO()
plugin_stderr = StringIO()
try:
# redirect stdin
sys.stdin = StringIO(doc)
# redirect stdout
if output_file:
sys.stdout = open(output_file, 'w')
else:
sys.stdout = plugin_stdout
# redirect stderr
sys.stderr = plugin_stderr
# determine __file__ value
if options.has_key("__file__"):
plugin_file = options["__file__"]
del options["__file__"]
else:
plugin_file = script
# set sys.argv
options = sum([['--'+key, value] for key,value in options.items()], [])
sys.argv = [plugin_file] + options
# import script
handle = open(script, 'r')
cwd = os.getcwd()
try:
try:
try:
description=('.plugin', 'rb', imp.PY_SOURCE)
imp.load_module('__main__',handle,plugin_file,description)
except SystemExit,e:
if e.code: planet.logger.error('%s exit rc=%d', plugin_file, e.code)
except Exception, e:
import traceback
type, value, tb = sys.exc_info()
plugin_stderr.write(''.join(
traceback.format_exception_only(type,value) +
traceback.format_tb(tb)))
finally:
handle.close()
if cwd != os.getcwd(): os.chdir(cwd)
finally:
# restore system state
sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
# log anything sent to stderr
if plugin_stderr.getvalue():
import planet
planet.logger.error(plugin_stderr.getvalue())
# return stdout
return plugin_stdout.getvalue()
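Because run() flattens the filter's configuration section into '--key value' pairs on sys.argv before importing the script, a plugin could also read its parameters straight off the command line instead of going through config.filter_options(). A minimal, hypothetical sketch of that pattern (not code from this commit):

import sys
# e.g. sys.argv == ['mememe.plugin', '--sidebar', "//*[@id='footer']"]
args = sys.argv[1:]
params = dict(zip(args[0::2], args[1::2]))
sidebar = params.get('--sidebar', '//*[@class="sidebar"]')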

View File

@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log):
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
# log = planet.getLogger(config.log_level(),config.log_format())
log = planet.getLogger(config.log_level(),config.log_format())
global index

View File

@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'):
if sys.path[0]: os.chdir(sys.path[0])
sys.path[0] = os.getcwd()
# find all of the planet test modules
modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
# determine verbosity
verbosity = 1
for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
if arg in sys.argv:
verbosity = value
sys.argv.remove(arg)
# enable warnings
# find all of the planet test modules
modules = []
for pattern in sys.argv[1:] or ['test_*.py']:
modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))
# enable logging
import planet
planet.getLogger("WARNING",None)
if verbosity == 0: planet.getLogger("FATAL",None)
if verbosity == 1: planet.getLogger("WARNING",None)
if verbosity == 2: planet.getLogger("DEBUG",None)
# load all of the tests into a suite
try:
@ -33,11 +44,5 @@ except Exception, exception:
for module in modules: __import__(module)
raise
verbosity = 1
if "-q" in sys.argv or '--quiet' in sys.argv:
verbosity = 0
if "-v" in sys.argv or '--verbose' in sys.argv:
verbosity = 2
# run test suite
unittest.TextTestRunner(verbosity=verbosity).run(suite)

View File

@ -0,0 +1,29 @@
[Planet]
output_theme = classic_fancy
output_dir = tests/work/apply
name = test planet
cache_directory = tests/work/spider/cache
bill_of_materials:
images/#{face}
[index.html.tmpl]
filters:
html2xhtml.plugin
mememe.plugin
[mememe.plugin]
sidebar = //*[@class='sidebar']
[tests/data/spider/testfeed0.atom]
name = not found
[tests/data/spider/testfeed1b.atom]
name = one
face = jdub.png
[tests/data/spider/testfeed2.atom]
name = two
[tests/data/spider/testfeed3.rss]
name = three

View File

@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase):
os.makedirs(workdir)
def tearDown(self):
shutil.rmtree(workdir)
os.removedirs(os.path.split(workdir)[0])
shutil.rmtree(os.path.split(workdir)[0])
def test_apply_asf(self):
config.load(configfile % 'asf')
@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase):
output = open(os.path.join(workdir, 'index.html4')).read()
self.assertTrue(output.find('/>')<0)
def test_apply_filter_mememe(self):
config.load(configfile % 'mememe')
self.apply_fancy()
output = open(os.path.join(workdir, 'index.html')).read()
self.assertTrue(output.find('<div class="sidebar"><h2>Memes <a href="memes.atom">')>=0)
def apply_fancy(self):
# drop slow templates unrelated to test at hand
templates = config.parser.get('Planet','template_files').split()
templates.remove('rss10.xml.tmpl')
templates.remove('rss20.xml.tmpl')
config.parser.set('Planet','template_files',' '.join(templates))
splice.apply(self.feeddata)
# verify that selected files are there