MeMeme and html2xhtml plugins
commit a5e1fde287 (parent ddf15fc689)
@ -1,3 +1,4 @@
*.tmplc
.DS_Store
cache
*.pluginc
@ -8,12 +8,13 @@
<title>Venus Filters</title>
</head>
<body>
<h2>Filters</h2>
<p>Filters are simple Unix pipes. Input comes in <code>stdin</code>,
parameters come from the config file, and output goes to <code>stdout</code>.
Anything written to <code>stderr</code> is logged as an ERROR message. If no
<code>stdout</code> is produced, the entry is not written to the cache or
processed further; in fact, if the entry had previously been written to the cache, it will be removed.</p>
<h2>Filters and Plugins</h2>
<p>Filters and plugins are simple Unix pipes. Input comes in
<code>stdin</code>, parameters come from the config file, and output goes to
<code>stdout</code>. Anything written to <code>stderr</code> is logged as an
ERROR message. If no <code>stdout</code> is produced, the entry is not written
to the cache or processed further; in fact, if the entry had previously been
written to the cache, it will be removed.</p>

<p>There are two types of filters supported by Venus: input and template.</p>
<p>Input to an input filter is an aggressively
@ -89,6 +90,16 @@ an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to
everything.</li>

<li>Plugins differ from filters in that while filters are forked, plugins are
<a href="http://docs.python.org/lib/module-imp.html">imported</a>. This
means that plugins are limited to Python and are run in-process. Plugins
therefore have direct access to planet internals like configuration and
logging facilities, as well as access to the bundled libraries like the
<a href="http://feedparser.org/docs/">Universal Feed Parser</a> and
<a href="http://code.google.com/p/html5lib/">html5lib</a>; but it also
means that calls like <code>os.abort()</code> can't be recovered
from.</li>
</ul>
</body>
</html>
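To make the forked-versus-imported distinction above concrete, here is a minimal
hypothetical plugin (illustrative only, not part of this commit). Like a filter it
reads the entry from stdin and writes its result to stdout, but because it is
imported it can reach planet's configuration and logging directly, in the same way
mememe.plugin below does:

    # passthru.plugin -- hypothetical example, not shipped with Venus
    import sys
    import planet
    from planet import config

    log = planet.getLogger(config.log_level(), config.log_format())
    options = config.filter_options(sys.argv[0])

    data = sys.stdin.read()
    log.debug("plugin received %d bytes; options: %s", len(data), options)
    sys.stdout.write(data)   # pass the entry through unchanged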
@ -21,7 +21,7 @@
<ul>
<li><a href="venus.svg">Architecture</a></li>
<li><a href="normalization.html">Normalization</a></li>
<li><a href="filters.html">Filters</a></li>
<li><a href="filters.html">Filters and Plugins</a></li>
</ul>
</li>
<li>Other
@ -36,6 +36,13 @@ filters = excerpt.py
omit = img p br
width = 500

# add memes to output
[index.html.tmpl]
filters = mememe.plugin

[mememe.plugin]
sidebar = //*[@id="footer"]

# subscription list
[http://share.opml.org/opml/top100.opml]
content_type = opml
filters/html2xhtml.plugin (new file, 6 lines)
@ -0,0 +1,6 @@
import sys
from planet import html5lib
tree = html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)
sys.stdout.write(document.toxml("utf-8"))
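Because this plugin is imported and run in-process by Venus, a quick way to
sanity-check the same html5lib round-trip by hand is to parse a string instead of
stdin (a sketch; assumes it is run from the root of a Venus checkout so that the
bundled planet.html5lib is importable):

    # rough standalone check of the html-to-xhtml round-trip (illustrative only)
    from planet import html5lib

    tree = html5lib.treebuilders.dom.TreeBuilder
    parser = html5lib.html5parser.HTMLParser(tree=tree)
    document = parser.parse("<p>unclosed paragraph<br>")
    print document.toxml("utf-8")   # well-formed XHTML with the tags closed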
filters/mememe.plugin (new file, 475 lines)
@ -0,0 +1,475 @@
|
||||
#
|
||||
# This Venus output filter will annotate an XHTML page with a list of
|
||||
# "memes" (or most popular linked destinations, based on the last week
|
||||
# of entries from the cache) and will update the subscription list with
|
||||
# links to recent entries from each subscription.
|
||||
#
|
||||
# Templates that don't produce XHTML natively will need their output passed
|
||||
# through html2xhtml.plugin first.
|
||||
#
|
||||
# Typical configuration (based on classic_fancy):
|
||||
#
|
||||
# [index.html.tmpl]
|
||||
# filters:
|
||||
# html2xhtml.plugin
|
||||
# mememe.plugin
|
||||
#
|
||||
# [mememe.plugin]
|
||||
# sidebar = //*[@class='sidebar']
|
||||
#
|
||||
|
||||
import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
|
||||
from xml.sax.saxutils import escape
|
||||
from htmlentitydefs import entitydefs
|
||||
|
||||
import planet
|
||||
from planet import config, feedparser
|
||||
from planet.spider import filename
|
||||
log = planet.getLogger(config.log_level(),config.log_format())
|
||||
options = config.filter_options(sys.argv[0])
|
||||
|
||||
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
|
||||
|
||||
now = time.time()
|
||||
week = 7 * 86400
|
||||
week_ago = now - week
|
||||
|
||||
cache = config.cache_directory()
|
||||
meme_cache = os.path.join(cache, 'memes')
|
||||
if not os.path.exists(meme_cache): os.makedirs(meme_cache)
|
||||
|
||||
all_links = {}
|
||||
feed_links = {}
|
||||
|
||||
def check_cache(url):
|
||||
try:
|
||||
file = open(filename(meme_cache, url))
|
||||
headers = eval(file.read())
|
||||
file.close()
|
||||
return headers or {}
|
||||
except:
|
||||
return {}
|
||||
|
||||
def cache_meme(url, headers):
|
||||
json = []
|
||||
for key,value in headers.items():
|
||||
json.append(' %s: %s' % (toj(key), toj(value)))
|
||||
file = open(filename(meme_cache, url),'w')
|
||||
file.write('{\n' + ',\n'.join(json) + '\n}\n')
|
||||
file.close()
|
||||
|
||||
urlmap = {}
|
||||
def canonicalize(url):
|
||||
url = urlmap.get(url,url)
|
||||
parts = list(urlparse.urlparse(url))
|
||||
|
||||
parts[0] = parts[0].lower()
|
||||
parts[1] = parts[1].lower()
|
||||
if parts[1].startswith('www.'): parts[1]=parts[1][4:]
|
||||
if not parts[2]: parts[2] = '/'
|
||||
parts[-1] = ''
|
||||
return urlparse.urlunparse(parts)
|
||||
|
||||
log.debug("Loading cached data")
|
||||
for name in glob.glob(os.path.join(cache, '*')):
|
||||
# ensure that this is within the past week
|
||||
if os.path.isdir(name): continue
|
||||
mtime = os.stat(name).st_mtime
|
||||
if mtime < week_ago: continue
|
||||
|
||||
# parse the file
|
||||
try:
|
||||
doc = libxml2.parseFile(name)
|
||||
except:
|
||||
continue
|
||||
xp = doc.xpathNewContext()
|
||||
xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
|
||||
xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
|
||||
|
||||
# determine the entry
|
||||
entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
|
||||
if not entry: continue
|
||||
entry = canonicalize(entry[0].prop("href"))
|
||||
|
||||
# determine the title
|
||||
title = xp.xpathEval("/atom:entry/atom:title")
|
||||
if title:
|
||||
if title[0].prop('type') == 'html':
|
||||
title = re.sub('<.*?>','',title[0].content)
|
||||
else:
|
||||
title = title[0].content
|
||||
title = str(title or '')
|
||||
|
||||
# determine the feed id
|
||||
feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
|
||||
if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
|
||||
if not feed: continue
|
||||
feed = feed[0].content
|
||||
|
||||
# determine the author
|
||||
author = xp.xpathEval("/atom:entry/atom:source/planet:name")
|
||||
if author:
|
||||
author = author[0].content
|
||||
else:
|
||||
author = ''
|
||||
|
||||
# track the feed_links
|
||||
if author:
|
||||
if not feed_links.has_key(author): feed_links[author] = list()
|
||||
feed_links[author].append([mtime, entry, title])
|
||||
|
||||
# identify the unique links
|
||||
entry_links = []
|
||||
for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
|
||||
parent = node.parent
|
||||
while parent:
|
||||
if parent.name == 'source': break
|
||||
parent = parent.parent
|
||||
else:
|
||||
link = canonicalize(node.prop('href'))
|
||||
if not link in entry_links:
|
||||
entry_links.append(link)
|
||||
if node.hasProp('title') and node.prop('title').startswith('http'):
|
||||
link = canonicalize(node.prop('title'))
|
||||
if not link in entry_links:
|
||||
entry_links.append(link)
|
||||
|
||||
# add the votes
|
||||
weight = 1.0 - (now - mtime)**2 / week**2
|
||||
vote = [(weight, str(entry), str(feed), title, author, mtime)]
|
||||
for link in entry_links:
|
||||
all_links[link] = all_links.get(link,list()) + vote
|
||||
|
||||
# free the entry
|
||||
doc.freeDoc()
|
||||
|
||||
# tally the votes
|
||||
weighted_links = []
|
||||
for link, votes in all_links.items():
|
||||
site = {}
|
||||
updated = 0
|
||||
for weight, entry, feed, title, author, mtime in votes:
|
||||
site[feed] = max(site.get(feed,0), weight)
|
||||
if mtime > updated: updated=mtime
|
||||
weighted_links.append((sum(site.values()), link, updated))
|
||||
weighted_links.sort()
|
||||
weighted_links.reverse()
|
||||
|
||||
cp1252 = {
|
||||
128: 8364, # euro sign
|
||||
130: 8218, # single low-9 quotation mark
|
||||
131: 402, # latin small letter f with hook
|
||||
132: 8222, # double low-9 quotation mark
|
||||
133: 8230, # horizontal ellipsis
|
||||
134: 8224, # dagger
|
||||
135: 8225, # double dagger
|
||||
136: 710, # modifier letter circumflex accent
|
||||
137: 8240, # per mille sign
|
||||
138: 352, # latin capital letter s with caron
|
||||
139: 8249, # single left-pointing angle quotation mark
|
||||
140: 338, # latin capital ligature oe
|
||||
142: 381, # latin capital letter z with caron
|
||||
145: 8216, # left single quotation mark
|
||||
146: 8217, # right single quotation mark
|
||||
147: 8220, # left double quotation mark
|
||||
148: 8221, # right double quotation mark
|
||||
149: 8226, # bullet
|
||||
150: 8211, # en dash
|
||||
151: 8212, # em dash
|
||||
152: 732, # small tilde
|
||||
153: 8482, # trade mark sign
|
||||
154: 353, # latin small letter s with caron
|
||||
155: 8250, # single right-pointing angle quotation mark
|
||||
156: 339, # latin small ligature oe
|
||||
158: 382, # latin small letter z with caron
|
||||
159: 376} # latin capital letter y with diaeresis
|
||||
|
||||
# determine the title for a given url
|
||||
class html(sgmllib.SGMLParser):
|
||||
def __init__(self, url):
|
||||
sgmllib.SGMLParser.__init__(self)
|
||||
self.title = ""
|
||||
self.feedurl = ""
|
||||
self.intitle = False
|
||||
|
||||
headers = check_cache(url)
|
||||
|
||||
try:
|
||||
# fetch the page
|
||||
request = urllib2.Request(url)
|
||||
request.add_header('User-Agent', 'Venus/MeMeme')
|
||||
if headers.has_key('etag'):
|
||||
request.add_header('If-None-Match', headers['etag'])
|
||||
if headers.has_key('last-modified'):
|
||||
request.add_header('If-Modified-Since', headers['last-modified'])
|
||||
response = urllib2.urlopen(request)
|
||||
self.feed(response.read())
|
||||
|
||||
# ensure the data is in utf-8
|
||||
try:
|
||||
self.title = self.title.decode('utf-8')
|
||||
except:
|
||||
self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
|
||||
for c in self.title.decode('iso-8859-1')])
|
||||
|
||||
# cache the results
|
||||
headers = {}
|
||||
if self.feedurl: headers['feedurl'] = self.feedurl
|
||||
if self.title: headers['title'] = self.title
|
||||
headers.update(response.headers)
|
||||
cache_meme(url, headers)
|
||||
except:
|
||||
self.feedurl = headers.get('feedurl')
|
||||
if headers.has_key('title'):
|
||||
if isinstance(headers['title'],str):
|
||||
self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
|
||||
else:
|
||||
self.title=headers['title']
|
||||
|
||||
# if there is a feed, look for an entry that matches, and take that title
|
||||
if self.feedurl and not self.title:
|
||||
headers = check_cache(self.feedurl)
|
||||
data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
|
||||
modified=headers.get('last-modified'))
|
||||
|
||||
if data.has_key('headers') and data.has_key('status') and \
|
||||
data.status in [200, 301, 302]:
|
||||
|
||||
titles = {}
|
||||
for entry in data.entries:
|
||||
if entry.has_key('title_detail') and entry.has_key('link'):
|
||||
titles[entry.link] = entry.title_detail.value
|
||||
if entry.title_detail.type == 'text/plain':
|
||||
titles[entry.link] = escape(titles[entry.link])
|
||||
|
||||
if titles.has_key(url): self.title = titles[url]
|
||||
|
||||
data.headers.update(titles)
|
||||
cache_meme(self.feedurl, data.headers)
|
||||
else:
|
||||
if headers.has_key(url):
|
||||
if isinstance(headers[url],str):
|
||||
self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
|
||||
else:
|
||||
self.title=headers[url]
|
||||
|
||||
# fallback is the basename of the URI
|
||||
if not self.title:
|
||||
self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
|
||||
|
||||
# parse out the first autodiscovery link
|
||||
def start_link(self, attrs):
|
||||
if self.feedurl: return
|
||||
attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
|
||||
if not 'rel' in attrs: return
|
||||
rels = attrs['rel'].split(' ')
|
||||
if 'alternate' not in rels: return
|
||||
if not 'type' in attrs or not attrs['type'].endswith('xml'): return
|
||||
if 'href' in attrs:
|
||||
self.feedurl = attrs['href']
|
||||
|
||||
# parse the page title
|
||||
def start_title(self, attributes):
|
||||
if not self.title: self.intitle = True
|
||||
def end_title(self):
|
||||
self.intitle = False
|
||||
def handle_data(self, text):
|
||||
if self.intitle: self.title += escape(text)
|
||||
|
||||
# convert unicode string to a json string
|
||||
def toj(value):
|
||||
result = repr(value).replace(r'\x',r'\u00')
|
||||
if result[:1] == 'u': result=result[1:]
|
||||
if result.startswith("'"):
|
||||
result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
|
||||
return result
|
||||
|
||||
seenit = []
|
||||
count = 0
|
||||
|
||||
# construct an empty feed
|
||||
feed_doc = libxml2.newDoc("1.0")
|
||||
meme_feed = feed_doc.newChild(None, "feed", None)
|
||||
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
|
||||
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
|
||||
author = meme_feed.newChild(None, 'author', None)
|
||||
author.newTextChild(None, 'name', config.owner_name())
|
||||
if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
|
||||
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
|
||||
link = meme_feed.newChild(None, 'link', None)
|
||||
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
|
||||
link.setProp('rel', 'self')
|
||||
meme_feed.newTextChild(None, 'updated',
|
||||
time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
|
||||
|
||||
# parse the input
|
||||
log.debug("Parse input")
|
||||
doc=libxml2.parseDoc(sys.stdin.read())
|
||||
|
||||
# find the sidebar/footer
|
||||
sidebar = options.get('sidebar','//*[@class="sidebar"]')
|
||||
footer = doc.xpathEval(sidebar)
|
||||
if not hasattr(footer,'__len__') or len(footer) == 0:
|
||||
raise Exception(sidebar + ' not found')
|
||||
if len(footer) > 1:
|
||||
log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar))
|
||||
footer = footer[0]
|
||||
|
||||
# add up to 10 entry links to each subscription
|
||||
subs_ul = footer.children
|
||||
while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
|
||||
child = subs_ul.children
|
||||
while child:
|
||||
if child.name == 'li':
|
||||
if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
|
||||
link = child.lastChild()
|
||||
while link.isText(): link=link.prev
|
||||
author = link.getContent()
|
||||
state = 'inactive'
|
||||
if feed_links.has_key(author):
|
||||
ul2 = child.newChild(None, 'ul', None)
|
||||
feed_links[author].sort()
|
||||
feed_links[author].reverse()
|
||||
link_count = 0
|
||||
for mtime, entry, title in feed_links[author]:
|
||||
if not title: continue
|
||||
li2 = ul2.newChild(None, 'li', None)
|
||||
a = li2.newTextChild(None, 'a', title)
|
||||
a.setProp('href', entry)
|
||||
link_count = link_count + 1
|
||||
if link_count >= 10: break
|
||||
if link_count > 0: state = None
|
||||
if state:
|
||||
link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
|
||||
child=child.next
|
||||
|
||||
# create a h2 and ul for the memes list
|
||||
footer_top = footer.children
|
||||
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
|
||||
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
|
||||
|
||||
# create a header for the memes list
|
||||
a = memes.newChild(None, 'a', None)
|
||||
a.setProp('href', 'memes.atom')
|
||||
img = a.newChild(None, 'img', None)
|
||||
img.setProp('src', 'images/feed-icon-10x10.png')
|
||||
|
||||
# collect the results
|
||||
log.debug("Fetch titles and collect the results")
|
||||
from urllib import quote_plus
|
||||
for i in range(0,len(weighted_links)):
|
||||
weight, link, updated = weighted_links[i]
|
||||
|
||||
# ensure that somebody new points to this entry. This guards against
|
||||
# groups of related links that several posts all point to.
|
||||
novel = False
|
||||
for weight, entry, feed, title, author, mtime in all_links[link]:
|
||||
if entry not in seenit:
|
||||
seenit.append(entry)
|
||||
novel = True
|
||||
if not novel: continue
|
||||
|
||||
all_links[link].sort()
|
||||
all_links[link].reverse()
|
||||
cache_file = filename(cache, link)
|
||||
title = None
|
||||
|
||||
# when possible, take the title from the cache
|
||||
if os.path.exists(cache_file):
|
||||
entry = feedparser.parse(cache_file).entries[0]
|
||||
if entry.has_key('title_detail'):
|
||||
title = entry.title_detail.value
|
||||
if entry.title_detail.type == 'text/plain': title = escape(title)
|
||||
|
||||
# otherwise, parse the html
|
||||
if not title:
|
||||
title = html(link).title
|
||||
|
||||
# dehtmlize
|
||||
title = re.sub('&(\w+);',
|
||||
lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
|
||||
title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
|
||||
title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
|
||||
|
||||
# title too long? Insert zero width spaces where appropriate
|
||||
if max(map(len,title.split())) > 30:
|
||||
title=re.sub('(\W+)',u'\\1\u200b',title)
|
||||
|
||||
# save the entry title (it is used later)
|
||||
entry_title = title.strip()
|
||||
|
||||
# add to the memes list
|
||||
memes_ul.addContent('\n')
|
||||
li = memes_ul.newChild(None, 'li', None)
|
||||
memes_ul.addContent('\n')
|
||||
|
||||
# technorati link
|
||||
a = li.newChild(None, 'a', None)
|
||||
tlink = 'http://technorati.com/cosmos/search.html?url='
|
||||
if link.startswith('http://'):
|
||||
a.setProp('href',tlink + quote_plus(link[7:]))
|
||||
else:
|
||||
a.setProp('href',tlink + quote_plus(link))
|
||||
a.setProp('title','cosmos')
|
||||
img = a.newChild(None, 'img', None)
|
||||
img.setProp('src','tcosm11.gif')
|
||||
|
||||
# main link
|
||||
a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
|
||||
a.setProp('href',link)
|
||||
if (((i==0) or (updated>=weighted_links[i-1][2])) and
|
||||
(i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
|
||||
rank = 0
|
||||
for j in range(0,len(weighted_links)):
|
||||
if updated < weighted_links[j][2]: rank = rank + 1
|
||||
if rank < len(weighted_links)/2:
|
||||
a.setProp('class','rising')
|
||||
|
||||
# voters
|
||||
ul2 = li.newChild(None, 'ul', None)
|
||||
voters = []
|
||||
for weight, entry, feed, title, author, mtime in all_links[link]:
|
||||
if entry in voters: continue
|
||||
li2 = ul2.newChild(None, 'li', None)
|
||||
a = li2.newTextChild(None, 'a' , author)
|
||||
a.setProp('href',entry)
|
||||
if title: a.setProp('title',title)
|
||||
voters.append(entry)
|
||||
|
||||
# add to the meme feed
|
||||
if len(all_links[link]) > 2:
|
||||
meme_feed.addContent('\n')
|
||||
entry = meme_feed.newChild(None, 'entry', None)
|
||||
meme_feed.addContent('\n')
|
||||
|
||||
# entry
|
||||
tagbase = config.link().split('/')
|
||||
if not tagbase[-1]: tagbase = tagbase[:-1]
|
||||
tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
|
||||
entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
|
||||
entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
|
||||
meme_link = entry.newTextChild(None, 'link', None)
|
||||
meme_link.setProp('href', link)
|
||||
entry.newTextChild(None, 'updated',
|
||||
time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))
|
||||
|
||||
# voters
|
||||
content = entry.newChild(None, 'content', None)
|
||||
content.setProp('type', 'xhtml')
|
||||
div = content.newTextChild(None, 'div', 'Spotted by:')
|
||||
div.newNs('http://www.w3.org/1999/xhtml', None)
|
||||
content_ul = div.newChild(None, 'ul', None)
|
||||
for weight, entry, feed, title, author, mtime in all_links[link]:
|
||||
li2 = content_ul.newTextChild(None, 'li', author + ": ")
|
||||
a = li2.newTextChild(None, 'a' , title or 'untitled')
|
||||
a.setProp('href',entry)
|
||||
|
||||
count = count + 1
|
||||
if count >= 10: break
|
||||
|
||||
log.info("Writing " + MEMES_ATOM)
|
||||
output=open(MEMES_ATOM,'w')
|
||||
output.write(feed_doc.serialize('utf-8'))
|
||||
output.close()
|
||||
|
||||
sys.stdout.write(doc.serialize('utf-8'))
|
@ -352,14 +352,15 @@ def filters(section=None):
|
||||
filters = []
|
||||
if parser.has_option('Planet', 'filters'):
|
||||
filters += parser.get('Planet', 'filters').split()
|
||||
if section and parser.has_option(section, 'filters'):
|
||||
filters += parser.get(section, 'filters').split()
|
||||
if filter(section):
|
||||
filters.append('regexp_sifter.py?require=' +
|
||||
urllib.quote(filter(section)))
|
||||
if exclude(section):
|
||||
filters.append('regexp_sifter.py?exclude=' +
|
||||
urllib.quote(exclude(section)))
|
||||
for section in section and [section] or template_files():
|
||||
if parser.has_option(section, 'filters'):
|
||||
filters += parser.get(section, 'filters').split()
|
||||
return filters
|
||||
|
||||
def planet_options():
|
||||
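With the revised filters(), template sections now contribute their own filters when
no section is given. A rough illustration against a hypothetical configuration
(names made up; not part of the codebase):

    # given:
    #   [Planet]
    #   filters = excerpt.py
    #   template_files = index.html.tmpl
    #   [index.html.tmpl]
    #   filters = mememe.plugin
    #
    # then, roughly:
    #   config.filters()                  -> ['excerpt.py', 'mememe.plugin']
    #   config.filters('index.html.tmpl') -> ['excerpt.py', 'mememe.plugin']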
@ -382,6 +383,10 @@ def template_options(section):
|
||||
""" dictionary of template specific options"""
|
||||
return feed_options(section)
|
||||
|
||||
def filter_options(section):
|
||||
""" dictionary of filter specific options"""
|
||||
return feed_options(section)
|
||||
|
||||
def write(file=sys.stdout):
|
||||
""" write out an updated template """
|
||||
print parser.write(file)
|
||||
|
@ -71,35 +71,40 @@ class HTMLParser(object):
|
||||
"trailingEnd": TrailingEndPhase(self, self.tree)
|
||||
}
|
||||
|
||||
def parse(self, stream, encoding=None, innerHTML=False):
|
||||
"""Parse a HTML document into a well-formed tree
|
||||
|
||||
stream - a filelike object or string containing the HTML to be parsed
|
||||
|
||||
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
|
||||
is not yet supported)
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
"""
|
||||
|
||||
def _parse(self, stream, innerHTML=False, container="div",
|
||||
encoding=None):
|
||||
|
||||
self.tree.reset()
|
||||
self.firstStartTag = False
|
||||
self.errors = []
|
||||
|
||||
self.phase = self.phases["initial"]
|
||||
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
|
||||
parseMeta=innerHTML)
|
||||
|
||||
if innerHTML:
|
||||
self.innerHTML = container.lower()
|
||||
|
||||
if self.innerHTML in ('title', 'textarea'):
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
|
||||
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
|
||||
elif self.innerHTML == 'plaintext':
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
|
||||
else:
|
||||
# contentModelFlag already is PCDATA
|
||||
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
|
||||
pass
|
||||
self.phase = self.phases["rootElement"]
|
||||
self.phase.insertHtmlElement()
|
||||
self.resetInsertionMode()
|
||||
else:
|
||||
self.innerHTML = False
|
||||
self.phase = self.phases["initial"]
|
||||
|
||||
# We only seem to have InBodyPhase testcases where the following is
|
||||
# relevant ... need others too
|
||||
self.lastPhase = None
|
||||
|
||||
# We don't actually support innerHTML yet but this should allow
|
||||
# assertions
|
||||
self.innerHTML = innerHTML
|
||||
|
||||
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
|
||||
|
||||
# XXX This is temporary for the moment so there aren't any other
|
||||
# changes needed for the parser to work with the iterable tokenizer
|
||||
for token in self.tokenizer:
|
||||
@ -118,7 +123,34 @@ class HTMLParser(object):
|
||||
# When the loop finishes it's EOF
|
||||
self.phase.processEOF()
|
||||
|
||||
def parse(self, stream, encoding=None):
|
||||
"""Parse a HTML document into a well-formed tree
|
||||
|
||||
stream - a filelike object or string containing the HTML to be parsed
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
"""
|
||||
self._parse(stream, innerHTML=False, encoding=encoding)
|
||||
return self.tree.getDocument()
|
||||
|
||||
def parseFragment(self, stream, container="div", encoding=None):
|
||||
"""Parse a HTML fragment into a well-formed tree fragment
|
||||
|
||||
container - name of the element whose innerHTML property is being set;
if set to None, defaults to 'div'
|
||||
|
||||
stream - a filelike object or string containing the HTML to be parsed
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
"""
|
||||
self._parse(stream, True, container=container, encoding=encoding)
|
||||
return self.tree.getFragment()
|
||||
|
||||
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||
# XXX The idea is to make data mandatory.
|
||||
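The parse/parseFragment split introduced above gives html5lib two public entry
points. A hedged usage sketch (string input and the dom tree builder assumed;
illustrative only, not part of the diff):

    from planet import html5lib

    builder = html5lib.treebuilders.dom.TreeBuilder
    parser = html5lib.html5parser.HTMLParser(tree=builder)

    doc = parser.parse("<p>whole document")          # returns a Document
    frag = parser.parseFragment("<li>one<li>two")    # returns a DocumentFragment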
@ -187,28 +219,29 @@ class HTMLParser(object):
|
||||
"frameset":"inFrameset"
|
||||
}
|
||||
for node in self.tree.openElements[::-1]:
|
||||
nodeName = node.name
|
||||
if node == self.tree.openElements[0]:
|
||||
last = True
|
||||
if node.name not in ['td', 'th']:
|
||||
if nodeName not in ['td', 'th']:
|
||||
# XXX
|
||||
assert self.innerHTML
|
||||
raise NotImplementedError
|
||||
nodeName = self.innerHTML
|
||||
# Check for conditions that should only happen in the innerHTML
|
||||
# case
|
||||
if node.name in ("select", "colgroup", "head", "frameset"):
|
||||
if nodeName in ("select", "colgroup", "head", "frameset"):
|
||||
# XXX
|
||||
assert self.innerHTML
|
||||
if node.name in newModes:
|
||||
self.phase = self.phases[newModes[node.name]]
|
||||
if nodeName in newModes:
|
||||
self.phase = self.phases[newModes[nodeName]]
|
||||
break
|
||||
elif node.name == "html":
|
||||
elif nodeName == "html":
|
||||
if self.tree.headPointer is None:
|
||||
self.phase = self.phases["beforeHead"]
|
||||
else:
|
||||
self.phase = self.phases["afterHead"]
|
||||
break
|
||||
elif last:
|
||||
self.phase = self.phases["body"]
|
||||
self.phase = self.phases["inBody"]
|
||||
break
|
||||
|
||||
class Phase(object):
|
||||
@ -434,9 +467,7 @@ class InHeadPhase(Phase):
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def startTagHead(self, name, attributes):
|
||||
self.tree.insertElement(name, attributes)
|
||||
self.tree.headPointer = self.tree.openElements[-1]
|
||||
self.parser.phase = self.parser.phases["inHead"]
|
||||
self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))
|
||||
|
||||
def startTagTitle(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
@ -455,10 +486,11 @@ class InHeadPhase(Phase):
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
|
||||
|
||||
def startTagScript(self, name, attributes):
|
||||
#XXX Inner HTML case may be wrong
|
||||
element = self.tree.createElement(name, attributes)
|
||||
element._flags.append("parser-inserted")
|
||||
if self.tree.headPointer is not None and\
|
||||
self.parser.phase == self.parser.phases["inHead"]:
|
||||
if (self.tree.headPointer is not None and
|
||||
self.parser.phase == self.parser.phases["inHead"]):
|
||||
self.appendToHead(element)
|
||||
else:
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
@ -653,8 +685,8 @@ class InBodyPhase(Phase):
|
||||
|
||||
def startTagBody(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (body)."))
|
||||
if len(self.tree.openElements) == 1 \
|
||||
or self.tree.openElements[1].name != "body":
|
||||
if (len(self.tree.openElements) == 1
|
||||
or self.tree.openElements[1].name != "body"):
|
||||
assert self.parser.innerHTML
|
||||
else:
|
||||
for attr, value in attributes.iteritems():
|
||||
@ -1179,6 +1211,7 @@ class InTablePhase(Phase):
|
||||
self.parser.resetInsertionMode()
|
||||
else:
|
||||
# innerHTML case
|
||||
assert self.parser.innerHTML
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase):
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def ignoreEndTagCaption(self):
|
||||
return not self.tree.elementInScope("caption", True)
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.phases["inBody"].processCharacters(data)
|
||||
|
||||
def startTagTableElement(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
#XXX Have to duplicate logic here to find out if the tag is ignored
|
||||
ignoreEndTag = self.ignoreEndTagCaption()
|
||||
self.parser.phase.processEndTag("caption")
|
||||
# XXX how do we know the tag is _always_ ignored in the innerHTML
|
||||
# case and therefore shouldn't be processed again? I'm not sure this
|
||||
# strategy makes sense...
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.phases["inBody"].processStartTag(name, attributes)
|
||||
|
||||
def endTagCaption(self, name):
|
||||
if self.tree.elementInScope(name, True):
|
||||
if not self.ignoreEndTagCaption():
|
||||
# AT this code is quite similar to endTagTable in "InTable"
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != "caption":
|
||||
@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inTable"]
|
||||
else:
|
||||
# innerHTML case
|
||||
assert self.parser.innerHTML
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagTable(self, name):
|
||||
self.parser.parseError()
|
||||
ignoreEndTag = self.ignoreEndTagCaption()
|
||||
self.parser.phase.processEndTag("caption")
|
||||
# XXX ...
|
||||
if not self.parser.innerHTML:
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase):
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def ignoreEndTagColgroup(self):
|
||||
return self.tree.openElements[-1].name == "html"
|
||||
|
||||
def processCharacters(self, data):
|
||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||
self.endTagColgroup("colgroup")
|
||||
# XXX
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def startTagCol(self, name ,attributes):
|
||||
@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase):
|
||||
self.tree.openElements.pop()
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||
self.endTagColgroup("colgroup")
|
||||
# XXX how can we be sure it's always ignored?
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def endTagColgroup(self, name):
|
||||
if self.tree.openElements[-1].name == "html":
|
||||
if self.ignoreEndTagColgroup():
|
||||
# innerHTML case
|
||||
assert self.parser.innerHTML
|
||||
self.parser.parseError()
|
||||
else:
|
||||
self.tree.openElements.pop()
|
||||
@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase):
|
||||
u"col has no end tag."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||
self.endTagColgroup("colgroup")
|
||||
# XXX how can we be sure it's always ignored?
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
|
||||
@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase):
|
||||
|
||||
def startTagTableOther(self, name, attributes):
|
||||
# XXX AT Any ideas on how to share this with endTagTable?
|
||||
if self.tree.elementInScope("tbody", True) or \
|
||||
self.tree.elementInScope("thead", True) or \
|
||||
self.tree.elementInScope("tfoot", True):
|
||||
if (self.tree.elementInScope("tbody", True) or
|
||||
self.tree.elementInScope("thead", True) or
|
||||
self.tree.elementInScope("tfoot", True)):
|
||||
self.clearStackToTableBodyContext()
|
||||
self.endTagTableRowGroup(self.tree.openElements[-1].name)
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase):
|
||||
") in the table body phase. Ignored."))
|
||||
|
||||
def endTagTable(self, name):
|
||||
if self.tree.elementInScope("tbody", True) or \
|
||||
self.tree.elementInScope("thead", True) or \
|
||||
self.tree.elementInScope("tfoot", True):
|
||||
if (self.tree.elementInScope("tbody", True) or
|
||||
self.tree.elementInScope("thead", True) or
|
||||
self.tree.elementInScope("tfoot", True)):
|
||||
self.clearStackToTableBodyContext()
|
||||
self.endTagTableRowGroup(self.tree.openElements[-1].name)
|
||||
self.parser.phase.processEndTag(name)
|
||||
@ -1428,6 +1468,9 @@ class InRowPhase(Phase):
|
||||
self.tree.openElements[-1].name + u") in the row phase."))
|
||||
self.tree.openElements.pop()
|
||||
|
||||
def ignoreEndTagTr(self):
|
||||
return not self.tree.elementInScope("tr", tableVariant=True)
|
||||
|
||||
# the rest
|
||||
def processCharacters(self, data):
|
||||
self.parser.phases["inTable"].processCharacters(data)
|
||||
@ -1439,28 +1482,31 @@ class InRowPhase(Phase):
|
||||
self.tree.activeFormattingElements.append(Marker)
|
||||
|
||||
def startTagTableOther(self, name, attributes):
|
||||
ignoreEndTag = self.ignoreEndTagTr()
|
||||
self.endTagTr("tr")
|
||||
# XXX how are we sure it's always ignored in the innerHTML case?
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.phases["inTable"].processStartTag(name, attributes)
|
||||
|
||||
def endTagTr(self, name):
|
||||
if self.tree.elementInScope("tr", True):
|
||||
if not self.ignoreEndTagTr():
|
||||
self.clearStackToTableRowContext()
|
||||
self.tree.openElements.pop()
|
||||
self.parser.phase = self.parser.phases["inTableBody"]
|
||||
else:
|
||||
# innerHTML case
|
||||
assert self.parser.innerHTML
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagTable(self, name):
|
||||
ignoreEndTag = self.ignoreEndTagTr()
|
||||
self.endTagTr("tr")
|
||||
# Reprocess the current tag if the tr end tag was not ignored
|
||||
# XXX how are we sure it's always ignored in the innerHTML case?
|
||||
if not self.parser.innerHTML:
|
||||
if not ignoreEndTag:
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagTableRowGroup(self, name):
|
||||
@ -1628,7 +1674,7 @@ class InSelectPhase(Phase):
|
||||
u"select phase. Ignored."))
|
||||
|
||||
def endTagSelect(self, name):
|
||||
if self.tree.elementInScope(name, True):
|
||||
if self.tree.elementInScope("select", True):
|
||||
node = self.tree.openElements.pop()
|
||||
while node.name != "select":
|
||||
node = self.tree.openElements.pop()
|
||||
@ -1641,7 +1687,7 @@ class InSelectPhase(Phase):
|
||||
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
|
||||
") in the select phase."))
|
||||
if self.tree.elementInScope(name, True):
|
||||
self.endTagSelect()
|
||||
self.endTagSelect("select")
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagOther(self, name):
|
||||
@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase):
|
||||
u"in the frameset phase (innerHTML)."))
|
||||
else:
|
||||
self.tree.openElements.pop()
|
||||
if not self.parser.innerHTML and\
|
||||
self.tree.openElements[-1].name != "frameset":
|
||||
if (not self.parser.innerHTML and
|
||||
self.tree.openElements[-1].name != "frameset"):
|
||||
# If we're not in innerHTML mode and the current node is not a
|
||||
# "frameset" element (anymore) then switch.
|
||||
self.parser.phase = self.parser.phases["afterFrameset"]
|
||||
|
@ -14,7 +14,7 @@ class HTMLInputStream(object):
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, encoding=None, chardet=True):
|
||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
@ -26,6 +26,8 @@ class HTMLInputStream(object):
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
parseMeta - Look for a <meta> element containing encoding information
|
||||
|
||||
"""
|
||||
# List of where new lines occur
|
||||
@ -41,12 +43,9 @@ class HTMLInputStream(object):
|
||||
#Encoding to use if no other information can be found
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
#Autodetect encoding if no other information can be found?
|
||||
self.chardet = chardet
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if encoding is None or not isValidEncoding(encoding):
|
||||
encoding = self.detectEncoding()
|
||||
encoding = self.detectEncoding(parseMeta, chardet)
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
@ -79,17 +78,17 @@ class HTMLInputStream(object):
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
return stream
|
||||
|
||||
def detectEncoding(self):
|
||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||
|
||||
#First look for a BOM
|
||||
#This will also read past the BOM if present
|
||||
encoding = self.detectBOM()
|
||||
#If there is no BOM need to look for meta elements with encoding
|
||||
#information
|
||||
if encoding is None:
|
||||
if encoding is None and parseMeta:
|
||||
encoding = self.detectEncodingMeta()
|
||||
#Guess with chardet, if available
|
||||
if encoding is None and self.chardet:
|
||||
if encoding is None and chardet:
|
||||
try:
|
||||
import chardet
|
||||
buffer = self.rawStream.read()
|
||||
|
@ -32,8 +32,8 @@ class HTMLTokenizer(object):
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def __init__(self, stream, encoding=None):
|
||||
self.stream = HTMLInputStream(stream, encoding)
|
||||
def __init__(self, stream, encoding=None, parseMeta=True):
|
||||
self.stream = HTMLInputStream(stream, encoding, parseMeta)
|
||||
|
||||
self.states = {
|
||||
"data":self.dataState,
|
||||
@ -338,31 +338,33 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["closeTagOpen"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.append(data)
|
||||
self.stream.queue.insert(0, data)
|
||||
self.state = self.states["data"]
|
||||
return True
|
||||
|
||||
def closeTagOpenState(self):
|
||||
if self.contentModelFlag in (contentModelFlags["RCDATA"],\
|
||||
contentModelFlags["CDATA"]):
|
||||
charStack = []
|
||||
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
|
||||
contentModelFlags["CDATA"])):
|
||||
if self.currentToken:
|
||||
charStack = []
|
||||
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken. We also need
|
||||
# to have the character directly after the characters that could
|
||||
# match the start tag name.
|
||||
for x in xrange(len(self.currentToken["name"]) + 1):
|
||||
charStack.append(self.stream.char())
|
||||
# Make sure we don't get hit by EOF
|
||||
if charStack[-1] == EOF:
|
||||
break
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken. We also need
|
||||
# to have the character directly after the characters that could
|
||||
# match the start tag name.
|
||||
for x in xrange(len(self.currentToken["name"]) + 1):
|
||||
charStack.append(self.stream.char())
|
||||
# Make sure we don't get hit by EOF
|
||||
if charStack[-1] == EOF:
|
||||
break
|
||||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
self.stream.queue.extend(charStack)
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
self.stream.queue.extend(charStack)
|
||||
|
||||
if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
||||
if self.currentToken \
|
||||
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
||||
and charStack[-1] in (spaceCharacters |
|
||||
frozenset((u">", u"/", u"<", EOF))):
|
||||
# Because the characters are correct we can safely switch to
|
||||
|
@ -108,6 +108,9 @@ class TreeBuilder(object):
|
||||
|
||||
#The class to use for creating doctypes
|
||||
doctypeClass = None
|
||||
|
||||
#Fragment class
|
||||
fragmentClass = None
|
||||
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
@ -294,7 +297,6 @@ class TreeBuilder(object):
|
||||
fosterParent = self.openElements[
|
||||
self.openElements.index(lastTable) - 1]
|
||||
else:
|
||||
assert self.innerHTML
|
||||
fosterParent = self.openElements[0]
|
||||
return fosterParent, insertBefore
|
||||
|
||||
@ -310,6 +312,13 @@ class TreeBuilder(object):
|
||||
def getDocument(self):
|
||||
"Return the final tree"
|
||||
return self.document
|
||||
|
||||
def getFragment(self):
|
||||
"Return the final fragment"
|
||||
#assert self.innerHTML
|
||||
fragment = self.fragmentClass()
|
||||
self.openElements[0].reparentChildren(fragment)
|
||||
return fragment
|
||||
|
||||
def testSerializer(self, node):
|
||||
"""Serialize the subtree of node in the format required by unit tests
|
||||
|
@ -1,6 +1,8 @@
|
||||
import _base
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
import new
|
||||
from xml.sax.saxutils import escape
|
||||
from constants import voidElements
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
|
||||
def commentClass(self, data):
|
||||
return NodeBuilder(self.dom.createComment(data))
|
||||
|
||||
def fragmentClass(self):
|
||||
return NodeBuilder(self.dom.createDocumentFragment())
|
||||
|
||||
def appendChild(self, node):
|
||||
self.dom.appendChild(node.element)
|
||||
@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
|
||||
def getDocument(self):
|
||||
return self.dom
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||
@ -118,7 +126,9 @@ def testSerializer(element):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
rv.append("#document-fragment")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
@ -135,6 +145,32 @@ def testSerializer(element):
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
def serialize(self, node):
|
||||
rv = self.serializeNode(node)
|
||||
for child in node.childNodes:
|
||||
rv += self.serialize(child)
|
||||
if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
|
||||
rv += "</%s>\n"%node.nodeName
|
||||
return rv
|
||||
|
||||
def serializeNode(self, node):
|
||||
if node.nodeType == Node.TEXT_NODE:
|
||||
rv = node.nodeValue
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
rv = "<%s"%node.nodeName
|
||||
if node.hasAttributes():
|
||||
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
|
||||
node.attributes.items()])
|
||||
rv += ">"
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
rv = "<!-- %s -->" % escape(node.nodeValue)
|
||||
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv = "<!DOCTYPE %s>" % node.name
|
||||
else:
|
||||
rv = ""
|
||||
return rv
|
||||
|
||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
if node.nodeType == Node.ELEMENT_NODE:
|
||||
if not nsmap:
|
||||
@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
elif node.nodeType == Node.DOCUMENT_NODE:
|
||||
handler.startDocument()
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endDocument()
|
||||
handler.endDocument()
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
|
||||
else:
|
||||
# ATTRIBUTE_NODE
|
||||
|
@ -129,6 +129,10 @@ class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, Document)
|
||||
|
||||
class DocumentFragment(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, DocumentFragment)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.document._element
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
|
@ -4,6 +4,7 @@ from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
class Node(_base.Node):
|
||||
type = -1
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.parent = None
|
||||
@ -11,15 +12,18 @@ class Node(_base.Node):
|
||||
self.childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def __iter__(self):
|
||||
for node in self.childNodes:
|
||||
yield node
|
||||
for item in node:
|
||||
yield item
|
||||
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
def toxml(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s %s>" % (self.__class__, self.name)
|
||||
|
||||
def printTree(self, indent=0):
|
||||
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
||||
for child in self.childNodes:
|
||||
@ -69,6 +73,7 @@ class Node(_base.Node):
|
||||
return bool(self.childNodes)
|
||||
|
||||
class Document(Node):
|
||||
type = 1
|
||||
def __init__(self):
|
||||
Node.__init__(self, None)
|
||||
|
||||
@ -93,7 +98,13 @@ class Document(Node):
|
||||
tree += child.printTree(2)
|
||||
return tree
|
||||
|
||||
class DocumentFragment(Document):
|
||||
type = 2
|
||||
def __unicode__(self):
|
||||
return "#document-fragment"
|
||||
|
||||
class DocumentType(Node):
|
||||
type = 3
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
|
||||
@ -106,6 +117,7 @@ class DocumentType(Node):
|
||||
return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name
|
||||
|
||||
class TextNode(Node):
|
||||
type = 4
|
||||
def __init__(self, value):
|
||||
Node.__init__(self, None)
|
||||
self.value = value
|
||||
@ -119,6 +131,7 @@ class TextNode(Node):
|
||||
hilite = toxml
|
||||
|
||||
class Element(Node):
|
||||
type = 5
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
self.attributes = {}
|
||||
@ -164,6 +177,7 @@ class Element(Node):
|
||||
return tree
|
||||
|
||||
class CommentNode(Node):
|
||||
type = 6
|
||||
def __init__(self, data):
|
||||
Node.__init__(self, None)
|
||||
self.data = data
|
||||
@ -177,11 +191,38 @@ class CommentNode(Node):
|
||||
def hilite(self):
|
||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
def serialize(self, node):
|
||||
rv = self.serializeNode(node)
|
||||
for child in node.childNodes:
|
||||
rv += self.serialize(child)
|
||||
if node.type == Element.type and node.name not in voidElements:
|
||||
rv += "</%s>\n"%node.name
|
||||
return rv
|
||||
|
||||
def serializeNode(self, node):
|
||||
if node.type == TextNode.type:
|
||||
rv = node.value
|
||||
elif node.type == Element.type:
|
||||
rv = "<%s"%node.name
|
||||
if node.attributes:
|
||||
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
|
||||
node.attributes.iteritems()])
|
||||
rv += ">"
|
||||
elif node.type == CommentNode.type:
|
||||
rv = "<!-- %s -->" % escape(node.data)
|
||||
elif node.type == DocumentType.type:
|
||||
rv = "<!DOCTYPE %s>" % node.name
|
||||
else:
|
||||
rv = ""
|
||||
return rv
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = CommentNode
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, node):
|
||||
return node.printTree()
|
||||
|
@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'):
|
||||
|
||||
# Execute the shell module
|
||||
options = planet.config.template_options(template_file)
|
||||
if module_name == 'plugin': options['__file__'] = template_file
|
||||
options.update(extra_options)
|
||||
log.debug("Processing %s %s using %s", mode,
|
||||
os.path.realpath(template_resolved), module_name)
|
||||
|
planet/shell/plugin.py (new file, 64 lines)
@ -0,0 +1,64 @@
|
||||
import os, sys, imp
|
||||
from StringIO import StringIO
|
||||
|
||||
def run(script, doc, output_file=None, options={}):
|
||||
""" process an Python script using imp """
|
||||
save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv)
|
||||
plugin_stdout = StringIO()
|
||||
plugin_stderr = StringIO()
|
||||
|
||||
try:
|
||||
# redirect stdin
|
||||
sys.stdin = StringIO(doc)
|
||||
|
||||
# redirect stdout
|
||||
if output_file:
|
||||
sys.stdout = open(output_file, 'w')
|
||||
else:
|
||||
sys.stdout = plugin_stdout
|
||||
|
||||
# redirect stderr
|
||||
sys.stderr = plugin_stderr
|
||||
|
||||
# determine __file__ value
|
||||
if options.has_key("__file__"):
|
||||
plugin_file = options["__file__"]
|
||||
del options["__file__"]
|
||||
else:
|
||||
plugin_file = script
|
||||
|
||||
# set sys.argv
|
||||
options = sum([['--'+key, value] for key,value in options.items()], [])
|
||||
sys.argv = [plugin_file] + options
|
||||
|
||||
# import script
|
||||
handle = open(script, 'r')
|
||||
cwd = os.getcwd()
|
||||
try:
|
||||
try:
|
||||
try:
|
||||
description=('.plugin', 'rb', imp.PY_SOURCE)
|
||||
imp.load_module('__main__',handle,plugin_file,description)
|
||||
except SystemExit,e:
|
||||
if e.code: plugin_stderr.write('%s exit rc=%d\n' % (plugin_file, e.code))
|
||||
except Exception, e:
|
||||
import traceback
|
||||
type, value, tb = sys.exc_info()
|
||||
plugin_stderr.write(''.join(
|
||||
traceback.format_exception_only(type,value) +
|
||||
traceback.format_tb(tb)))
|
||||
finally:
|
||||
handle.close()
|
||||
if cwd != os.getcwd(): os.chdir(cwd)
|
||||
|
||||
finally:
|
||||
# restore system state
|
||||
sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
|
||||
|
||||
# log anything sent to stderr
|
||||
if plugin_stderr.getvalue():
|
||||
import planet
|
||||
planet.logger.error(plugin_stderr.getvalue())
|
||||
|
||||
# return stdout
|
||||
return plugin_stdout.getvalue()
|
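A rough sketch of how this run() helper gets invoked (hypothetical file and option
values; in a real run the shell dispatcher calls it only after the planet
configuration has been loaded):

    # illustrative only
    from planet.shell import plugin

    xhtml = open('tests/work/apply/index.html').read()
    output = plugin.run('filters/mememe.plugin', xhtml,
                        options={'sidebar': "//*[@class='sidebar']"})
    # output holds whatever the plugin wrote to its redirected stdout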
@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log):
|
||||
|
||||
def spiderPlanet(only_if_new = False):
|
||||
""" Spider (fetch) an entire planet """
|
||||
# log = planet.getLogger(config.log_level(),config.log_format())
|
||||
log = planet.getLogger(config.log_level(),config.log_format())
|
||||
|
||||
global index
|
||||
|
runtests.py (25 lines changed)
@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'):
|
||||
if sys.path[0]: os.chdir(sys.path[0])
|
||||
sys.path[0] = os.getcwd()
|
||||
|
||||
# find all of the planet test modules
|
||||
modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
|
||||
# determine verbosity
|
||||
verbosity = 1
|
||||
for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
|
||||
if arg in sys.argv:
|
||||
verbosity = value
|
||||
sys.argv.remove(arg)
|
||||
|
||||
# enable warnings
|
||||
# find all of the planet test modules
|
||||
modules = []
|
||||
for pattern in sys.argv[1:] or ['test_*.py']:
|
||||
modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))
|
||||
|
||||
# enable logging
|
||||
import planet
|
||||
planet.getLogger("WARNING",None)
|
||||
if verbosity == 0: planet.getLogger("FATAL",None)
|
||||
if verbosity == 1: planet.getLogger("WARNING",None)
|
||||
if verbosity == 2: planet.getLogger("DEBUG",None)
|
||||
|
||||
# load all of the tests into a suite
|
||||
try:
|
||||
@ -33,11 +44,5 @@ except Exception, exception:
|
||||
for module in modules: __import__(module)
|
||||
raise
|
||||
|
||||
verbosity = 1
|
||||
if "-q" in sys.argv or '--quiet' in sys.argv:
|
||||
verbosity = 0
|
||||
if "-v" in sys.argv or '--verbose' in sys.argv:
|
||||
verbosity = 2
|
||||
|
||||
# run test suite
|
||||
unittest.TextTestRunner(verbosity=verbosity).run(suite)
|
||||
|
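With this change runtests.py accepts -q/--quiet and -v/--verbose plus optional glob
patterns for the test modules, so something like "python runtests.py -v
test_apply*.py" (a hypothetical invocation) runs just the matching tests under
tests/ with debug logging enabled.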
tests/data/apply/config-mememe.ini (new file, 29 lines)
@ -0,0 +1,29 @@
|
||||
[Planet]
|
||||
output_theme = classic_fancy
|
||||
output_dir = tests/work/apply
|
||||
name = test planet
|
||||
cache_directory = tests/work/spider/cache
|
||||
|
||||
bill_of_materials:
|
||||
images/#{face}
|
||||
|
||||
[index.html.tmpl]
|
||||
filters:
|
||||
html2xhtml.plugin
|
||||
mememe.plugin
|
||||
|
||||
[mememe.plugin]
|
||||
sidebar = //*[@class='sidebar']
|
||||
|
||||
[tests/data/spider/testfeed0.atom]
|
||||
name = not found
|
||||
|
||||
[tests/data/spider/testfeed1b.atom]
|
||||
name = one
|
||||
face = jdub.png
|
||||
|
||||
[tests/data/spider/testfeed2.atom]
|
||||
name = two
|
||||
|
||||
[tests/data/spider/testfeed3.rss]
|
||||
name = three
|
@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase):
|
||||
os.makedirs(workdir)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(workdir)
|
||||
os.removedirs(os.path.split(workdir)[0])
|
||||
shutil.rmtree(os.path.split(workdir)[0])
|
||||
|
||||
def test_apply_asf(self):
|
||||
config.load(configfile % 'asf')
|
||||
@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase):
|
||||
output = open(os.path.join(workdir, 'index.html4')).read()
|
||||
self.assertTrue(output.find('/>')<0)
|
||||
|
||||
def test_apply_filter_mememe(self):
|
||||
config.load(configfile % 'mememe')
|
||||
self.apply_fancy()
|
||||
|
||||
output = open(os.path.join(workdir, 'index.html')).read()
|
||||
self.assertTrue(output.find('<div class="sidebar"><h2>Memes <a href="memes.atom">')>=0)
|
||||
|
||||
def apply_fancy(self):
|
||||
# drop slow templates unrelated to test at hand
|
||||
templates = config.parser.get('Planet','template_files').split()
|
||||
templates.remove('rss10.xml.tmpl')
|
||||
templates.remove('rss20.xml.tmpl')
|
||||
config.parser.set('Planet','template_files',' '.join(templates))
|
||||
|
||||
splice.apply(self.feeddata)
|
||||
|
||||
# verify that selected files are there
|
||||
|