MeMeme and html2xhtml plugins

commit a5e1fde287
parent ddf15fc689
@@ -1,3 +1,4 @@
 *.tmplc
 .DS_Store
 cache
+*.pluginc
@@ -8,12 +8,13 @@
 <title>Venus Filters</title>
 </head>
 <body>
-<h2>Filters</h2>
+<h2>Filters and Plugins</h2>
-<p>Filters are simple Unix pipes. Input comes in <code>stdin</code>,
-parameters come from the config file, and output goes to <code>stdout</code>.
-Anything written to <code>stderr</code> is logged as an ERROR message. If no
-<code>stdout</code> is produced, the entry is not written to the cache or
-processed further; in fact, if the entry had previously been written to the cache, it will be removed.</p>
+<p>Filters and plugins are simple Unix pipes. Input comes in
+<code>stdin</code>, parameters come from the config file, and output goes to
+<code>stdout</code>. Anything written to <code>stderr</code> is logged as an
+ERROR message. If no <code>stdout</code> is produced, the entry is not written
+to the cache or processed further; in fact, if the entry had previously been
+written to the cache, it will be removed.</p>

 <p>There are two types of filters supported by Venus, input and template.</p>
 <p>Input to an input filter is a aggressively
@@ -89,6 +90,16 @@ an HTML output stream from one source.</li>
 <li>Templates written using htmltmpl or django currently only have access to a
 fixed set of fields, whereas XSLT and genshi templates have access to
 everything.</li>

+<li>Plugins differ from filters in that while filters are forked, plugins are
+<a href="http://docs.python.org/lib/module-imp.html">imported</a>. This
+means that plugins are limited to Python and are run in-process. Plugins
+therefore have direct access to planet internals like configuration and
+logging facilities, as well as access to the bundled libraries like the
+<a href="http://feedparser.org/docs/">Universal Feed Parser</a> and
+<a href="http://code.google.com/p/html5lib/">html5lib</a>; but it also
+means that functions like <code>os.abort()</code> can't be recovered
+from.</li>
 </ul>
 </body>
 </html>
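A filter in the sense documented above is nothing more than a stdin-to-stdout program, which is why filters can be written in any language while plugins must be Python. A minimal sketch of a filter (a hypothetical example, not a file in this commit):

    import sys

    # A Venus-style filter is a Unix pipe: one entry arrives on stdin and
    # the (possibly modified) entry leaves on stdout. Writing nothing to
    # stdout tells Venus to drop the entry, removing it from the cache.
    data = sys.stdin.read()
    if 'ignore-me' not in data:    # hypothetical filtering rule
        sys.stdout.write(data)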
@@ -21,7 +21,7 @@
 <ul>
 <li><a href="venus.svg">Architecture</a></li>
 <li><a href="normalization.html">Normalization</a></li>
-<li><a href="filters.html">Filters</a></li>
+<li><a href="filters.html">Filters and Plugins</a></li>
 </ul>
 </li>
 <li>Other
@@ -36,6 +36,13 @@ filters = excerpt.py
 omit = img p br
 width = 500

+# add memes to output
+[index.html.tmpl]
+filters = mememe.plugin
+
+[mememe.plugin]
+sidebar = //*[@id="footer"]
+
 # subscription list
 [http://share.opml.org/opml/top100.opml]
 content_type = opml
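Inside the plugin, the [mememe.plugin] section above is surfaced through config.filter_options (added to planet/config.py further down); a sketch of the lookup as mememe.plugin itself performs it, assuming the plugin is invoked with its section name as sys.argv[0]:

    import sys
    from planet import config

    # filter_options returns the keys of the section matching the name
    # the plugin was invoked under, e.g. 'mememe.plugin'.
    options = config.filter_options(sys.argv[0])
    sidebar = options.get('sidebar', '//*[@class="sidebar"]')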
filters/html2xhtml.plugin (new file, 6 lines)
@@ -0,0 +1,6 @@
+import sys
+from planet import html5lib
+tree=html5lib.treebuilders.dom.TreeBuilder
+parser = html5lib.html5parser.HTMLParser(tree=tree)
+document = parser.parse(sys.stdin)
+sys.stdout.write(document.toxml("utf-8"))
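html2xhtml.plugin is six lines because html5lib does all the work: parse tag-soup HTML into a DOM, then serialize it as XML. The same round trip can be exercised in-process; a sketch, assuming the bundled planet.html5lib package is importable:

    from planet import html5lib

    # parse() accepts strings as well as file-like objects such as sys.stdin.
    parser = html5lib.html5parser.HTMLParser(
        tree=html5lib.treebuilders.dom.TreeBuilder)
    document = parser.parse('<p>unclosed paragraph')
    print document.toxml('utf-8')   # well-formed XHTML comes back out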
filters/mememe.plugin (new file, 475 lines)
@@ -0,0 +1,475 @@
+#
+# This Venus output filter will annotate an XHTML page with a list of
+# "memes" (or most popular linked destinations, based on the last week
+# of entries from the cache) and will update the subscription list with
+# links to recent entries from each subscription.
+#
+# Templates that don't produce XHTML natively will need their output passed
+# through html2xhtml.plugin first.
+#
+# Typical configuration (based on classic_fancy):
+#
+# [index.html.tmpl]
+# filters:
+#   html2xhtml.plugin
+#   mememe.plugin
+#
+# [mememe.plugin]
+# sidebar = @class='sidebar'
+#
+
+import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
+from xml.sax.saxutils import escape
+from htmlentitydefs import entitydefs
+
+import planet
+from planet import config, feedparser
+from planet.spider import filename
+log = planet.getLogger(config.log_level(),config.log_format())
+options = config.filter_options(sys.argv[0])
+
+MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
+
+now = time.time()
+week = 7 * 86400
+week_ago = now - week
+
+cache = config.cache_directory()
+meme_cache = os.path.join(cache, 'memes')
+if not os.path.exists(meme_cache): os.makedirs(meme_cache)
+
+all_links = {}
+feed_links = {}
+
+def check_cache(url):
+    try:
+        file = open(filename(meme_cache, url))
+        headers = eval(file.read())
+        file.close()
+        return headers or {}
+    except:
+        return {}
+
+def cache_meme(url, headers):
+    json = []
+    for key,value in headers.items():
+        json.append(' %s: %s' % (toj(key), toj(value)))
+    file = open(filename(meme_cache, url),'w')
+    file.write('{\n' + ',\n'.join(json) + '\n}\n')
+    file.close()
+
+urlmap = {}
+def canonicalize(url):
+    url = urlmap.get(url,url)
+    parts = list(urlparse.urlparse(url))
+
+    parts[0] = parts[0].lower()
+    parts[1] = parts[1].lower()
+    if parts[1].startswith('www.'): parts[1]=parts[1][4:]
+    if not parts[2]: parts[2] = '/'
+    parts[-1] = ''
+    return urlparse.urlunparse(parts)
+
+log.debug("Loading cached data")
+for name in glob.glob(os.path.join(cache, '*')):
+    # ensure that this is within the past week
+    if os.path.isdir(name): continue
+    mtime = os.stat(name).st_mtime
+    if mtime < week_ago: continue
+
+    # parse the file
+    try:
+        doc = libxml2.parseFile(name)
+    except:
+        continue
+    xp = doc.xpathNewContext()
+    xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
+    xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
+
+    # determine the entry
+    entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
+    if not entry: continue
+    entry = canonicalize(entry[0].prop("href"))
+
+    # determine the title
+    title = xp.xpathEval("/atom:entry/atom:title")
+    if title:
+        if title[0].prop('type') == 'html':
+            title = re.sub('<.*?>','',title[0].content)
+        else:
+            title = title[0].content
+    title = str(title or '')
+
+    # determine the feed id
+    feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
+    if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
+    if not feed: continue
+    feed = feed[0].content
+
+    # determine the author
+    author = xp.xpathEval("/atom:entry/atom:source/planet:name")
+    if author:
+        author = author[0].content
+    else:
+        author = ''
+
+    # track the feed_links
+    if author:
+        if not feed_links.has_key(author): feed_links[author] = list()
+        feed_links[author].append([mtime, entry, title])
+
+    # identify the unique links
+    entry_links = []
+    for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
+        parent = node.parent
+        while parent:
+            if parent.name == 'source': break
+            parent = parent.parent
+        else:
+            link = canonicalize(node.prop('href'))
+            if not link in entry_links:
+                entry_links.append(link)
+        if node.hasProp('title') and node.prop('title').startswith('http'):
+            link = canonicalize(node.prop('title'))
+            if not link in entry_links:
+                entry_links.append(link)
+
+    # add the votes
+    weight = 1.0 - (now - mtime)**2 / week**2
+    vote = [(weight, str(entry), str(feed), title, author, mtime)]
+    for link in entry_links:
+        all_links[link] = all_links.get(link,list()) + vote
+
+    # free the entry
+    doc.freeDoc()
+
+# tally the votes
+weighted_links = []
+for link, votes in all_links.items():
+    site = {}
+    updated = 0
+    for weight, entry, feed, title, author, mtime in votes:
+        site[feed] = max(site.get(feed,0), weight)
+        if mtime > updated: updated=mtime
+    weighted_links.append((sum(site.values()), link, updated))
+weighted_links.sort()
+weighted_links.reverse()
+
+cp1252 = {
+    128: 8364, # euro sign
+    130: 8218, # single low-9 quotation mark
+    131:  402, # latin small letter f with hook
+    132: 8222, # double low-9 quotation mark
+    133: 8230, # horizontal ellipsis
+    134: 8224, # dagger
+    135: 8225, # double dagger
+    136:  710, # modifier letter circumflex accent
+    137: 8240, # per mille sign
+    138:  352, # latin capital letter s with caron
+    139: 8249, # single left-pointing angle quotation mark
+    140:  338, # latin capital ligature oe
+    142:  381, # latin capital letter z with caron
+    145: 8216, # left single quotation mark
+    146: 8217, # right single quotation mark
+    147: 8220, # left double quotation mark
+    148: 8221, # right double quotation mark
+    149: 8226, # bullet
+    150: 8211, # en dash
+    151: 8212, # em dash
+    152:  732, # small tilde
+    153: 8482, # trade mark sign
+    154:  353, # latin small letter s with caron
+    155: 8250, # single right-pointing angle quotation mark
+    156:  339, # latin small ligature oe
+    158:  382, # latin small letter z with caron
+    159:  376} # latin capital letter y with diaeresis
+
+# determine the title for a given url
+class html(sgmllib.SGMLParser):
+    def __init__(self, url):
+        sgmllib.SGMLParser.__init__(self)
+        self.title = ""
+        self.feedurl = ""
+        self.intitle = False
+
+        headers = check_cache(url)
+
+        try:
+            # fetch the page
+            request = urllib2.Request(url)
+            request.add_header('User-Agent', 'Venus/MeMeme')
+            if headers.has_key('etag'):
+                request.add_header('If-None-Match', headers['etag'])
+            if headers.has_key('last_modified'):
+                request.add_header('If-Modified-Since', headers['last-modified'])
+            response = urllib2.urlopen(request)
+            self.feed(response.read())
+
+            # ensure the data is in utf-8
+            try:
+                self.title = self.title.decode('utf-8')
+            except:
+                self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
+                    for c in self.title.decode('iso-8859-1')])
+
+            # cache the results
+            headers = {}
+            if self.feedurl: headers['feedurl'] = self.feedurl
+            if self.title: headers['title'] = self.title
+            headers.update(response.headers)
+            cache_meme(url, headers)
+        except:
+            self.feedurl = headers.get('feedurl')
+            if headers.has_key('title'):
+                if isinstance(headers['title'],str):
+                    self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
+                else:
+                    self.title=headers['title']
+
+        # if there is a feed, look for an entry that matches, and take that title
+        if self.feedurl and not self.title:
+            headers = check_cache(self.feedurl)
+            data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
+                modified=headers.get('last-modified'))
+
+            if data.has_key('headers') and data.has_key('status') and \
+                data.status in [200, 301, 302]:
+
+                titles = {}
+                for entry in data.entries:
+                    if entry.has_key('title_detail') and entry.has_key('link'):
+                        titles[entry.link] = entry.title_detail.value
+                        if entry.title_detail.type == 'text/plain':
+                            titles[entry.link] = escape(titles[entry.link])
+
+                if titles.has_key(url): self.title = titles[url]
+
+                data.headers.update(titles)
+                cache_meme(self.feedurl, data.headers)
+            else:
+                if headers.has_key(url):
+                    if isinstance(headers[url],str):
+                        self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
+                    else:
+                        self.title=headers[url]
+
+        # fallback is the basename of the URI
+        if not self.title:
+            self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
+
+    # parse out the first autodiscovery link
+    def start_link(self, attrs):
+        if self.feedurl: return
+        attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
+        if not 'rel' in attrs: return
+        rels = attrs['rel'].split(' ')
+        if 'alternate' not in rels: return
+        if not 'type' in attrs or not attrs['type'].endswith('xml'): return
+        if 'href' in attrs:
+            self.feedurl = attrs['href']
+
+    # parse the page title
+    def start_title(self, attributes):
+        if not self.title: self.intitle = True
+    def end_title(self):
+        self.intitle = False
+    def handle_data(self, text):
+        if self.intitle: self.title += escape(text)
+
+# convert unicode string to a json string
+def toj(value):
+    result = repr(value).replace(r'\x',r'\u00')
+    if result[:1] == 'u': result=result[1:]
+    if result.startswith("'"):
+        result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
+    return result
+
+seenit = []
+count = 0
+
+# construct an empty feed
+feed_doc = libxml2.newDoc("1.0")
+meme_feed = feed_doc.newChild(None, "feed", None)
+meme_feed.newNs('http://www.w3.org/2005/Atom', None)
+meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
+author = meme_feed.newChild(None, 'author', None)
+author.newTextChild(None, 'name', config.owner_name())
+if config.owner_email: author.newTextChild(None, 'email', config.owner_email())
+meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
+link = meme_feed.newChild(None, 'link', None)
+link.setProp('href', os.path.join(config.link(), 'memes.atom'))
+link.setProp('rel', 'self')
+meme_feed.newTextChild(None, 'updated',
+    time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
+
+# parse the input
+log.debug("Parse input")
+doc=libxml2.parseDoc(sys.stdin.read())
+
+# find the sidebar/footer
+sidebar = options.get('sidebar','//*[@class="sidebar"]')
+footer = doc.xpathEval(sidebar)
+if not hasattr(footer,'__len__') or len(footer) == 0:
+    raise Exception(sidebar + ' not found')
+if len(footer) > 1:
+    log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar))
+footer = footer[0]
+
+# add up to 10 entry links to each subscription
+subs_ul = footer.children
+while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
+child = subs_ul.children
+while child:
+    if child.name == 'li':
+        if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
+        link = child.lastChild()
+        while link.isText(): link=link.prev
+        author = link.getContent()
+        state = 'inactive'
+        if feed_links.has_key(author):
+            ul2 = child.newChild(None, 'ul', None)
+            feed_links[author].sort()
+            feed_links[author].reverse()
+            link_count = 0
+            for mtime, entry, title in feed_links[author]:
+                if not title: continue
+                li2 = ul2.newChild(None, 'li', None)
+                a = li2.newTextChild(None, 'a', title)
+                a.setProp('href', entry)
+                link_count = link_count + 1
+                if link_count >= 10: break
+            if link_count > 0: state = None
+        if state:
+            link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
+    child=child.next
+
+# create a h2 and ul for the memes list
+footer_top = footer.children
+memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
+memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
+
+# create a header for the memes list
+a = memes.newChild(None, 'a', None)
+a.setProp('href', 'memes.atom')
+img = a.newChild(None, 'img', None)
+img.setProp('src', 'images/feed-icon-10x10.png')
+
+# collect the results
+log.debug("Fetch titles and collect the results")
+from urllib import quote_plus
+for i in range(0,len(weighted_links)):
+    weight, link, updated = weighted_links[i]
+
+    # ensure that somebody new points to this entry. This guards against
+    # groups of related links which several posts point to all.
+    novel = False
+    for weight, entry, feed, title, author, mtime in all_links[link]:
+        if entry not in seenit:
+            seenit.append(entry)
+            novel = True
+    if not novel: continue
+
+    all_links[link].sort()
+    all_links[link].reverse()
+    cache_file = filename(cache, link)
+    title = None
+
+    # when possible, take the title from the cache
+    if os.path.exists(cache_file):
+        entry = feedparser.parse(cache_file).entries[0]
+        if entry.has_key('title_detail'):
+            title = entry.title_detail.value
+            if entry.title_detail.type == 'text/plain': title = escape(title)
+
+    # otherwise, parse the html
+    if not title:
+        title = html(link).title
+
+    # dehtmlize
+    title = re.sub('&(\w+);',
+        lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
+    title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
+    title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)
+
+    # title too long? Insert zero width spaces where appropriate
+    if max(map(len,title.split())) > 30:
+        title=re.sub('(\W+)',u'\\1\u200b',title)
+
+    # save the entry title (it is used later)
+    entry_title = title.strip()
+
+    # add to the memes list
+    memes_ul.addContent('\n')
+    li = memes_ul.newChild(None, 'li', None)
+    memes_ul.addContent('\n')
+
+    # technorati link
+    a = li.newChild(None, 'a', None)
+    tlink = 'http://technorati.com/cosmos/search.html?url='
+    if link.startswith('http://'):
+        a.setProp('href',tlink + quote_plus(link[7:]))
+    else:
+        a.setProp('href',tlink + quote_plus(link))
+    a.setProp('title','cosmos')
+    img = a.newChild(None, 'img', None)
+    img.setProp('src','tcosm11.gif')
+
+    # main link
+    a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
+    a.setProp('href',link)
+    if (((i==0) or (updated>=weighted_links[i-1][2])) and
+        (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
+        rank = 0
+        for j in range(0,len(weighted_links)):
+            if updated < weighted_links[j][2]: rank = rank + 1
+        if rank < len(weighted_links)/2:
+            a.setProp('class','rising')
+
+    # voters
+    ul2 = li.newChild(None, 'ul', None)
+    voters = []
+    for weight, entry, feed, title, author, mtime in all_links[link]:
+        if entry in voters: continue
+        li2 = ul2.newChild(None, 'li', None)
+        a = li2.newTextChild(None, 'a' , author)
+        a.setProp('href',entry)
+        if title: a.setProp('title',title)
+        voters.append(entry)
+
+    # add to the meme feed
+    if len(all_links[link]) > 2:
+        meme_feed.addContent('\n')
+        entry = meme_feed.newChild(None, 'entry', None)
+        meme_feed.addContent('\n')
+
+        # entry
+        tagbase = config.link().split('/')
+        if not tagbase[-1]: tagbase = tagbase[:-1]
+        tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
+        entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
+        entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
+        meme_link = entry.newTextChild(None, 'link', None)
+        meme_link.setProp('href', link)
+        entry.newTextChild(None, 'updated',
+            time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))
+
+        # voters
+        content = entry.newChild(None, 'content', None)
+        content.setProp('type', 'xhtml')
+        div = content.newTextChild(None, 'div', 'Spotted by:')
+        div.newNs('http://www.w3.org/1999/xhtml', None)
+        content_ul = div.newChild(None, 'ul', None)
+        for weight, entry, feed, title, author, mtime in all_links[link]:
+            li2 = content_ul.newTextChild(None, 'li', author + ": ")
+            a = li2.newTextChild(None, 'a' , title or 'untitled')
+            a.setProp('href',entry)
+
+    count = count + 1
+    if count >= 10: break
+
+log.info("Writing " + MEMES_ATOM)
+output=open(MEMES_ATOM,'w')
+output.write(feed_doc.serialize('utf-8'))
+output.close()
+
+sys.stdout.write(doc.serialize('utf-8'))
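Two computations in mememe.plugin reward a worked check. The vote weight decays quadratically over the one-week window, so a fresh link counts fully and a week-old one not at all; a sketch with illustrative ages:

    # Quick check of the decay 1.0 - age**2 / week**2 used above.
    week = 7 * 86400
    for age_days in (0, 1, 3, 7):
        age = float(age_days * 86400)      # float avoids integer division
        print '%d day(s): %.2f' % (age_days, 1.0 - age**2 / week**2)
    # 0 day(s): 1.00   1 day(s): 0.98   3 day(s): 0.82   7 day(s): 0.00

The tag: URI base built for the memes feed is the least obvious line; traced with a hypothetical planet link:

    # tagbase construction, traced (hypothetical config.link() value).
    link = 'http://planet.example.org/'
    tagbase = link.split('/')              # ['http:', '', 'planet.example.org', '']
    if not tagbase[-1]: tagbase = tagbase[:-1]
    tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2], '/'.join(tagbase[3:]))
    print tagbase                          # tag:planet.example.org,2007:meme/%s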
@@ -352,14 +352,15 @@ def filters(section=None):
     filters = []
     if parser.has_option('Planet', 'filters'):
         filters += parser.get('Planet', 'filters').split()
-    if section and parser.has_option(section, 'filters'):
-        filters += parser.get(section, 'filters').split()
     if filter(section):
         filters.append('regexp_sifter.py?require=' +
             urllib.quote(filter(section)))
     if exclude(section):
         filters.append('regexp_sifter.py?exclude=' +
             urllib.quote(exclude(section)))
+    for section in section and [section] or template_files():
+        if parser.has_option(section, 'filters'):
+            filters += parser.get(section, 'filters').split()
     return filters

 def planet_options():
@@ -382,6 +383,10 @@ def template_options(section):
     """ dictionary of template specific options"""
     return feed_options(section)

+def filter_options(section):
+    """ dictionary of filter specific options"""
+    return feed_options(section)
+
 def write(file=sys.stdout):
     """ write out an updated template """
     print parser.write(file)
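With the filters() change above, per-template sections now contribute filters too, which is what routes mememe.plugin to index.html.tmpl alone in the example config. A hedged sketch of the effect (config path and load() call assumed):

    from planet import config

    config.load('examples/opml-top100.ini')     # assumed location
    # Section-specific filters are appended after the [Planet]-wide ones.
    print config.filters('index.html.tmpl')     # [..., 'mememe.plugin']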
@@ -71,35 +71,40 @@ class HTMLParser(object):
             "trailingEnd": TrailingEndPhase(self, self.tree)
         }

-    def parse(self, stream, encoding=None, innerHTML=False):
-        """Parse a HTML document into a well-formed tree
-
-        stream - a filelike object or string containing the HTML to be parsed
-
-        innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
-        is not yet supported)
-
-        The optional encoding parameter must be a string that indicates
-        the encoding. If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
-        """
+    def _parse(self, stream, innerHTML=False, container="div",
+               encoding=None):

         self.tree.reset()
         self.firstStartTag = False
         self.errors = []

-        self.phase = self.phases["initial"]
+        self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
+                                                 parseMeta=innerHTML)
+
+        if innerHTML:
+            self.innerHTML = container.lower()
+
+            if self.innerHTML in ('title', 'textarea'):
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+            elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+            elif self.innerHTML == 'plaintext':
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+            else:
+                # contentModelFlag already is PCDATA
+                #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
+                pass
+            self.phase = self.phases["rootElement"]
+            self.phase.insertHtmlElement()
+            self.resetInsertionMode()
+        else:
+            self.innerHTML = False
+            self.phase = self.phases["initial"]

         # We only seem to have InBodyPhase testcases where the following is
         # relevant ... need others too
         self.lastPhase = None

-        # We don't actually support innerHTML yet but this should allow
-        # assertations
-        self.innerHTML = innerHTML
-
-        self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
-
         # XXX This is temporary for the moment so there isn't any other
         # changes needed for the parser to work with the iterable tokenizer
         for token in self.tokenizer:
@@ -118,7 +123,34 @@ class HTMLParser(object):
         # When the loop finishes it's EOF
         self.phase.processEOF()

+    def parse(self, stream, encoding=None):
+        """Parse a HTML document into a well-formed tree
+
+        stream - a filelike object or string containing the HTML to be parsed
+
+        The optional encoding parameter must be a string that indicates
+        the encoding. If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+        """
+        self._parse(stream, innerHTML=False, encoding=encoding)
         return self.tree.getDocument()

+    def parseFragment(self, stream, container="div", encoding=None):
+        """Parse a HTML fragment into a well-formed tree fragment
+
+        container - name of the element we're setting the innerHTML property
+        if set to None, default to 'div'
+
+        stream - a filelike object or string containing the HTML to be parsed
+
+        The optional encoding parameter must be a string that indicates
+        the encoding. If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+        """
+        self._parse(stream, True, container=container, encoding=encoding)
+        return self.tree.getFragment()
+
     def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
         # XXX The idea is to make data mandatory.
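parse() and the new parseFragment() are now thin wrappers over _parse(); fragment parsing is the innerHTML case the old docstring declared unsupported. A sketch of the new entry point (import paths assumed from this commit's plugin code):

    from planet import html5lib
    from planet.html5lib.treebuilders import dom

    parser = html5lib.html5parser.HTMLParser(tree=dom.TreeBuilder)
    # Parse as if the markup were the innerHTML of a <tr> element.
    fragment = parser.parseFragment('<td>cell</td>', container='tr')
    # fragment is a DocumentFragment holding the parsed <td>.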
@@ -187,28 +219,29 @@ class HTMLParser(object):
           "frameset":"inFrameset"
         }
         for node in self.tree.openElements[::-1]:
+            nodeName = node.name
             if node == self.tree.openElements[0]:
                 last = True
-                if node.name not in ['td', 'th']:
+                if nodeName not in ['td', 'th']:
                     # XXX
                     assert self.innerHTML
-                    raise NotImplementedError
+                    nodeName = self.innerHTML
             # Check for conditions that should only happen in the innerHTML
             # case
-            if node.name in ("select", "colgroup", "head", "frameset"):
+            if nodeName in ("select", "colgroup", "head", "frameset"):
                 # XXX
                 assert self.innerHTML
-            if node.name in newModes:
-                self.phase = self.phases[newModes[node.name]]
+            if nodeName in newModes:
+                self.phase = self.phases[newModes[nodeName]]
                 break
-            elif node.name == "html":
+            elif nodeName == "html":
                 if self.tree.headPointer is None:
                     self.phase = self.phases["beforeHead"]
                 else:
                     self.phase = self.phases["afterHead"]
                 break
             elif last:
-                self.phase = self.phases["body"]
+                self.phase = self.phases["inBody"]
                 break

 class Phase(object):
@@ -434,9 +467,7 @@ class InHeadPhase(Phase):
             self.parser.phase.processCharacters(data)

     def startTagHead(self, name, attributes):
-        self.tree.insertElement(name, attributes)
-        self.tree.headPointer = self.tree.openElements[-1]
-        self.parser.phase = self.parser.phases["inHead"]
+        self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))

     def startTagTitle(self, name, attributes):
         element = self.tree.createElement(name, attributes)
@@ -455,10 +486,11 @@ class InHeadPhase(Phase):
         self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

     def startTagScript(self, name, attributes):
+        #XXX Inner HTML case may be wrong
         element = self.tree.createElement(name, attributes)
         element._flags.append("parser-inserted")
-        if self.tree.headPointer is not None and\
-          self.parser.phase == self.parser.phases["inHead"]:
+        if (self.tree.headPointer is not None and
+            self.parser.phase == self.parser.phases["inHead"]):
             self.appendToHead(element)
         else:
             self.tree.openElements[-1].appendChild(element)
@@ -653,8 +685,8 @@ class InBodyPhase(Phase):

     def startTagBody(self, name, attributes):
         self.parser.parseError(_(u"Unexpected start tag (body)."))
-        if len(self.tree.openElements) == 1 \
-          or self.tree.openElements[1].name != "body":
+        if (len(self.tree.openElements) == 1
+            or self.tree.openElements[1].name != "body"):
             assert self.parser.innerHTML
         else:
             for attr, value in attributes.iteritems():
@@ -1179,6 +1211,7 @@ class InTablePhase(Phase):
             self.parser.resetInsertionMode()
         else:
             # innerHTML case
+            assert self.parser.innerHTML
             self.parser.parseError()

     def endTagIgnore(self, name):
@@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase):
             ])
         self.endTagHandler.default = self.endTagOther

+    def ignoreEndTagCaption(self):
+        return not self.tree.elementInScope("caption", True)
+
     def processCharacters(self, data):
         self.parser.phases["inBody"].processCharacters(data)

     def startTagTableElement(self, name, attributes):
         self.parser.parseError()
+        #XXX Have to duplicate logic here to find out if the tag is ignored
+        ignoreEndTag = self.ignoreEndTagCaption()
         self.parser.phase.processEndTag("caption")
-        # XXX how do we know the tag is _always_ ignored in the innerHTML
-        # case and therefore shouldn't be processed again? I'm not sure this
-        # strategy makes sense...
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processStartTag(name, attributes)

     def startTagOther(self, name, attributes):
         self.parser.phases["inBody"].processStartTag(name, attributes)

     def endTagCaption(self, name):
-        if self.tree.elementInScope(name, True):
+        if not self.ignoreEndTagCaption():
             # AT this code is quite similar to endTagTable in "InTable"
             self.tree.generateImpliedEndTags()
             if self.tree.openElements[-1].name != "caption":
@@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase):
             self.parser.phase = self.parser.phases["inTable"]
         else:
             # innerHTML case
+            assert self.parser.innerHTML
             self.parser.parseError()

     def endTagTable(self, name):
         self.parser.parseError()
+        ignoreEndTag = self.ignoreEndTagCaption()
         self.parser.phase.processEndTag("caption")
-        # XXX ...
-        if not self.parser.innerHTML:
-            self.parser.phase.processStartTag(name, attributes)
+        if not ignoreEndTag:
+            self.parser.phase.processEndTag(name)

     def endTagIgnore(self, name):
         self.parser.parseError(_("Unexpected end tag (" + name +\
@@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase):
             ])
         self.endTagHandler.default = self.endTagOther

+    def ignoreEndTagColgroup(self):
+        return self.tree.openElements[-1].name == "html"
+
     def processCharacters(self, data):
+        ignoreEndTag = self.ignoreEndTagColgroup()
         self.endTagColgroup("colgroup")
-        # XXX
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processCharacters(data)

     def startTagCol(self, name ,attributes):
@@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase):
         self.tree.openElements.pop()

     def startTagOther(self, name, attributes):
+        ignoreEndTag = self.ignoreEndTagColgroup()
         self.endTagColgroup("colgroup")
-        # XXX how can be sure it's always ignored?
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processStartTag(name, attributes)

     def endTagColgroup(self, name):
-        if self.tree.openElements[-1].name == "html":
+        if self.ignoreEndTagColgroup():
             # innerHTML case
+            assert self.parser.innerHTML
             self.parser.parseError()
         else:
             self.tree.openElements.pop()
@@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase):
             u"col has no end tag."))

     def endTagOther(self, name):
+        ignoreEndTag = self.ignoreEndTagColgroup()
         self.endTagColgroup("colgroup")
-        # XXX how can be sure it's always ignored?
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processEndTag(name)

@@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase):

     def startTagTableOther(self, name, attributes):
         # XXX AT Any ideas on how to share this with endTagTable?
-        if self.tree.elementInScope("tbody", True) or \
-          self.tree.elementInScope("thead", True) or \
-          self.tree.elementInScope("tfoot", True):
+        if (self.tree.elementInScope("tbody", True) or
+            self.tree.elementInScope("thead", True) or
+            self.tree.elementInScope("tfoot", True)):
             self.clearStackToTableBodyContext()
             self.endTagTableRowGroup(self.tree.openElements[-1].name)
             self.parser.phase.processStartTag(name, attributes)
@@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase):
             ") in the table body phase. Ignored."))

     def endTagTable(self, name):
-        if self.tree.elementInScope("tbody", True) or \
-          self.tree.elementInScope("thead", True) or \
-          self.tree.elementInScope("tfoot", True):
+        if (self.tree.elementInScope("tbody", True) or
+            self.tree.elementInScope("thead", True) or
+            self.tree.elementInScope("tfoot", True)):
             self.clearStackToTableBodyContext()
             self.endTagTableRowGroup(self.tree.openElements[-1].name)
             self.parser.phase.processEndTag(name)
@@ -1428,6 +1468,9 @@ class InRowPhase(Phase):
                 self.tree.openElements[-1].name + u") in the row phase."))
             self.tree.openElements.pop()

+    def ignoreEndTagTr(self):
+        return not self.tree.elementInScope("tr", tableVariant=True)
+
     # the rest
     def processCharacters(self, data):
         self.parser.phases["inTable"].processCharacters(data)
@@ -1439,28 +1482,31 @@ class InRowPhase(Phase):
         self.tree.activeFormattingElements.append(Marker)

     def startTagTableOther(self, name, attributes):
+        ignoreEndTag = self.ignoreEndTagTr()
         self.endTagTr("tr")
         # XXX how are we sure it's always ignored in the innerHTML case?
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processStartTag(name, attributes)

     def startTagOther(self, name, attributes):
         self.parser.phases["inTable"].processStartTag(name, attributes)

     def endTagTr(self, name):
-        if self.tree.elementInScope("tr", True):
+        if not self.ignoreEndTagTr():
             self.clearStackToTableRowContext()
             self.tree.openElements.pop()
             self.parser.phase = self.parser.phases["inTableBody"]
         else:
             # innerHTML case
+            assert self.parser.innerHTML
             self.parser.parseError()

     def endTagTable(self, name):
+        ignoreEndTag = self.ignoreEndTagTr()
         self.endTagTr("tr")
         # Reprocess the current tag if the tr end tag was not ignored
         # XXX how are we sure it's always ignored in the innerHTML case?
-        if not self.parser.innerHTML:
+        if not ignoreEndTag:
             self.parser.phase.processEndTag(name)

     def endTagTableRowGroup(self, name):
@@ -1628,7 +1674,7 @@ class InSelectPhase(Phase):
             u"select phase. Ignored."))

     def endTagSelect(self, name):
-        if self.tree.elementInScope(name, True):
+        if self.tree.elementInScope("select", True):
             node = self.tree.openElements.pop()
             while node.name != "select":
                 node = self.tree.openElements.pop()
@@ -1641,7 +1687,7 @@ class InSelectPhase(Phase):
         self.parser.parseError(_(u"Unexpected table end tag (" + name +\
             ") in the select phase."))
         if self.tree.elementInScope(name, True):
-            self.endTagSelect()
+            self.endTagSelect("select")
             self.parser.phase.processEndTag(name)

     def endTagOther(self, name):
@@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase):
                 u"in the frameset phase (innerHTML)."))
         else:
             self.tree.openElements.pop()
-            if not self.parser.innerHTML and\
-              self.tree.openElements[-1].name != "frameset":
+            if (not self.parser.innerHTML and
+                self.tree.openElements[-1].name != "frameset"):
                 # If we're not in innerHTML mode and the the current node is not a
                 # "frameset" element (anymore) then switch.
                 self.parser.phase = self.parser.phases["afterFrameset"]
@@ -14,7 +14,7 @@ class HTMLInputStream(object):

     """

-    def __init__(self, source, encoding=None, chardet=True):
+    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         """Initialises the HTMLInputStream.

         HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -26,6 +26,8 @@ class HTMLInputStream(object):
         the encoding. If specified, that encoding will be used,
         regardless of any BOM or later declaration (such as in a meta
         element)

+        parseMeta - Look for a <meta> element containing encoding information
+
         """
         # List of where new lines occur
@@ -41,12 +43,9 @@ class HTMLInputStream(object):
         #Encoding to use if no other information can be found
         self.defaultEncoding = "windows-1252"

-        #Autodetect encoding if no other information can be found?
-        self.chardet = chardet
-
         #Detect encoding iff no explicit "transport level" encoding is supplied
         if encoding is None or not isValidEncoding(encoding):
-            encoding = self.detectEncoding()
+            encoding = self.detectEncoding(parseMeta, chardet)
         self.charEncoding = encoding

         # Read bytes from stream decoding them into Unicode
@@ -79,17 +78,17 @@ class HTMLInputStream(object):
             stream = cStringIO.StringIO(str(source))
         return stream

-    def detectEncoding(self):
+    def detectEncoding(self, parseMeta=True, chardet=True):

         #First look for a BOM
         #This will also read past the BOM if present
         encoding = self.detectBOM()
         #If there is no BOM need to look for meta elements with encoding
         #information
-        if encoding is None:
+        if encoding is None and parseMeta:
             encoding = self.detectEncodingMeta()
         #Guess with chardet, if avaliable
-        if encoding is None and self.chardet:
+        if encoding is None and chardet:
             try:
                 import chardet
                 buffer = self.rawStream.read()
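The new parseMeta flag threads from the tokenizer down to HTMLInputStream and controls <meta>-based encoding sniffing; it can be exercised directly on the stream. A sketch (module path assumed):

    from planet.html5lib.inputstream import HTMLInputStream

    markup = '<meta http-equiv="Content-Type" ' \
             'content="text/html; charset=utf-8"><p>hi</p>'
    # With parseMeta=False the meta declaration is not consulted and
    # detection falls back to chardet (if installed) or windows-1252.
    stream = HTMLInputStream(markup, parseMeta=False)
    print stream.charEncoding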
@@ -32,8 +32,8 @@ class HTMLTokenizer(object):

     # XXX need to fix documentation

-    def __init__(self, stream, encoding=None):
-        self.stream = HTMLInputStream(stream, encoding)
+    def __init__(self, stream, encoding=None, parseMeta=True):
+        self.stream = HTMLInputStream(stream, encoding, parseMeta)

         self.states = {
             "data":self.dataState,
@@ -338,31 +338,33 @@ class HTMLTokenizer(object):
             self.state = self.states["closeTagOpen"]
         else:
             self.tokenQueue.append({"type": "Characters", "data": u"<"})
-            self.stream.queue.append(data)
+            self.stream.queue.insert(0, data)
             self.state = self.states["data"]
         return True

     def closeTagOpenState(self):
-        if self.contentModelFlag in (contentModelFlags["RCDATA"],\
-          contentModelFlags["CDATA"]):
-            charStack = []
+        if (self.contentModelFlag in (contentModelFlags["RCDATA"],
+            contentModelFlags["CDATA"])):
+            if self.currentToken:
+                charStack = []

                 # So far we know that "</" has been consumed. We now need to know
                 # whether the next few characters match the name of last emitted
                 # start tag which also happens to be the currentToken. We also need
                 # to have the character directly after the characters that could
                 # match the start tag name.
                 for x in xrange(len(self.currentToken["name"]) + 1):
                     charStack.append(self.stream.char())
                     # Make sure we don't get hit by EOF
                     if charStack[-1] == EOF:
                         break

                 # Since this is just for checking. We put the characters back on
                 # the stack.
                 self.stream.queue.extend(charStack)

-            if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
+            if self.currentToken \
+              and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
               and charStack[-1] in (spaceCharacters |
                   frozenset((u">", u"/", u"<", EOF))):
                 # Because the characters are correct we can safely switch to
@@ -108,6 +108,9 @@ class TreeBuilder(object):

     #The class to use for creating doctypes
     doctypeClass = None

+    #Fragment class
+    fragmentClass = None
+
     def __init__(self):
         self.reset()
@@ -294,7 +297,6 @@ class TreeBuilder(object):
             fosterParent = self.openElements[
                 self.openElements.index(lastTable) - 1]
         else:
-            assert self.innerHTML
             fosterParent = self.openElements[0]
         return fosterParent, insertBefore

@@ -310,6 +312,13 @@ class TreeBuilder(object):
     def getDocument(self):
         "Return the final tree"
         return self.document

+    def getFragment(self):
+        "Return the final fragment"
+        #assert self.innerHTML
+        fragment = self.fragmentClass()
+        self.openElements[0].reparentChildren(fragment)
+        return fragment
+
     def testSerializer(self, node):
         """Serialize the subtree of node in the format required by unit tests
@@ -1,6 +1,8 @@
 import _base
 from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
 import new
+from xml.sax.saxutils import escape
+from constants import voidElements

 import re
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder):

     def commentClass(self, data):
         return NodeBuilder(self.dom.createComment(data))

+    def fragmentClass(self):
+        return NodeBuilder(self.dom.createDocumentFragment())
+
     def appendChild(self, node):
         self.dom.appendChild(node.element)
@@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder):

     def getDocument(self):
         return self.dom

+    def getFragment(self):
+        return _base.TreeBuilder.getFragment(self).element
+
     def insertText(self, data, parent=None):
         data=illegal_xml_chars.sub(u'\uFFFD',data)
@@ -118,7 +126,9 @@ def testSerializer(element):
     if element.nodeType == Node.DOCUMENT_TYPE_NODE:
         rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
     elif element.nodeType == Node.DOCUMENT_NODE:
         rv.append("#document")
+    elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+        rv.append("#document-fragment")
     elif element.nodeType == Node.COMMENT_NODE:
         rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
     elif element.nodeType == Node.TEXT_NODE:
@@ -135,6 +145,32 @@ def testSerializer(element):

    return "\n".join(rv)

+class HTMLSerializer(object):
+    def serialize(self, node):
+        rv = self.serializeNode(node)
+        for child in node.childNodes:
+            rv += self.serialize(child)
+        if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
+            rv += "</%s>\n"%node.nodeName
+        return rv
+
+    def serializeNode(self, node):
+        if node.nodeType == Node.TEXT_NODE:
+            rv = node.nodeValue
+        elif node.nodeType == Node.ELEMENT_NODE:
+            rv = "<%s"%node.nodeName
+            if node.hasAttributes():
+                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
+                    node.attributes.items()])
+            rv += ">"
+        elif node.nodeType == Node.COMMENT_NODE:
+            rv = "<!-- %s -->" % escape(node.nodeValue)
+        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
+            rv = "<!DOCTYPE %s>" % node.name
+        else:
+            rv = ""
+        return rv
+
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
@@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes: dom2sax(child, handler, nsmap)
        handler.endDocument()

+    elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+
    else:
        # ATTRIBUTE_NODE

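Taken together, these hunks give the minidom-backed treebuilder fragment support and a plain-HTML serializer. A usage sketch (illustration only; the serializer is the class added above, and minidom here is just the stdlib module):

    from xml.dom import minidom

    doc = minidom.getDOMImplementation().createDocument(None, 'p', None)
    doc.documentElement.appendChild(doc.createTextNode('hello'))

    # serialize() emits the open tag, recurses into children, then
    # closes any element not listed in voidElements.
    print HTMLSerializer().serialize(doc.documentElement)
    # -> <p>hello</p>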
@@ -129,6 +129,10 @@ class Document(Element):
    def __init__(self):
        Element.__init__(self, Document)

+class DocumentFragment(Element):
+    def __init__(self):
+        Element.__init__(self, DocumentFragment)
+
def testSerializer(element):
    rv = []
    finalText = None
@@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder):
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment
+    fragmentClass = DocumentFragment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.document._element
+
+    def getFragment(self):
+        return _base.TreeBuilder.getFragment(self)._element
@@ -4,6 +4,7 @@ from xml.sax.saxutils import escape

# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
+    type = -1
    def __init__(self, name):
        self.name = name
        self.parent = None
@@ -11,15 +12,18 @@ class Node(_base.Node):
        self.childNodes = []
        self._flags = []

+    def __iter__(self):
+        for node in self.childNodes:
+            yield node
+            for item in node:
+                yield item
+
    def __unicode__(self):
        return self.name

    def toxml(self):
        raise NotImplementedError

-    def __repr__(self):
-        return "<%s %s>" % (self.__class__, self.name)
-
    def printTree(self, indent=0):
        tree = '\n|%s%s' % (' '* indent, unicode(self))
        for child in self.childNodes:
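The new __iter__ makes a Node iterable over its whole subtree in document order (each child is yielded before its own descendants), so callers can scan a parsed tree without writing the recursion themselves. For instance (sketch, assuming tree is a parsed simpletree root):

    anchors = [node for node in tree if node.name == 'a']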
@@ -69,6 +73,7 @@ class Node(_base.Node):
        return bool(self.childNodes)

class Document(Node):
+    type = 1
    def __init__(self):
        Node.__init__(self, None)

@@ -93,7 +98,13 @@ class Document(Node):
        tree += child.printTree(2)
        return tree

+class DocumentFragment(Document):
+    type = 2
+    def __unicode__(self):
+        return "#document-fragment"
+
class DocumentType(Node):
+    type = 3
    def __init__(self, name):
        Node.__init__(self, name)

@@ -106,6 +117,7 @@ class DocumentType(Node):
        return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name

class TextNode(Node):
+    type = 4
    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value
@@ -119,6 +131,7 @@ class TextNode(Node):
    hilite = toxml

class Element(Node):
+    type = 5
    def __init__(self, name):
        Node.__init__(self, name)
        self.attributes = {}
@@ -164,6 +177,7 @@ class Element(Node):
        return tree

class CommentNode(Node):
+    type = 6
    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data
@@ -177,11 +191,38 @@ class CommentNode(Node):
    def hilite(self):
        return '<code class="markup comment"><!--%s--></code>' % escape(self.data)

+class HTMLSerializer(object):
+    def serialize(self, node):
+        rv = self.serializeNode(node)
+        for child in node.childNodes:
+            rv += self.serialize(child)
+        if node.type == Element.type and node.name not in voidElements:
+            rv += "</%s>\n"%node.name
+        return rv
+
+    def serializeNode(self, node):
+        if node.type == TextNode.type:
+            rv = node.value
+        elif node.type == Element.type:
+            rv = "<%s"%node.name
+            if node.attributes:
+                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
+                    node.attributes.iteritems()])
+            rv += ">"
+        elif node.type == CommentNode.type:
+            rv = "<!-- %s -->" % escape(node.data)
+        elif node.type == DocumentType.type:
+            rv = "<!DOCTYPE %s>" % node.name
+        else:
+            rv = ""
+        return rv
+
class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode
+    fragmentClass = DocumentFragment

    def testSerializer(self, node):
        return node.printTree()
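Because simpletree nodes lack minidom's nodeType, the type class attributes added throughout these hunks (1 through 6, with -1 as the Node default) give serializeNode an equivalent tag to dispatch on; node.name, node.value and node.data stand in for nodeName, nodeValue and data. Usage mirrors the DOM serializer sketched earlier, e.g. (builder assumed to be a simpletree TreeBuilder):

    html = HTMLSerializer().serialize(builder.getDocument())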
@@ -54,6 +54,7 @@ def run(template_file, doc, mode='template'):

    # Execute the shell module
    options = planet.config.template_options(template_file)
+    if module_name == 'plugin': options['__file__'] = template_file
    options.update(extra_options)
    log.debug("Processing %s %s using %s", mode,
        os.path.realpath(template_resolved), module_name)
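This one-liner is what lets a plugin know its configured name: the value stored under __file__ is popped again in plugin.py below and used both as sys.argv[0] and as the filename handed to imp.load_module. Roughly (values assumed for illustration, taken from the sample config):

    options = planet.config.template_options('mememe.plugin')
    options['__file__'] = 'mememe.plugin'
    # plugin.run() pops '__file__' and builds:
    #   sys.argv == ['mememe.plugin', '--sidebar', '//*[@id="footer"]']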
planet/shell/plugin.py (new file)
@@ -0,0 +1,64 @@
+import os, sys, imp
+from StringIO import StringIO
+
+def run(script, doc, output_file=None, options={}):
+    """ process a Python script using imp """
+    save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv)
+    plugin_stdout = StringIO()
+    plugin_stderr = StringIO()
+
+    try:
+        # redirect stdin
+        sys.stdin = StringIO(doc)
+
+        # redirect stdout
+        if output_file:
+            sys.stdout = open(output_file, 'w')
+        else:
+            sys.stdout = plugin_stdout
+
+        # redirect stderr
+        sys.stderr = plugin_stderr
+
+        # determine __file__ value
+        if options.has_key("__file__"):
+            plugin_file = options["__file__"]
+            del options["__file__"]
+        else:
+            plugin_file = script
+
+        # set sys.argv
+        options = sum([['--'+key, value] for key,value in options.items()], [])
+        sys.argv = [plugin_file] + options
+
+        # import script
+        handle = open(script, 'r')
+        cwd = os.getcwd()
+        try:
+            try:
+                try:
+                    description=('.plugin', 'rb', imp.PY_SOURCE)
+                    imp.load_module('__main__',handle,plugin_file,description)
+                except SystemExit,e:
+                    if e.code: log.error('%s exit rc=%d',(plugin_file,e.code))
+            except Exception, e:
+                import traceback
+                type, value, tb = sys.exc_info()
+                plugin_stderr.write(''.join(
+                    traceback.format_exception_only(type,value) +
+                    traceback.format_tb(tb)))
+        finally:
+            handle.close()
+            if cwd != os.getcwd(): os.chdir(cwd)
+
+    finally:
+        # restore system state
+        sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
+
+    # log anything sent to stderr
+    if plugin_stderr.getvalue():
+        import planet
+        planet.logger.error(plugin_stderr.getvalue())
+
+    # return stdout
+    return plugin_stdout.getvalue()
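Since plugins run in-process with stdin, stdout, stderr and argv all swapped as above, a plugin script is just a filter that happens to share the interpreter. A hypothetical minimal plugin (uppercase.plugin is invented here for illustration, not part of this commit):

    # uppercase.plugin -- illustrative only
    import sys

    doc = sys.stdin.read()                     # the document being filtered
    args = dict(zip(sys.argv[1::2], sys.argv[2::2]))  # {'--key': 'value'}

    sys.stdout.write(doc.upper())              # whatever is written becomes the output
    # anything written to sys.stderr is logged by Venus as an ERROR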
@@ -329,7 +329,6 @@ def httpThread(thread_index, input_queue, output_queue, log):

def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet """
-    # log = planet.getLogger(config.log_level(),config.log_format())
    log = planet.getLogger(config.log_level(),config.log_format())

    global index
runtests.py
@@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'):
if sys.path[0]: os.chdir(sys.path[0])
sys.path[0] = os.getcwd()

-# find all of the planet test modules
-modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
+# determine verbosity
+verbosity = 1
+for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
+    if arg in sys.argv:
+        verbosity = value
+        sys.argv.remove(arg)
+
+# find all of the planet test modules
+modules = []
+for pattern in sys.argv[1:] or ['test_*.py']:
+    modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))

-# enable warnings
+# enable logging
import planet
-planet.getLogger("WARNING",None)
+if verbosity == 0: planet.getLogger("FATAL",None)
+if verbosity == 1: planet.getLogger("WARNING",None)
+if verbosity == 2: planet.getLogger("DEBUG",None)

# load all of the tests into a suite
try:

@@ -33,11 +44,5 @@ except Exception, exception:
    for module in modules: __import__(module)
    raise

-verbosity = 1
-if "-q" in sys.argv or '--quiet' in sys.argv:
-    verbosity = 0
-if "-v" in sys.argv or '--verbose' in sys.argv:
-    verbosity = 2
-
# run test suite
unittest.TextTestRunner(verbosity=verbosity).run(suite)
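With these changes the runner accepts test-file glob patterns alongside the verbosity flags: for example, python runtests.py -v test_apply.py runs only the apply tests with DEBUG logging, while python runtests.py -q runs the full suite and logs only FATAL errors.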
tests/data/apply/config-mememe.ini (new file)
@@ -0,0 +1,29 @@
+[Planet]
+output_theme = classic_fancy
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+
+bill_of_materials:
+  images/#{face}
+
+[index.html.tmpl]
+filters:
+  html2xhtml.plugin
+  mememe.plugin
+
+[mememe.plugin]
+sidebar = //*[@class='sidebar']
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+face = jdub.png
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
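This fixture exercises the whole chain: html2xhtml.plugin runs first so mememe.plugin receives well-formed markup to insert the sidebar into. In terms of the new runner, each filter amounts to a call like the following (paths simplified; the real name resolution goes through planet.shell):

    from planet.shell import plugin
    html = plugin.run('mememe.plugin', html,
                      options={'sidebar': "//*[@class='sidebar']"})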
@@ -21,8 +21,7 @@ class ApplyTest(unittest.TestCase):
        os.makedirs(workdir)

    def tearDown(self):
-        shutil.rmtree(workdir)
-        os.removedirs(os.path.split(workdir)[0])
+        shutil.rmtree(os.path.split(workdir)[0])

    def test_apply_asf(self):
        config.load(configfile % 'asf')

@@ -65,7 +64,20 @@ class ApplyTest(unittest.TestCase):
        output = open(os.path.join(workdir, 'index.html4')).read()
        self.assertTrue(output.find('/>')<0)

+    def test_apply_filter_mememe(self):
+        config.load(configfile % 'mememe')
+        self.apply_fancy()
+
+        output = open(os.path.join(workdir, 'index.html')).read()
+        self.assertTrue(output.find('<div class="sidebar"><h2>Memes <a href="memes.atom">')>=0)
+
    def apply_fancy(self):
+        # drop slow templates unrelated to test at hand
+        templates = config.parser.get('Planet','template_files').split()
+        templates.remove('rss10.xml.tmpl')
+        templates.remove('rss20.xml.tmpl')
+        config.parser.set('Planet','template_files',' '.join(templates))
+
        splice.apply(self.feeddata)

        # verify that selected files are there