diff --git a/.bzrignore b/.bzrignore index 1d1886c..a8f0629 100644 --- a/.bzrignore +++ b/.bzrignore @@ -1,3 +1,4 @@ *.tmplc .DS_Store cache +*.pluginc diff --git a/THANKS b/THANKS index c6ab616..eb7c72f 100644 --- a/THANKS +++ b/THANKS @@ -13,6 +13,10 @@ Morten Frederiksen - Support WordPress LinkManager OPML Harry Fuecks - default item date to feed date Antonio Cavedoni - Django templates Morten Frederiksen - expungeCache +Lenny Domnitser - Coral CDN support for URLs with non-standard ports +Amit Chakradeo - Allow read-only files to be overwritten +Matt Brubeck - fix new_channel +Aristotle Pagaltzis - ensure byline_author filter doesn't drop foreign markup This codebase represents a radical refactoring of Planet 2.0, which lists the following contributors: diff --git a/docs/config.html b/docs/config.html index abb3f19..4a08ed7 100644 --- a/docs/config.html +++ b/docs/config.html @@ -68,6 +68,9 @@ can be found
filters
Space-separated list of filters to apply to each entry
+
filter_directories
+
Space-separated list of directories in which filters +can be found
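For instance, a configuration that names its filters and tells Venus where to look for them might read as follows (a minimal sketch; the second directory name is purely illustrative):

[Planet]
filters = excerpt.py
filter_directories = filters ~/local/venus-filters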
@@ -148,6 +151,7 @@ processed as templates. With Planet 2.0, it is possible to override parameters like items_per_page on a per template basis, but at the current time Planet Venus doesn't implement this.

+

Filters can be defined on a per-template basis, and will be used to post-process the output of the template.
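As an illustrative sketch, mirroring the opml-top100 example further down in this change, a template section can carry its own filter list:

[index.html.tmpl]
filters = mememe.plugin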

[filter]

Sections which are listed in [planet] filters are diff --git a/docs/etiquette.html b/docs/etiquette.html new file mode 100644 index 0000000..a567e77 --- /dev/null +++ b/docs/etiquette.html @@ -0,0 +1,48 @@ + + + + + +Etiquette + + +

Etiquette

+

You would think that people who publish syndication feeds do it with the +intent to be syndicated. But the truth is that we live in a world where +deep linking can +cause people to complain. Nothing is safe. But that doesn’t +stop us from doing links.

+ +

These concerns tend to increase when you profit, either directly via ads or +indirectly via search engine rankings, from the content of others.

+ +

While there are no hard and fast rules that apply here, here are a +few things you can do to mitigate the concern:

+ + + + diff --git a/docs/filters.html b/docs/filters.html index 865aa41..228f323 100644 --- a/docs/filters.html +++ b/docs/filters.html @@ -8,18 +8,21 @@ Venus Filters -

Filters

-

Filters are simple Unix pipes. Input comes in stdin, -parameters come from the config file, and output goes to stdout. -Anything written to stderr is logged as an ERROR message. If no -stdout is produced, the entry is not written to the cache or -processed further; in fact, if the entry had previously been written to the cache, it will be removed.

+

Filters and Plugins

+

Filters and plugins are simple Unix pipes. Input comes in +stdin, parameters come from the config file, and output goes to +stdout. Anything written to stderr is logged as an +ERROR message. If no stdout is produced, the entry is not written +to the cache or processed further; in fact, if the entry had previously been +written to the cache, it will be removed.

-

Input to a filter is a aggressively +

There are two types of filters supported by Venus: input and template.

+

Input to an input filter is an aggressively normalized entry. For example, if a feed is RSS 1.0 with 10 items, the filter will be called ten times, each with a single Atom 1.0 entry, with all textConstructs expressed as XHTML, and everything encoded as UTF-8.

+

Input to a template filter will be the output produced by the template.
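As a sketch of the pipe contract described above (this is not one of the shipped filters), an input filter that adds rel="nofollow" to every link in an entry could be as small as:

import sys
import xml.dom.minidom

# each invocation receives exactly one normalized Atom entry on stdin
entry = xml.dom.minidom.parse(sys.stdin).documentElement

# add rel="nofollow" to every anchor in the entry; any existing rel
# value is simply replaced in this sketch
for node in entry.getElementsByTagName('a'):
    if node.hasAttribute('href'):
        node.setAttribute('rel', 'nofollow')

# write the entry back to stdout; emitting nothing at all would cause
# the entry to be dropped from the cache instead
print entry.toxml('utf-8')

Saved under one of the configured filter_directories and named in a filters list, it would then be run once per entry.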

You will find a small set of example filters in the filters directory. The

Notes

diff --git a/docs/index.html b/docs/index.html index 0f3695c..c461d7f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -21,13 +21,14 @@
  • Other
  • Reference diff --git a/docs/templates.html b/docs/templates.html index 1eb9d7a..b9fd9c1 100644 --- a/docs/templates.html +++ b/docs/templates.html @@ -167,5 +167,18 @@ a planet:format attribute containing the referenced date formatted according to the [planet] date_format specified in the configuration
  • + +

    genshi

    +

    Genshi approaches the power of XSLT, but with a syntax that many Python +programmers find more natural, succinct and expressive. Genshi templates +have access to the full range of feedparser values, with the following additions:

    + diff --git a/examples/opml-top100.ini b/examples/opml-top100.ini index 0522472..01b210d 100644 --- a/examples/opml-top100.ini +++ b/examples/opml-top100.ini @@ -36,6 +36,13 @@ filters = excerpt.py omit = img p br width = 500 +# add memes to output +[index.html.tmpl] +filters = mememe.plugin + +[mememe.plugin] +sidebar = //*[@id="footer"] + # subscription list [http://share.opml.org/opml/top100.opml] content_type = opml diff --git a/filters/addsearch.genshi b/filters/addsearch.genshi new file mode 100644 index 0000000..f6f36ce --- /dev/null +++ b/filters/addsearch.genshi @@ -0,0 +1,30 @@ + + + +
    + ${select('*')} +

    Search

    +
    +
    + + + + + + ${select('*')} + + + + + + + + ${input} + + diff --git a/filters/addsearch.xslt b/filters/addsearch.xslt new file mode 100644 index 0000000..f96db81 --- /dev/null +++ b/filters/addsearch.xslt @@ -0,0 +1,70 @@ + + + + + + +

    Search

    +
    +
    +
    + + + + + + + / + + + + + + + + + + + + + + + + + + + + opensearchdescription.xml + + + + + + + + + + + + + + + + + + <!DOCTYPE html> + + + + + + + + + + + + +
    diff --git a/filters/coral_cdn_filter.py b/filters/coral_cdn_filter.py index 0192c63..e0a8c1c 100644 --- a/filters/coral_cdn_filter.py +++ b/filters/coral_cdn_filter.py @@ -3,14 +3,15 @@ Remap all images to take advantage of the Coral Content Distribution Network . """ -import sys, urlparse, xml.dom.minidom +import re, sys, urlparse, xml.dom.minidom entry = xml.dom.minidom.parse(sys.stdin).documentElement for node in entry.getElementsByTagName('img'): if node.hasAttribute('src'): component = list(urlparse.urlparse(node.getAttribute('src'))) - if component[0]=='http' and component[1].find(':')<0: + if component[0] == 'http': + component[1] = re.sub(r':(\d+)$', r'.\1', component[1]) component[1] += '.nyud.net:8080' node.setAttribute('src', urlparse.urlunparse(component)) diff --git a/filters/delDupName/byline_author.xslt b/filters/delDupName/byline_author.xslt new file mode 100644 index 0000000..ad1fbec --- /dev/null +++ b/filters/delDupName/byline_author.xslt @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/filters/delDupName/p_by_name.xslt b/filters/delDupName/p_by_name.xslt new file mode 100644 index 0000000..878904f --- /dev/null +++ b/filters/delDupName/p_by_name.xslt @@ -0,0 +1,17 @@ + + + + + + + + + + + + + diff --git a/filters/delDupName/p_from.xslt b/filters/delDupName/p_from.xslt new file mode 100644 index 0000000..c551838 --- /dev/null +++ b/filters/delDupName/p_from.xslt @@ -0,0 +1,15 @@ + + + + + + + + + + + + + diff --git a/filters/html2xhtml.plugin b/filters/html2xhtml.plugin new file mode 100644 index 0000000..456df48 --- /dev/null +++ b/filters/html2xhtml.plugin @@ -0,0 +1,6 @@ +import sys +from planet import html5lib +tree=html5lib.treebuilders.dom.TreeBuilder +parser = html5lib.html5parser.HTMLParser(tree=tree) +document = parser.parse(sys.stdin) +sys.stdout.write(document.toxml("utf-8")) diff --git a/filters/mememe.plugin b/filters/mememe.plugin new file mode 100644 index 0000000..36dea83 --- /dev/null +++ b/filters/mememe.plugin @@ -0,0 +1,480 @@ +# +# This Venus output filter will annotate an XHTML page with a list of +# "memes" (or most popular linked destinations, based on the last week +# of entries from the cache) and will update the subscription list with +# links to recent entries from each subscription. +# +# Templates that don't produce XHTML natively will need their output passed +# through html2xhtml.plugin first. 
+# +# Typical configuration (based on classic_fancy): +# +# [index.html.tmpl] +# filters: +# html2xhtml.plugin +# mememe.plugin +# +# [mememe.plugin] +# sidebar = @class='sidebar' +# + +import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5 +from xml.sax.saxutils import escape +from htmlentitydefs import entitydefs + +import planet +from planet import config, feedparser +from planet.spider import filename +log = planet.getLogger(config.log_level(),config.log_format()) +options = config.filter_options(sys.argv[0]) + +MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom') + +now = time.time() +week = 7 * 86400 +week_ago = now - week + +cache = config.cache_directory() +meme_cache = os.path.join(cache, 'memes') +if not os.path.exists(meme_cache): os.makedirs(meme_cache) + +bom = config.bill_of_materials() +if not 'images/tcosm11.gif' in bom: + bom.append('images/tcosm11.gif') + config.parser.set('Planet', 'bill_of_materials', ' '.join(bom)) + +all_links = {} +feed_links = {} + +def check_cache(url): + try: + file = open(filename(meme_cache, url)) + headers = eval(file.read()) + file.close() + return headers or {} + except: + return {} + +def cache_meme(url, headers): + json = [] + for key,value in headers.items(): + json.append(' %s: %s' % (toj(key), toj(value))) + file = open(filename(meme_cache, url),'w') + file.write('{\n' + ',\n'.join(json) + '\n}\n') + file.close() + +urlmap = {} +def canonicalize(url): + url = urlmap.get(url,url) + parts = list(urlparse.urlparse(url)) + + parts[0] = parts[0].lower() + parts[1] = parts[1].lower() + if parts[1].startswith('www.'): parts[1]=parts[1][4:] + if not parts[2]: parts[2] = '/' + parts[-1] = '' + return urlparse.urlunparse(parts) + +log.debug("Loading cached data") +for name in glob.glob(os.path.join(cache, '*')): + # ensure that this is within the past week + if os.path.isdir(name): continue + mtime = os.stat(name).st_mtime + if mtime < week_ago: continue + + # parse the file + try: + doc = libxml2.parseFile(name) + except: + continue + xp = doc.xpathNewContext() + xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom") + xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/") + + # determine the entry + entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']") + if not entry: continue + entry = canonicalize(entry[0].prop("href")) + + # determine the title + title = xp.xpathEval("/atom:entry/atom:title") + if title: + if title[0].prop('type') == 'html': + title = re.sub('<.*?>','',title[0].content) + else: + title = title[0].content + title = str(title or '') + + # determine the feed id + feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup") + if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id") + if not feed: continue + feed = feed[0].content + + # determine the author + author = xp.xpathEval("/atom:entry/atom:source/planet:name") + if author: + author = author[0].content + else: + author = '' + + # track the feed_links + if author: + if not feed_links.has_key(author): feed_links[author] = list() + feed_links[author].append([mtime, entry, title]) + + # identify the unique links + entry_links = [] + for node in doc.xpathEval("//*[@href and not(@rel='source')]"): + parent = node.parent + while parent: + if parent.name == 'source': break + parent = parent.parent + else: + link = canonicalize(node.prop('href')) + if not link in entry_links: + entry_links.append(link) + if node.hasProp('title') and node.prop('title').startswith('http'): + link = canonicalize(node.prop('title')) + if not 
link in entry_links: + entry_links.append(link) + + # add the votes + weight = 1.0 - (now - mtime)**2 / week**2 + vote = [(weight, str(entry), str(feed), title, author, mtime)] + for link in entry_links: + all_links[link] = all_links.get(link,list()) + vote + + # free the entry + doc.freeDoc() + +# tally the votes +weighted_links = [] +for link, votes in all_links.items(): + site = {} + updated = 0 + for weight, entry, feed, title, author, mtime in votes: + site[feed] = max(site.get(feed,0), weight) + if mtime > updated: updated=mtime + weighted_links.append((sum(site.values()), link, updated)) +weighted_links.sort() +weighted_links.reverse() + +cp1252 = { + 128: 8364, # euro sign + 130: 8218, # single low-9 quotation mark + 131: 402, # latin small letter f with hook + 132: 8222, # double low-9 quotation mark + 133: 8230, # horizontal ellipsis + 134: 8224, # dagger + 135: 8225, # double dagger + 136: 710, # modifier letter circumflex accent + 137: 8240, # per mille sign + 138: 352, # latin capital letter s with caron + 139: 8249, # single left-pointing angle quotation mark + 140: 338, # latin capital ligature oe + 142: 381, # latin capital letter z with caron + 145: 8216, # left single quotation mark + 146: 8217, # right single quotation mark + 147: 8220, # left double quotation mark + 148: 8221, # right double quotation mark + 149: 8226, # bullet + 150: 8211, # en dash + 151: 8212, # em dash + 152: 732, # small tilde + 153: 8482, # trade mark sign + 154: 353, # latin small letter s with caron + 155: 8250, # single right-pointing angle quotation mark + 156: 339, # latin small ligature oe + 158: 382, # latin small letter z with caron + 159: 376} # latin capital letter y with diaeresis + +# determine the title for a given url +class html(sgmllib.SGMLParser): + def __init__(self, url): + sgmllib.SGMLParser.__init__(self) + self.title = "" + self.feedurl = "" + self.intitle = False + + headers = check_cache(url) + + try: + # fetch the page + request = urllib2.Request(url) + request.add_header('User-Agent', 'Venus/MeMeme') + if headers.has_key('etag'): + request.add_header('If-None-Match', headers['etag']) + if headers.has_key('last_modified'): + request.add_header('If-Modified-Since', headers['last-modified']) + response = urllib2.urlopen(request) + self.feed(response.read()) + + # ensure the data is in utf-8 + try: + self.title = self.title.decode('utf-8') + except: + self.title = ''.join([unichr(cp1252.get(ord(c),ord(c))) + for c in self.title.decode('iso-8859-1')]) + + # cache the results + headers = {} + if self.feedurl: headers['feedurl'] = self.feedurl + if self.title: headers['title'] = self.title + headers.update(response.headers) + cache_meme(url, headers) + except: + self.feedurl = headers.get('feedurl') + if headers.has_key('title'): + if isinstance(headers['title'],str): + self.title=eval('u'+repr(headers['title']).replace('\\\\','\\')) + else: + self.title=headers['title'] + + # if there is a feed, look for an entry that matches, and take that title + if self.feedurl and not self.title: + headers = check_cache(self.feedurl) + data = feedparser.parse(self.feedurl, etag=headers.get('etag'), + modified=headers.get('last-modified')) + + if data.has_key('headers') and data.has_key('status') and \ + data.status in [200, 301, 302]: + + titles = {} + for entry in data.entries: + if entry.has_key('title_detail') and entry.has_key('link'): + titles[entry.link] = entry.title_detail.value + if entry.title_detail.type == 'text/plain': + titles[entry.link] = escape(titles[entry.link]) + + if 
titles.has_key(url): self.title = titles[url] + + data.headers.update(titles) + cache_meme(self.feedurl, data.headers) + else: + if headers.has_key(url): + if isinstance(headers[url],str): + self.title=eval('u'+repr(headers[url]).replace('\\\\','\\')) + else: + self.title=headers[url] + + # fallback is the basename of the URI + if not self.title: + self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0]) + + # parse out the first autodiscovery link + def start_link(self, attrs): + if self.feedurl: return + attrs = dict(map(lambda (k,v): (k.lower(),v), attrs)) + if not 'rel' in attrs: return + rels = attrs['rel'].split(' ') + if 'alternate' not in rels: return + if not 'type' in attrs or not attrs['type'].endswith('xml'): return + if 'href' in attrs: + self.feedurl = attrs['href'] + + # parse the page title + def start_title(self, attributes): + if not self.title: self.intitle = True + def end_title(self): + self.intitle = False + def handle_data(self, text): + if self.intitle: self.title += escape(text) + +# convert unicode string to a json string +def toj(value): + result = repr(value).replace(r'\x',r'\u00') + if result[:1] == 'u': result=result[1:] + if result.startswith("'"): + result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1] + return result + +seenit = [] +count = 0 + +# construct an empty feed +feed_doc = libxml2.newDoc("1.0") +meme_feed = feed_doc.newChild(None, "feed", None) +meme_feed.newNs('http://www.w3.org/2005/Atom', None) +meme_feed.newTextChild(None, 'title', config.name() + ': Memes') +author = meme_feed.newChild(None, 'author', None) +author.newTextChild(None, 'name', config.owner_name()) +if config.owner_email: author.newTextChild(None, 'email', config.owner_email()) +meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom')) +link = meme_feed.newChild(None, 'link', None) +link.setProp('href', os.path.join(config.link(), 'memes.atom')) +link.setProp('rel', 'self') +meme_feed.newTextChild(None, 'updated', + time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())) + +# parse the input +log.debug("Parse input") +doc=libxml2.parseDoc(sys.stdin.read()) + +# find the sidebar/footer +sidebar = options.get('sidebar','//*[@class="sidebar"]') +footer = doc.xpathEval(sidebar) +if not hasattr(footer,'__len__') or len(footer) == 0: + raise Exception(sidebar + ' not found') +if len(footer) > 1: + log.info("%d occurrences of %s found, taking first" % (len(footer),sidebar)) +footer = footer[0] + +# add up to 10 entry links to each subscription +subs_ul = footer.children +while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next +child = subs_ul.children +while child: + if child.name == 'li': + if child.lastChild().name == 'ul': child.lastChild().unlinkNode() + link = child.lastChild() + while link.isText(): link=link.prev + author = link.getContent() + state = 'inactive' + if feed_links.has_key(author): + ul2 = child.newChild(None, 'ul', None) + feed_links[author].sort() + feed_links[author].reverse() + link_count = 0 + for mtime, entry, title in feed_links[author]: + if not title: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a', title) + a.setProp('href', entry) + link_count = link_count + 1 + if link_count >= 10: break + if link_count > 0: state = None + if state: + link.setProp('class',((link.prop('class') or '') + ' ' + state).strip()) + child=child.next + +# create a h2 and ul for the memes list +footer_top = footer.children +memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 
'Memes ')) +memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None)) + +# create a header for the memes list +a = memes.newChild(None, 'a', None) +a.setProp('href', 'memes.atom') +img = a.newChild(None, 'img', None) +img.setProp('src', 'images/feed-icon-10x10.png') + +# collect the results +log.debug("Fetch titles and collect the results") +from urllib import quote_plus +for i in range(0,len(weighted_links)): + weight, link, updated = weighted_links[i] + + # ensure that somebody new points to this entry. This guards against + # groups of related links which several posts point to all. + novel = False + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry not in seenit: + seenit.append(entry) + novel = True + if not novel: continue + + all_links[link].sort() + all_links[link].reverse() + cache_file = filename(cache, link) + title = None + + # when possible, take the title from the cache + if os.path.exists(cache_file): + entry = feedparser.parse(cache_file).entries[0] + if entry.has_key('title_detail'): + title = entry.title_detail.value + if entry.title_detail.type == 'text/plain': title = escape(title) + + # otherwise, parse the html + if not title: + title = html(link).title + + # dehtmlize + title = re.sub('&(\w+);', + lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title) + title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title) + title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title) + + # title too long? Insert zero width spaces where appropriate + if max(map(len,title.split())) > 30: + title=re.sub('(\W+)',u'\\1\u200b',title) + + # save the entry title (it is used later) + entry_title = title.strip() + + # add to the memes list + memes_ul.addContent('\n') + li = memes_ul.newChild(None, 'li', None) + memes_ul.addContent('\n') + + # technorati link + a = li.newChild(None, 'a', None) + tlink = 'http://technorati.com/cosmos/search.html?url=' + if link.startswith('http://'): + a.setProp('href',tlink + quote_plus(link[7:])) + else: + a.setProp('href',tlink + quote_plus(link)) + a.setProp('title','cosmos') + img = a.newChild(None, 'img', None) + img.setProp('src','images/tcosm11.gif') + + # main link + a = li.newTextChild(None, 'a', title.strip().encode('utf-8')) + a.setProp('href',link) + if (((i==0) or (updated>=weighted_links[i-1][2])) and + (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))): + rank = 0 + for j in range(0,len(weighted_links)): + if updated < weighted_links[j][2]: rank = rank + 1 + if rank < len(weighted_links)/2: + a.setProp('class','rising') + + # voters + ul2 = li.newChild(None, 'ul', None) + voters = [] + for weight, entry, feed, title, author, mtime in all_links[link]: + if entry in voters: continue + li2 = ul2.newChild(None, 'li', None) + a = li2.newTextChild(None, 'a' , author) + a.setProp('href',entry) + if title: a.setProp('title',title) + voters.append(entry) + + # add to the meme feed + if len(all_links[link]) > 2: + meme_feed.addContent('\n') + entry = meme_feed.newChild(None, 'entry', None) + meme_feed.addContent('\n') + + # entry + tagbase = config.link().split('/') + if not tagbase[-1]: tagbase = tagbase[:-1] + tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:])) + entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest()) + entry.newTextChild(None, 'title', entry_title.encode('utf-8')) + meme_link = entry.newTextChild(None, 'link', None) + meme_link.setProp('href', link) + entry.newTextChild(None, 'updated', + 
time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated))) + + # voters + content = entry.newChild(None, 'content', None) + content.setProp('type', 'xhtml') + div = content.newTextChild(None, 'div', 'Spotted by:') + div.newNs('http://www.w3.org/1999/xhtml', None) + content_ul = div.newChild(None, 'ul', None) + for weight, entry, feed, title, author, mtime in all_links[link]: + li2 = content_ul.newTextChild(None, 'li', author + ": ") + a = li2.newTextChild(None, 'a' , title or 'untitled') + a.setProp('href',entry) + + count = count + 1 + if count >= 10: break + +log.info("Writing " + MEMES_ATOM) +output=open(MEMES_ATOM,'w') +output.write(feed_doc.serialize('utf-8')) +output.close() + +sys.stdout.write(doc.serialize('utf-8')) diff --git a/filters/xhtml2html.py b/filters/xhtml2html.py new file mode 100644 index 0000000..9c2073e --- /dev/null +++ b/filters/xhtml2html.py @@ -0,0 +1,5 @@ +import sys +from genshi.input import XMLParser +from genshi.output import HTMLSerializer + +print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8') diff --git a/planet/config.py b/planet/config.py index 53195c4..fb436e8 100644 --- a/planet/config.py +++ b/planet/config.py @@ -352,14 +352,15 @@ def filters(section=None): filters = [] if parser.has_option('Planet', 'filters'): filters += parser.get('Planet', 'filters').split() - if section and parser.has_option(section, 'filters'): - filters += parser.get(section, 'filters').split() if filter(section): filters.append('regexp_sifter.py?require=' + urllib.quote(filter(section))) if exclude(section): filters.append('regexp_sifter.py?exclude=' + - urllib.quote(filter(section))) + urllib.quote(exclude(section))) + for section in section and [section] or template_files(): + if parser.has_option(section, 'filters'): + filters += parser.get(section, 'filters').split() return filters def planet_options(): @@ -382,6 +383,10 @@ def template_options(section): """ dictionary of template specific options""" return feed_options(section) +def filter_options(section): + """ dictionary of filter specific options""" + return feed_options(section) + def write(file=sys.stdout): """ write out an updated template """ print parser.write(file) diff --git a/planet/feedparser.py b/planet/feedparser.py index e562d1f..9244646 100755 --- a/planet/feedparser.py +++ b/planet/feedparser.py @@ -11,8 +11,8 @@ Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs" -__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. +__version__ = "4.2-pre-" + "$Revision: 262 $"[11:14] + "-svn" +__license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -39,7 +39,8 @@ __contributors__ = ["Jason Diamond ", "John Beimler ", "Fazal Majid ", "Aaron Swartz ", - "Kevin Marks "] + "Kevin Marks ", + "Sam Ruby "] _debug = 0 # HTTP "User-Agent" header to send to servers when downloading feeds. 
@@ -229,6 +230,10 @@ class FeedParserDict(UserDict): if key == 'enclosures': norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] + if key == 'license': + for link in UserDict.__getitem__(self, 'links'): + if link['rel']=='license' and link.has_key('href'): + return link['href'] if key == 'categories': return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] realkey = self.keymap.get(key, key) @@ -424,7 +429,7 @@ class _FeedParserMixin: } _matchnamespaces = {} - can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] + can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] html_types = ['text/html', 'application/xhtml+xml'] @@ -460,6 +465,7 @@ class _FeedParserMixin: self.langstack = [] self.baseuri = baseuri or '' self.lang = baselang or None + self.svgOK = 0 if baselang: self.feeddata['language'] = baselang.replace('_','-') @@ -514,6 +520,7 @@ class _FeedParserMixin: attrs.append(('xmlns',namespace)) if tag=='svg' and namespace=='http://www.w3.org/2000/svg': attrs.append(('xmlns',namespace)) + if tag == 'svg': self.svgOK = 1 return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) # match namespaces @@ -549,6 +556,7 @@ class _FeedParserMixin: prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' + if suffix == 'svg': self.svgOK = 0 # call special handler (if defined) or default handler methodname = '_end_' + prefix + suffix @@ -1247,17 +1255,26 @@ class _FeedParserMixin: self._save('expired_parsed', _parse_date(self.pop('expired'))) def _start_cc_license(self, attrsD): - self.push('license', 1) + context = self._getContext() value = self._getAttribute(attrsD, 'rdf:resource') - if value: - self.elementstack[-1][2].append(value) - self.pop('license') + attrsD = FeedParserDict() + attrsD['rel']='license' + if value: attrsD['href']=value + context.setdefault('links', []).append(attrsD) def _start_creativecommons_license(self, attrsD): self.push('license', 1) + _start_creativeCommons_license = _start_creativecommons_license def _end_creativecommons_license(self): - self.pop('license') + value = self.pop('license') + context = self._getContext() + attrsD = FeedParserDict() + attrsD['rel']='license' + if value: attrsD['href']=value + context.setdefault('links', []).append(attrsD) + del context['license'] + _end_creativeCommons_license = _end_creativecommons_license def _addXFN(self, relationships, href, name): context = self._getContext() @@ -1349,12 +1366,13 @@ class _FeedParserMixin: self._save('link', value) def _start_title(self, attrsD): - if self.incontent: return self.unknown_starttag('title', attrsD) + if self.svgOK: return self.unknown_starttag('title', attrsD.items()) self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) _start_dc_title = _start_title _start_media_title = _start_title def _end_title(self): + if self.svgOK: return value = self.popContent('title') if not value: return context = self._getContext() @@ -2233,27 +2251,41 @@ def 
_resolveRelativeURIs(htmlSource, baseURI, encoding, type): return p.output() class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', - 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', - 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', - 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', - 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', - 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', - 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', - 'ul', 'var'] + acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article', + 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', + 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', + 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', + 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer', + 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', + 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', + 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', + 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', + 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', + 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', + 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', - 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', - 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', - 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', - 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', - 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', - 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', - 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', - 'type', 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] + 'action', 'align', 'alt', 'autoplay', 'autocomplete', 'autofocus', 'axis', + 'background', 'balance', 'bgcolor', 'bgproperties', 'border', + 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', + 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', + 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', + 'colspan', 'compact', 'contenteditable', 'coords', 'data', 'datafld', + 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir', + 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', + 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', + 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', + 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', + 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', + 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', + 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', + 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', + 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', + 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 
'span', 'src', + 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', + 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', + 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', + 'xml:lang'] unacceptable_elements_with_end_tag = ['script', 'applet'] @@ -2300,36 +2332,38 @@ class _HTMLSanitizer(_BaseHTMLProcessor): svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', - 'linearGradient', 'line', 'metadata', 'missing-glyph', 'mpath', 'path', - 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg', - 'switch', 'text', 'title', 'use'] + 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', + 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', + 'svg', 'switch', 'text', 'title', 'tspan', 'use'] # svgtiny + class + opacity + offset + xmlns + xmlns:xlink svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', 'arabic-form', 'ascent', 'attributeName', 'attributeType', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', - 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', - 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family', - 'font-size', 'font-stretch', 'font-style', 'font-variant', + 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', + 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', + 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes', - 'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity', - 'origin', 'overline-position', 'overline-thickness', 'panose-1', - 'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', - 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', - 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', - 'stop-color', 'stop-opacity', 'strikethrough-position', - 'strikethrough-thickness', 'stroke', 'stroke-dasharray', - 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', - 'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target', - 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', - 'underline-position', 'underline-thickness', 'unicode', - 'unicode-range', 'units-per-em', 'values', 'version', 'viewBox', - 'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2', - 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role', - 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang', - 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] + 'lang', 'mathematical', 'marker-end', 'marker-mid', 'marker-start', + 'markerHeight', 'markerUnits', 'markerWidth', 'max', 'min', 'name', + 'offset', 'opacity', 'orient', 'origin', 'overline-position', + 'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', + 'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', 'repeatDur', + 'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx', + 'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity', + 'strikethrough-position', 'strikethrough-thickness', 'stroke', + 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', + 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', + 'stroke-width', 
'systemLanguage', 'target', 'text-anchor', 'to', + 'transform', 'type', 'u1', 'u2', 'underline-position', + 'underline-thickness', 'unicode', 'unicode-range', 'units-per-em', + 'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x', + 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', 'xlink:href', + 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', + 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', + 'zoomAndPan'] svg_attr_map = None svg_elem_map = None @@ -3506,7 +3540,8 @@ class TextSerializer(Serializer): class PprintSerializer(Serializer): def write(self, stream=sys.stdout): - stream.write(self.results['href'] + '\n\n') + if self.results.has_key('href'): + stream.write(self.results['href'] + '\n\n') from pprint import pprint pprint(self.results, stream) stream.write('\n') @@ -3767,4 +3802,3 @@ if __name__ == '__main__': # currently supports rel-tag (maps to 'tags'), rel-enclosure (maps to # 'enclosures'), XFN links within content elements (maps to 'xfn'), # and hCard (parses as vCard); bug [ 1481975 ] Misencoded utf-8/win-1252 - diff --git a/planet/html5lib/html5parser.py b/planet/html5lib/html5parser.py index a007616..898ec9f 100644 --- a/planet/html5lib/html5parser.py +++ b/planet/html5lib/html5parser.py @@ -71,35 +71,40 @@ class HTMLParser(object): "trailingEnd": TrailingEndPhase(self, self.tree) } - def parse(self, stream, encoding=None, innerHTML=False): - """Parse a HTML document into a well-formed tree - - stream - a filelike object or string containing the HTML to be parsed - - innerHTML - Are we parsing in innerHTML mode (note innerHTML=True - is not yet supported) - - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) - """ - + def _parse(self, stream, innerHTML=False, container="div", + encoding=None): + self.tree.reset() self.firstStartTag = False self.errors = [] - self.phase = self.phases["initial"] + self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding, + parseMeta=innerHTML) + + if innerHTML: + self.innerHTML = container.lower() + + if self.innerHTML in ('title', 'textarea'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] + elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'): + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"] + elif self.innerHTML == 'plaintext': + self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"] + else: + # contentModelFlag already is PCDATA + #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"] + pass + self.phase = self.phases["rootElement"] + self.phase.insertHtmlElement() + self.resetInsertionMode() + else: + self.innerHTML = False + self.phase = self.phases["initial"] + # We only seem to have InBodyPhase testcases where the following is # relevant ... 
need others too self.lastPhase = None - # We don't actually support innerHTML yet but this should allow - # assertations - self.innerHTML = innerHTML - - self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding) - # XXX This is temporary for the moment so there isn't any other # changes needed for the parser to work with the iterable tokenizer for token in self.tokenizer: @@ -118,7 +123,34 @@ class HTMLParser(object): # When the loop finishes it's EOF self.phase.processEOF() + def parse(self, stream, encoding=None): + """Parse a HTML document into a well-formed tree + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, innerHTML=False, encoding=encoding) return self.tree.getDocument() + + def parseFragment(self, stream, container="div", encoding=None): + """Parse a HTML fragment into a well-formed tree fragment + + container - name of the element we're setting the innerHTML property + if set to None, default to 'div' + + stream - a filelike object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) + """ + self._parse(stream, True, container=container, encoding=encoding) + return self.tree.getFragment() def parseError(self, data="XXX ERROR MESSAGE NEEDED"): # XXX The idea is to make data mandatory. @@ -187,28 +219,29 @@ class HTMLParser(object): "frameset":"inFrameset" } for node in self.tree.openElements[::-1]: + nodeName = node.name if node == self.tree.openElements[0]: last = True - if node.name not in ['td', 'th']: + if nodeName not in ['td', 'th']: # XXX assert self.innerHTML - raise NotImplementedError + nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case - if node.name in ("select", "colgroup", "head", "frameset"): + if nodeName in ("select", "colgroup", "head", "frameset"): # XXX assert self.innerHTML - if node.name in newModes: - self.phase = self.phases[newModes[node.name]] + if nodeName in newModes: + self.phase = self.phases[newModes[nodeName]] break - elif node.name == "html": + elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] else: self.phase = self.phases["afterHead"] break elif last: - self.phase = self.phases["body"] + self.phase = self.phases["inBody"] break class Phase(object): @@ -434,9 +467,7 @@ class InHeadPhase(Phase): self.parser.phase.processCharacters(data) def startTagHead(self, name, attributes): - self.tree.insertElement(name, attributes) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + self.parser.parseError(_(u"Unexpected start tag head in existing head. 
Ignored")) def startTagTitle(self, name, attributes): element = self.tree.createElement(name, attributes) @@ -455,10 +486,11 @@ class InHeadPhase(Phase): self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] def startTagScript(self, name, attributes): + #XXX Inner HTML case may be wrong element = self.tree.createElement(name, attributes) element._flags.append("parser-inserted") - if self.tree.headPointer is not None and\ - self.parser.phase == self.parser.phases["inHead"]: + if (self.tree.headPointer is not None and + self.parser.phase == self.parser.phases["inHead"]): self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) @@ -653,8 +685,8 @@ class InBodyPhase(Phase): def startTagBody(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (body).")) - if len(self.tree.openElements) == 1 \ - or self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 + or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: for attr, value in attributes.iteritems(): @@ -1179,6 +1211,7 @@ class InTablePhase(Phase): self.parser.resetInsertionMode() else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagIgnore(self, name): @@ -1215,23 +1248,25 @@ class InCaptionPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", True) + def processCharacters(self, data): self.parser.phases["inBody"].processCharacters(data) def startTagTableElement(self, name, attributes): self.parser.parseError() + #XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX how do we know the tag is _always_ ignored in the innerHTML - # case and therefore shouldn't be processed again? I'm not sure this - # strategy makes sense... - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inBody"].processStartTag(name, attributes) def endTagCaption(self, name): - if self.tree.elementInScope(name, True): + if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": @@ -1244,14 +1279,15 @@ class InCaptionPhase(Phase): self.parser.phase = self.parser.phases["inTable"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() self.parser.phase.processEndTag("caption") - # XXX ... 
- if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + if not ignoreEndTag: + self.parser.phase.processEndTag(name) def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ @@ -1279,10 +1315,13 @@ class InColumnGroupPhase(Phase): ]) self.endTagHandler.default = self.endTagOther + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" + def processCharacters(self, data): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processCharacters(data) def startTagCol(self, name ,attributes): @@ -1290,14 +1329,15 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() def startTagOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def endTagColgroup(self, name): - if self.tree.openElements[-1].name == "html": + if self.ignoreEndTagColgroup(): # innerHTML case + assert self.parser.innerHTML self.parser.parseError() else: self.tree.openElements.pop() @@ -1308,9 +1348,9 @@ class InColumnGroupPhase(Phase): u"col has no end tag.")) def endTagOther(self, name): + ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") - # XXX how can be sure it's always ignored? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) @@ -1359,9 +1399,9 @@ class InTableBodyPhase(Phase): def startTagTableOther(self, name, attributes): # XXX AT Any ideas on how to share this with endTagTable? - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processStartTag(name, attributes) @@ -1382,9 +1422,9 @@ class InTableBodyPhase(Phase): ") in the table body phase. Ignored.")) def endTagTable(self, name): - if self.tree.elementInScope("tbody", True) or \ - self.tree.elementInScope("thead", True) or \ - self.tree.elementInScope("tfoot", True): + if (self.tree.elementInScope("tbody", True) or + self.tree.elementInScope("thead", True) or + self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() self.endTagTableRowGroup(self.tree.openElements[-1].name) self.parser.phase.processEndTag(name) @@ -1428,6 +1468,9 @@ class InRowPhase(Phase): self.tree.openElements[-1].name + u") in the row phase.")) self.tree.openElements.pop() + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", tableVariant=True) + # the rest def processCharacters(self, data): self.parser.phases["inTable"].processCharacters(data) @@ -1439,28 +1482,31 @@ class InRowPhase(Phase): self.tree.activeFormattingElements.append(Marker) def startTagTableOther(self, name, attributes): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? 
- if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.parser.phases["inTable"].processStartTag(name, attributes) def endTagTr(self, name): - if self.tree.elementInScope("tr", True): + if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTableBody"] else: # innerHTML case + assert self.parser.innerHTML self.parser.parseError() def endTagTable(self, name): + ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? - if not self.parser.innerHTML: + if not ignoreEndTag: self.parser.phase.processEndTag(name) def endTagTableRowGroup(self, name): @@ -1628,7 +1674,7 @@ class InSelectPhase(Phase): u"select phase. Ignored.")) def endTagSelect(self, name): - if self.tree.elementInScope(name, True): + if self.tree.elementInScope("select", True): node = self.tree.openElements.pop() while node.name != "select": node = self.tree.openElements.pop() @@ -1641,7 +1687,7 @@ class InSelectPhase(Phase): self.parser.parseError(_(u"Unexpected table end tag (" + name +\ ") in the select phase.")) if self.tree.elementInScope(name, True): - self.endTagSelect() + self.endTagSelect("select") self.parser.phase.processEndTag(name) def endTagOther(self, name): @@ -1736,8 +1782,8 @@ class InFramesetPhase(Phase): u"in the frameset phase (innerHTML).")) else: self.tree.openElements.pop() - if not self.parser.innerHTML and\ - self.tree.openElements[-1].name != "frameset": + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): # If we're not in innerHTML mode and the the current node is not a # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] diff --git a/planet/html5lib/inputstream.py b/planet/html5lib/inputstream.py index 9140456..e197415 100644 --- a/planet/html5lib/inputstream.py +++ b/planet/html5lib/inputstream.py @@ -14,7 +14,7 @@ class HTMLInputStream(object): """ - def __init__(self, source, encoding=None, chardet=True): + def __init__(self, source, encoding=None, parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -26,6 +26,8 @@ class HTMLInputStream(object): the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) + + parseMeta - Look for a element containing encoding information """ # List of where new lines occur @@ -41,12 +43,9 @@ class HTMLInputStream(object): #Encoding to use if no other information can be found self.defaultEncoding = "windows-1252" - #Autodetect encoding if no other information can be found? 
- self.chardet = chardet - #Detect encoding iff no explicit "transport level" encoding is supplied if encoding is None or not isValidEncoding(encoding): - encoding = self.detectEncoding() + encoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = encoding # Read bytes from stream decoding them into Unicode @@ -79,17 +78,17 @@ class HTMLInputStream(object): stream = cStringIO.StringIO(str(source)) return stream - def detectEncoding(self): + def detectEncoding(self, parseMeta=True, chardet=True): #First look for a BOM #This will also read past the BOM if present encoding = self.detectBOM() #If there is no BOM need to look for meta elements with encoding #information - if encoding is None: + if encoding is None and parseMeta: encoding = self.detectEncodingMeta() #Guess with chardet, if avaliable - if encoding is None and self.chardet: + if encoding is None and chardet: try: import chardet buffer = self.rawStream.read() diff --git a/planet/html5lib/tokenizer.py b/planet/html5lib/tokenizer.py index 3f4db08..584b268 100644 --- a/planet/html5lib/tokenizer.py +++ b/planet/html5lib/tokenizer.py @@ -32,8 +32,8 @@ class HTMLTokenizer(object): # XXX need to fix documentation - def __init__(self, stream, encoding=None): - self.stream = HTMLInputStream(stream, encoding) + def __init__(self, stream, encoding=None, parseMeta=True): + self.stream = HTMLInputStream(stream, encoding, parseMeta) self.states = { "data":self.dataState, @@ -338,31 +338,33 @@ class HTMLTokenizer(object): self.state = self.states["closeTagOpen"] else: self.tokenQueue.append({"type": "Characters", "data": u"<"}) - self.stream.queue.append(data) + self.stream.queue.insert(0, data) self.state = self.states["data"] return True def closeTagOpenState(self): - if self.contentModelFlag in (contentModelFlags["RCDATA"],\ - contentModelFlags["CDATA"]): - charStack = [] + if (self.contentModelFlag in (contentModelFlags["RCDATA"], + contentModelFlags["CDATA"])): + if self.currentToken: + charStack = [] - # So far we know that "", u"/", u"<", EOF))): # Because the characters are correct we can safely switch to diff --git a/planet/html5lib/treebuilders/_base.py b/planet/html5lib/treebuilders/_base.py index 2502466..6c7bb0b 100755 --- a/planet/html5lib/treebuilders/_base.py +++ b/planet/html5lib/treebuilders/_base.py @@ -108,6 +108,9 @@ class TreeBuilder(object): #The class to use for creating doctypes doctypeClass = None + + #Fragment class + fragmentClass = None def __init__(self): self.reset() @@ -294,7 +297,6 @@ class TreeBuilder(object): fosterParent = self.openElements[ self.openElements.index(lastTable) - 1] else: - assert self.innerHTML fosterParent = self.openElements[0] return fosterParent, insertBefore @@ -310,6 +312,13 @@ class TreeBuilder(object): def getDocument(self): "Return the final tree" return self.document + + def getFragment(self): + "Return the final fragment" + #assert self.innerHTML + fragment = self.fragmentClass() + self.openElements[0].reparentChildren(fragment) + return fragment def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests diff --git a/planet/html5lib/treebuilders/dom.py b/planet/html5lib/treebuilders/dom.py index 8b52d6a..bfaa880 100755 --- a/planet/html5lib/treebuilders/dom.py +++ b/planet/html5lib/treebuilders/dom.py @@ -1,6 +1,8 @@ import _base from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE import new +from xml.sax.saxutils import escape +from constants import voidElements import re illegal_xml_chars = 
re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") @@ -87,6 +89,9 @@ class TreeBuilder(_base.TreeBuilder): def commentClass(self, data): return NodeBuilder(self.dom.createComment(data)) + + def fragmentClass(self): + return NodeBuilder(self.dom.createDocumentFragment()) def appendChild(self, node): self.dom.appendChild(node.element) @@ -96,6 +101,9 @@ class TreeBuilder(_base.TreeBuilder): def getDocument(self): return self.dom + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element def insertText(self, data, parent=None): data=illegal_xml_chars.sub(u'\uFFFD',data) @@ -118,7 +126,9 @@ def testSerializer(element): if element.nodeType == Node.DOCUMENT_TYPE_NODE: rv.append("|%s"%(' '*indent, element.name)) elif element.nodeType == Node.DOCUMENT_NODE: - rv.append("#document") + rv.append("#document") + elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + rv.append("#document-fragment") elif element.nodeType == Node.COMMENT_NODE: rv.append("|%s"%(' '*indent, element.nodeValue)) elif element.nodeType == Node.TEXT_NODE: @@ -135,6 +145,32 @@ def testSerializer(element): return "\n".join(rv) +class HTMLSerializer(object): + def serialize(self, node): + rv = self.serializeNode(node) + for child in node.childNodes: + rv += self.serialize(child) + if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements: + rv += "\n"%node.nodeName + return rv + + def serializeNode(self, node): + if node.nodeType == Node.TEXT_NODE: + rv = node.nodeValue + elif node.nodeType == Node.ELEMENT_NODE: + rv = "<%s"%node.nodeName + if node.hasAttributes(): + rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in + node.attributes.items()]) + rv += ">" + elif node.nodeType == Node.COMMENT_NODE: + rv = "" % escape(node.nodeValue) + elif node.nodeType == Node.DOCUMENT_TYPE_NODE: + rv = "" % node.name + else: + rv = "" + return rv + def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): if node.nodeType == Node.ELEMENT_NODE: if not nsmap: @@ -179,7 +215,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): elif node.nodeType == Node.DOCUMENT_NODE: handler.startDocument() for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endDocument() + handler.endDocument() + + elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: dom2sax(child, handler, nsmap) else: # ATTRIBUTE_NODE diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py index acead55..2629664 100644 --- a/planet/html5lib/treebuilders/etreefull.py +++ b/planet/html5lib/treebuilders/etreefull.py @@ -129,6 +129,10 @@ class Document(Element): def __init__(self): Element.__init__(self, Document) +class DocumentFragment(Element): + def __init__(self): + Element.__init__(self, DocumentFragment) + def testSerializer(element): rv = [] finalText = None @@ -211,9 +215,13 @@ class TreeBuilder(_base.TreeBuilder): doctypeClass = DocumentType elementClass = Element commentClass = Comment + fragmentClass = DocumentFragment def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.document._element + + def getFragment(self): + return _base.TreeBuilder.getFragment(self)._element diff --git a/planet/html5lib/treebuilders/simpletree.py b/planet/html5lib/treebuilders/simpletree.py index 6b2f09e..05dc0c0 100755 --- a/planet/html5lib/treebuilders/simpletree.py +++ b/planet/html5lib/treebuilders/simpletree.py @@ -4,6 +4,7 @@ from xml.sax.saxutils import escape # Really crappy basic implementation of a DOM-core 
 class Node(_base.Node):
+    type = -1
     def __init__(self, name):
         self.name = name
         self.parent = None
@@ -11,15 +12,18 @@
         self.childNodes = []
         self._flags = []
 
+    def __iter__(self):
+        for node in self.childNodes:
+            yield node
+            for item in node:
+                yield item
+
     def __unicode__(self):
         return self.name
 
     def toxml(self):
         raise NotImplementedError
 
-    def __repr__(self):
-        return "<%s %s>" % (self.__class__, self.name)
-
     def printTree(self, indent=0):
         tree = '\n|%s%s' % (' '* indent, unicode(self))
         for child in self.childNodes:
@@ -69,6 +73,7 @@
         return bool(self.childNodes)
 
 class Document(Node):
+    type = 1
     def __init__(self):
         Node.__init__(self, None)
 
@@ -93,7 +98,13 @@
             tree += child.printTree(2)
         return tree
 
+class DocumentFragment(Document):
+    type = 2
+    def __unicode__(self):
+        return "#document-fragment"
+
 class DocumentType(Node):
+    type = 3
     def __init__(self, name):
         Node.__init__(self, name)
 
@@ -106,6 +117,7 @@
         return '<!DOCTYPE %s>' % self.name
 
 class TextNode(Node):
+    type = 4
     def __init__(self, value):
         Node.__init__(self, None)
         self.value = value
@@ -119,6 +131,7 @@
     hilite = toxml
 
 class Element(Node):
+    type = 5
     def __init__(self, name):
         Node.__init__(self, name)
         self.attributes = {}
@@ -164,6 +177,7 @@
         return tree
 
 class CommentNode(Node):
+    type = 6
     def __init__(self, data):
         Node.__init__(self, None)
         self.data = data
@@ -177,11 +191,38 @@
     def hilite(self):
         return '<!--%s-->' % escape(self.data)
 
+class HTMLSerializer(object):
+    def serialize(self, node):
+        rv = self.serializeNode(node)
+        for child in node.childNodes:
+            rv += self.serialize(child)
+        if node.type == Element.type and node.name not in voidElements:
+            rv += "</%s>\n"%node.name
+        return rv
+
+    def serializeNode(self, node):
+        if node.type == TextNode.type:
+            rv = node.value
+        elif node.type == Element.type:
+            rv = "<%s"%node.name
+            if node.attributes:
+                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
+                    node.attributes.iteritems()])
+            rv += ">"
+        elif node.type == CommentNode.type:
+            rv = "<!--%s-->" % escape(node.data)
+        elif node.type == DocumentType.type:
+            rv = "<!DOCTYPE %s>" % node.name
+        else:
+            rv = ""
+        return rv
+
 class TreeBuilder(_base.TreeBuilder):
     documentClass = Document
     doctypeClass = DocumentType
     elementClass = Element
     commentClass = CommentNode
+    fragmentClass = DocumentFragment
 
     def testSerializer(self, node):
         return node.printTree()
diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py
index 18c764a..dd2abd3 100644
--- a/planet/shell/__init__.py
+++ b/planet/shell/__init__.py
@@ -44,13 +44,17 @@ def run(template_file, doc, mode='template'):
     base,ext = os.path.splitext(os.path.basename(template_resolved))
     module_name = ext[1:]
     try:
-        module = __import__(module_name)
+        try:
+            module = __import__("_" + module_name)
+        except:
+            module = __import__(module_name)
     except Exception, inst:
         return log.error("Skipping %s '%s' after failing to load '%s': %s",
             mode, template_resolved, module_name, inst)
 
     # Execute the shell module
     options = planet.config.template_options(template_file)
+    if module_name == 'plugin': options['__file__'] = template_file
     options.update(extra_options)
     log.debug("Processing %s %s using %s", mode,
         os.path.realpath(template_resolved), module_name)
@@ -60,3 +64,4 @@
     output_dir = planet.config.output_dir()
     output_file = os.path.join(output_dir, base)
     module.run(template_resolved, doc, output_file, options)
+    return output_file
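For orientation (illustrative, not part of the patch): with the dispatcher change above, the extension of a template or filter file picks the shell module, so index.html.genshi is handled by _genshi, mememe.plugin by plugin, and run() now reports back the name of the file it wrote. A hedged sketch of driving it in filter mode, mirroring the calls made by the tests added later in this patch:

    # illustrative only; the filter and test file are the ones used by the new tests
    from planet import shell
    testfile = 'tests/data/filter/index.html'
    output = shell.run('xhtml2html.py', open(testfile).read(), mode="filter")
    # 'output' holds the filter's stdout as a string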
diff --git a/planet/shell/_genshi.py b/planet/shell/_genshi.py
new file mode 100644
index 0000000..5dffab2
--- /dev/null
+++ b/planet/shell/_genshi.py
@@ -0,0 +1,143 @@
+from StringIO import StringIO
+from xml.sax.saxutils import escape
+
+from genshi.input import HTMLParser, XMLParser
+from genshi.template import Context, MarkupTemplate
+
+subscriptions = []
+feed_types = [
+    'application/atom+xml',
+    'application/rss+xml',
+    'application/rdf+xml'
+]
+
+def norm(value):
+    """ Convert to Unicode """
+    if hasattr(value,'items'):
+        return dict([(norm(n),norm(v)) for n,v in value.items()])
+
+    try:
+        return value.decode('utf-8')
+    except:
+        return value.decode('iso-8859-1')
+
+def find_config(config, feed):
+    # match based on self link
+    for link in feed.links:
+        if link.has_key('rel') and link.rel=='self':
+            if link.has_key('type') and link.type in feed_types:
+                if link.has_key('href') and link.href in subscriptions:
+                    return norm(dict(config.parser.items(link.href)))
+
+    # match based on name
+    for sub in subscriptions:
+        if config.parser.has_option(sub, 'name') and \
+            norm(config.parser.get(sub, 'name')) == feed.planet_name:
+            return norm(dict(config.parser.items(sub)))
+
+    return {}
+
+class XHTMLParser(object):
+    """ parse an XHTML fragment """
+    def __init__(self, text):
+        self.parser = XMLParser(StringIO("
    %s
    " % text)) + self.depth = 0 + def __iter__(self): + self.iter = self.parser.__iter__() + return self + def next(self): + object = self.iter.next() + if object[0] == 'END': self.depth = self.depth - 1 + predepth = self.depth + if object[0] == 'START': self.depth = self.depth + 1 + if predepth: return object + return self.next() + +def streamify(text,bozo): + """ add a .stream to a _detail textConstruct """ + if text.type == 'text/plain': + text.stream = HTMLParser(StringIO(escape(text.value))) + elif text.type == 'text/html' or bozo != 'false': + text.stream = HTMLParser(StringIO(text.value)) + else: + text.stream = XHTMLParser(text.value) + +def run(script, doc, output_file=None, options={}): + """ process an Genshi template """ + + context = Context(**options) + + tmpl_fileobj = open(script) + tmpl = MarkupTemplate(tmpl_fileobj, script) + tmpl_fileobj.close() + + if not output_file: + # filter + context.push({'input':XMLParser(StringIO(doc))}) + else: + # template + import time + from planet import config,feedparser + from planet.spider import filename + + # gather a list of subscriptions, feeds + global subscriptions + feeds = [] + sources = config.cache_sources_directory() + for sub in config.subscriptions(): + data=feedparser.parse(filename(sources,sub)) + data.feed.config = norm(dict(config.parser.items(sub))) + if data.feed.has_key('link'): + feeds.append((data.feed.config.get('name',''),data.feed)) + subscriptions.append(norm(sub)) + feeds.sort() + + # annotate each entry + new_date_format = config.new_date_format() + vars = feedparser.parse(StringIO(doc)) + vars.feeds = [value for name,value in feeds] + last_feed = None + last_date = None + for entry in vars.entries: + entry.source.config = find_config(config, entry.source) + + # add new_feed and new_date fields + entry.new_feed = entry.source.id + entry.new_date = date = None + if entry.has_key('published_parsed'): date=entry.published_parsed + if entry.has_key('updated_parsed'): date=entry.updated_parsed + if date: entry.new_date = time.strftime(new_date_format, date) + + # remove new_feed and new_date fields if not "new" + if entry.new_date == last_date: + entry.new_date = None + if entry.new_feed == last_feed: + entry.new_feed = None + else: + last_feed = entry.new_feed + elif entry.new_date: + last_date = entry.new_date + last_feed = None + + # add streams for all text constructs + for key in entry.keys(): + if key.endswith("_detail") and entry[key].has_key('type') and \ + entry[key].has_key('value'): + streamify(entry[key],entry.source.planet_bozo) + if entry.has_key('content'): + for content in entry.content: + streamify(content,entry.source.planet_bozo) + + # add cumulative feed information to the Genshi context + vars.feed.config = dict(config.parser.items('Planet',True)) + context.push(vars) + + # apply template + output=tmpl.generate(context).render('xml') + + if output_file: + out_file = open(output_file,'w') + out_file.write(output) + out_file.close() + else: + return output diff --git a/planet/shell/plugin.py b/planet/shell/plugin.py new file mode 100644 index 0000000..dd94380 --- /dev/null +++ b/planet/shell/plugin.py @@ -0,0 +1,64 @@ +import os, sys, imp +from StringIO import StringIO + +def run(script, doc, output_file=None, options={}): + """ process an Python script using imp """ + save_sys = (sys.stdin, sys.stdout, sys.stderr, sys.argv) + plugin_stdout = StringIO() + plugin_stderr = StringIO() + + try: + # redirect stdin + sys.stdin = StringIO(doc) + + # redirect stdout + if output_file: + sys.stdout = 
+        else:
+            sys.stdout = plugin_stdout
+
+        # redirect stderr
+        sys.stderr = plugin_stderr
+
+        # determine __file__ value
+        if options.has_key("__file__"):
+            plugin_file = options["__file__"]
+            del options["__file__"]
+        else:
+            plugin_file = script
+
+        # set sys.argv
+        options = sum([['--'+key, value] for key,value in options.items()], [])
+        sys.argv = [plugin_file] + options
+
+        # import script
+        handle = open(script, 'r')
+        cwd = os.getcwd()
+        try:
+            try:
+                try:
+                    description=('.plugin', 'rb', imp.PY_SOURCE)
+                    imp.load_module('__main__',handle,plugin_file,description)
+                except SystemExit,e:
+                    if e.code: plugin_stderr.write('%s exit rc=%d\n' % (plugin_file,e.code))
+            except Exception, e:
+                import traceback
+                type, value, tb = sys.exc_info()
+                plugin_stderr.write(''.join(
+                    traceback.format_exception_only(type,value) +
+                    traceback.format_tb(tb)))
+        finally:
+            handle.close()
+            if cwd != os.getcwd(): os.chdir(cwd)
+
+    finally:
+        # restore system state
+        sys.stdin, sys.stdout, sys.stderr, sys.argv = save_sys
+
+    # log anything sent to stderr
+    if plugin_stderr.getvalue():
+        import planet
+        planet.logger.error(plugin_stderr.getvalue())
+
+    # return stdout
+    return plugin_stdout.getvalue()
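For context (illustrative, not part of the patch): a plugin is an ordinary Python script executed in-process by the loader above, with stdin, stdout, stderr and sys.argv swapped out around it. A hypothetical minimal plugin, with an invented name and --prefix option, might look like:

    # hypothetical example, e.g. saved as prefix.plugin and listed in a filters list
    # the document arrives on stdin; config options arrive as --key value pairs in sys.argv
    import sys
    prefix = ''
    if '--prefix' in sys.argv:
        prefix = sys.argv[sys.argv.index('--prefix') + 1]
    sys.stdout.write(prefix + sys.stdin.read())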
diff --git a/planet/shell/tmpl.py b/planet/shell/tmpl.py
index 620f45e..4f4d822 100644
--- a/planet/shell/tmpl.py
+++ b/planet/shell/tmpl.py
@@ -102,7 +102,7 @@ Items = [
     ['enclosure_type', String, 'links', {'rel': 'enclosure'}, 'type'],
     ['id', String, 'id'],
     ['link', String, 'links', {'rel': 'alternate'}, 'href'],
-    ['new_channel', String, 'id'],
+    ['new_channel', String, 'source', 'id'],
     ['new_date', NewDate, 'published_parsed'],
     ['new_date', NewDate, 'updated_parsed'],
     ['rights', String, 'rights_detail', 'value'],
@@ -226,7 +226,7 @@ def template_info(source):
             date = item['new_date']
 
         if item.has_key('new_channel'):
-            if item['new_channel'] == channel:
+            if item['new_channel'] == channel and not item.has_key('new_date'):
                 del item['new_channel']
             else:
                 channel = item['new_channel']
@@ -241,12 +241,15 @@ def run(script, doc, output_file=None, options={}):
     for key,value in template_info(doc).items():
         tp.set(key, value)
 
-    reluri = os.path.splitext(os.path.basename(output_file))[0]
-    tp.set('url', urlparse.urljoin(config.link(),reluri))
+    if output_file:
+        reluri = os.path.splitext(os.path.basename(output_file))[0]
+        tp.set('url', urlparse.urljoin(config.link(),reluri))
 
-    output = open(output_file, "w")
-    output.write(tp.process(template))
-    output.close()
+        output = open(output_file, "w")
+        output.write(tp.process(template))
+        output.close()
+    else:
+        return tp.process(template)
 
 if __name__ == '__main__':
     sys.path.insert(0, os.path.split(sys.path[0])[0])
diff --git a/planet/spider.py b/planet/spider.py
index e88d203..b18a787 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -323,14 +323,12 @@ def httpThread(thread_index, input_queue, output_queue, log):
             for line in (traceback.format_exception_only(type, value) +
                 traceback.format_tb(tb)):
                 log.error(line.rstrip())
-            continue
 
         output_queue.put(block=True, item=(uri, feed_info, feed))
         uri, feed_info = input_queue.get(block=True)
 
 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
-    # log = planet.getLogger(config.log_level(),config.log_format())
     log = planet.getLogger(config.log_level(),config.log_format())
 
     global index
diff --git a/planet/splice.py b/planet/splice.py
index ed2a856..ccc55e0 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -111,9 +111,37 @@ def apply(doc):
     if not os.path.exists(output_dir): os.makedirs(output_dir)
     log = planet.getLogger(config.log_level(),config.log_format())
 
+    planet_filters = config.filters('Planet')
+
     # Go-go-gadget-template
     for template_file in config.template_files():
-        shell.run(template_file, doc)
+        output_file = shell.run(template_file, doc)
+
+        # run any template specific filters
+        if config.filters(template_file) != planet_filters:
+            output = open(output_file).read()
+            for filter in config.filters(template_file):
+                if filter in planet_filters: continue
+                if filter.find('>')>0:
+                    # tee'd output
+                    filter,dest = filter.split('>',1)
+                    tee = shell.run(filter.strip(), output, mode="filter")
+                    if tee:
+                        output_dir = planet.config.output_dir()
+                        dest_file = os.path.join(output_dir, dest.strip())
+                        dest_file = open(dest_file,'w')
+                        dest_file.write(tee)
+                        dest_file.close()
+                else:
+                    # pipe'd output
+                    output = shell.run(filter, output, mode="filter")
+                    if not output:
+                        os.unlink(output_file)
+                        break
+            else:
+                handle = open(output_file,'w')
+                handle.write(output)
+                handle.close()
 
     # Process bill of materials
     for copy_file in config.bill_of_materials():
@@ -123,6 +151,9 @@
             if os.path.exists(source): break
         else:
             log.error('Unable to locate %s', copy_file)
+            log.info("Template search path:")
+            for template_dir in config.template_directories():
+                log.info("  %s", os.path.realpath(template_dir))
             continue
 
         mtime = os.stat(source).st_mtime
@@ -131,5 +162,6 @@
         if not os.path.exists(dest_dir): os.makedirs(dest_dir)
 
         log.info("Copying %s to %s", source, dest)
+        if os.path.exists(dest): os.chmod(dest, 0644)
         shutil.copyfile(source, dest)
         shutil.copystat(source, dest)
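A hedged usage sketch of the splice change above (assuming splice.splice() builds the feed document, as the main planet driver does, and that the spider cache referenced by the config has already been populated): with a tee filter such as xhtml2html.py>index.html4 configured for a template, apply() renders the template, pipes the result through the filter, and writes the filtered copy to index.html4 alongside the original.

    # illustrative only; the config file is one of the test configs added below
    from planet import config, splice
    config.load('tests/data/apply/config-html.ini')
    doc = splice.splice()                 # assumed to splice cached entries into one feed
    splice.apply(doc.toxml('utf-8'))      # renders index.html, tees index.html4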
diff --git a/runtests.py b/runtests.py
index d14058d..7783d14 100755
--- a/runtests.py
+++ b/runtests.py
@@ -18,12 +18,23 @@ if not hasattr(unittest.TestCase, 'assertFalse'):
 if sys.path[0]: os.chdir(sys.path[0])
 sys.path[0] = os.getcwd()
 
-# find all of the planet test modules
-modules = map(fullmodname, glob.glob(os.path.join('tests', 'test_*.py')))
+# determine verbosity
+verbosity = 1
+for arg,value in (('-q',0),('--quiet',0),('-v',2),('--verbose',2)):
+    if arg in sys.argv:
+        verbosity = value
+        sys.argv.remove(arg)
 
-# enable warnings
+# find all of the planet test modules
+modules = []
+for pattern in sys.argv[1:] or ['test_*.py']:
+    modules += map(fullmodname, glob.glob(os.path.join('tests', pattern)))
+
+# enable logging
 import planet
-planet.getLogger("WARNING",None)
+if verbosity == 0: planet.getLogger("FATAL",None)
+if verbosity == 1: planet.getLogger("WARNING",None)
+if verbosity == 2: planet.getLogger("DEBUG",None)
 
 # load all of the tests into a suite
 try:
@@ -33,11 +44,5 @@ except Exception, exception:
     for module in modules:
         __import__(module)
     raise
-verbosity = 1
-if "-q" in sys.argv or '--quiet' in sys.argv:
-    verbosity = 0
-if "-v" in sys.argv or '--verbose' in sys.argv:
-    verbosity = 2
-
 # run test suite
 unittest.TextTestRunner(verbosity=verbosity).run(suite)
diff --git a/tests/data/apply/config-filter.ini b/tests/data/apply/config-filter.ini
new file mode 100644
index 0000000..6bea6db
--- /dev/null
+++ b/tests/data/apply/config-filter.ini
@@ -0,0 +1,21 @@
+[Planet]
+output_theme = asf
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+filter_directories = tests/data/apply
+
+[index.html.xslt]
+filters = rebase.py?base=http://example.com/
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
diff --git a/tests/data/apply/config-genshi.ini b/tests/data/apply/config-genshi.ini
new file mode 100644
index 0000000..28c9c2f
--- /dev/null
+++ b/tests/data/apply/config-genshi.ini
@@ -0,0 +1,21 @@
+[Planet]
+output_theme = genshi_fancy
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+
+bill_of_materials:
+  images/#{face}
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+face = jdub.png
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
diff --git a/tests/data/apply/config-html.ini b/tests/data/apply/config-html.ini
new file mode 100644
index 0000000..635b552
--- /dev/null
+++ b/tests/data/apply/config-html.ini
@@ -0,0 +1,25 @@
+[Planet]
+output_theme = genshi_fancy
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+
+bill_of_materials:
+  images/#{face}
+
+[index.html.genshi]
+filters:
+  xhtml2html.py>index.html4
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+face = jdub.png
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
diff --git a/tests/data/apply/config-mememe.ini b/tests/data/apply/config-mememe.ini
new file mode 100644
index 0000000..c6ca9bd
--- /dev/null
+++ b/tests/data/apply/config-mememe.ini
@@ -0,0 +1,29 @@
+[Planet]
+output_theme = classic_fancy
+output_dir = tests/work/apply
+name = test planet
+cache_directory = tests/work/spider/cache
+
+bill_of_materials:
+  images/#{face}
+
+[index.html.tmpl]
+filters:
+  html2xhtml.plugin
+  mememe.plugin
+
+[mememe.plugin]
+sidebar = //*[@class='sidebar']
+
+[tests/data/spider/testfeed0.atom]
+name = not found
+
+[tests/data/spider/testfeed1b.atom]
+name = one
+face = jdub.png
+
+[tests/data/spider/testfeed2.atom]
+name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three
diff --git a/tests/data/apply/rebase.py b/tests/data/apply/rebase.py
new file mode 100644
index 0000000..9cd77d1
--- /dev/null
+++ b/tests/data/apply/rebase.py
@@ -0,0 +1,24 @@
+# make href attributes absolute, using base argument passed in
+
+import sys
+try:
+    base = sys.argv[sys.argv.index('--base')+1]
+except:
+    sys.stderr.write('Missing required argument: base\n')
+    sys.exit()
+
+from xml.dom import minidom, Node
+from urlparse import urljoin
+
+def rebase(node, newbase):
+    if node.hasAttribute('href'):
+        href=node.getAttribute('href')
+        if href != urljoin(base,href):
+            node.setAttribute('href', urljoin(base,href))
+    for child in node.childNodes:
+        if child.nodeType == Node.ELEMENT_NODE:
+            rebase(child, newbase)
+
+doc = minidom.parse(sys.stdin)
+rebase(doc.documentElement, base)
+print doc.toxml('utf-8')
diff --git a/tests/data/filter/coral_cdn.xml b/tests/data/filter/coral_cdn.xml
index 072353c..c31f7aa 100644
--- a/tests/data/filter/coral_cdn.xml
+++ b/tests/data/filter/coral_cdn.xml
@@ -1,7 +1,10 @@
    - + Plain old image: + Host has a non-standard port: + A non-port colon: + Several colons:
    diff --git a/tests/data/filter/index.html b/tests/data/filter/index.html new file mode 100644 index 0000000..4e621fc --- /dev/null +++ b/tests/data/filter/index.html @@ -0,0 +1,18 @@ + + + + +Planet Intertwingly')>=0) + + def test_xhtml2html_filter(self): + testfile = 'tests/data/filter/index.html' + filter = 'xhtml2html.py' + output = shell.run(filter, open(testfile).read(), mode="filter") + self.assertTrue(output.find('/>')<0) + self.assertTrue(output.find('')>=0) + +try: + import genshi +except: + logger.warn("Genshi is not available => can't test genshi filters") + for method in dir(GenshiFilterTests): + if method.startswith('test_'): delattr(GenshiFilterTests,method) diff --git a/tests/test_filter_xslt.py b/tests/test_filter_xslt.py index 061f805..46c4e82 100644 --- a/tests/test_filter_xslt.py +++ b/tests/test_filter_xslt.py @@ -15,14 +15,30 @@ class XsltFilterTests(unittest.TestCase): catterm = dom.getElementsByTagName('category')[0].getAttribute('term') self.assertEqual('OnE', catterm) + def test_addsearch_filter(self): + testfile = 'tests/data/filter/index.html' + filter = 'addsearch.xslt' + output = shell.run(filter, open(testfile).read(), mode="filter") + self.assertTrue(output.find('

    Search

    ')>=0) + self.assertTrue(output.find('
')>=0)
+        self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
+        self.assertTrue(output.find('')>=0)
+
 try:
     import libxslt
 except:
     try:
-        from subprocess import Popen, PIPE
-        xsltproc=Popen(['xsltproc','--version'],stdout=PIPE,stderr=PIPE)
-        xsltproc.communicate()
-        if xsltproc.returncode != 0: raise ImportError
+        try:
+            # Python 2.5 bug 1704790 workaround (alas, Unix only)
+            import commands
+            if commands.getstatusoutput('xsltproc --version')[0] != 0:
+                raise ImportError
+        except:
+            from subprocess import Popen, PIPE
+            xsltproc=Popen(['xsltproc','--version'],stdout=PIPE,stderr=PIPE)
+            xsltproc.communicate()
+            if xsltproc.returncode != 0: raise ImportError
     except:
         logger.warn("libxslt is not available => can't test xslt filters")
         del XsltFilterTests.test_xslt_filter
+        del XsltFilterTests.test_addsearch_filter
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 02c5f57..e8b9488 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -11,8 +11,11 @@ class FilterTests(unittest.TestCase):
         output = shell.run(filter, open(testfile).read(), mode="filter")
         dom = xml.dom.minidom.parseString(output)
 
-        imgsrc = dom.getElementsByTagName('img')[0].getAttribute('src')
-        self.assertEqual('http://example.com.nyud.net:8080/foo.png', imgsrc)
+        imgsrcs = [img.getAttribute('src') for img in dom.getElementsByTagName('img')]
+        self.assertEqual('http://example.com.nyud.net:8080/foo.png', imgsrcs[0])
+        self.assertEqual('http://example.com.1234.nyud.net:8080/foo.png', imgsrcs[1])
+        self.assertEqual('http://u:p@example.com.nyud.net:8080/foo.png', imgsrcs[2])
+        self.assertEqual('http://u:p@example.com.1234.nyud.net:8080/foo.png', imgsrcs[3])
 
     def test_excerpt_images1(self):
         config.load('tests/data/filter/excerpt-images.ini')
@@ -108,17 +111,44 @@ class FilterTests(unittest.TestCase):
 
         self.assertNotEqual('', output)
 
+    def test_regexp_filter2(self):
+        config.load('tests/data/filter/regexp-sifter2.ini')
+
+        testfile = 'tests/data/filter/category-one.xml'
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        self.assertNotEqual('', output)
+
+        testfile = 'tests/data/filter/category-two.xml'
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        self.assertEqual('', output)
+
 try:
     from subprocess import Popen, PIPE
 
-    _no_sed = False
-    try:
-        sed = Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
-        sed.communicate()
-        if sed.returncode != 0:
-            _no_sed = True
-    except WindowsError:
-        _no_sed = True
+    _no_sed = True
+
+    if _no_sed:
+        try:
+            # Python 2.5 bug 1704790 workaround (alas, Unix only)
+            import commands
+            if commands.getstatusoutput('sed --version')[0]==0: _no_sed = False
+        except:
+            pass
+
+    if _no_sed:
+        try:
+            sed = Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+            sed.communicate()
+            if sed.returncode == 0: _no_sed = False
+        except WindowsError:
+            pass
 
     if _no_sed:
         logger.warn("sed is not available => can't test stripAd_yahoo")
diff --git a/themes/asf/default.css b/themes/asf/default.css
index 0c8c0ef..64d7076 100644
--- a/themes/asf/default.css
+++ b/themes/asf/default.css
@@ -208,7 +208,7 @@ body > h1 {
   text-align: right;
 }
 
-#body h2.date {
+#body > h2 {
   text-transform: none;
   font-size: medium;
   color: #333;
@@ -466,11 +466,28 @@ ul.tags a:link, ul.tags a:visited {
   color:green
 }
 
+a[rel='tag'] img {
+  border: 0;
+}
+
 /* DiveIntoMark */
 .framed {
   float: none;
 }
 
+/* BurningBird */
+.update:before {
+  content: 'Update';
+  font-weight: bold;
+}
+
+.update {
+  margin: 2em;
+  padding: 0 1em 0 1em;
+  background: #eee;
+  border: 1px solid #aaa;
+}
+
 /* ----------------------------- Footer ---------------------------- */
 #footer {
diff --git a/themes/asf/index.html.xslt b/themes/asf/index.html.xslt
index f4ff71e..e286771 100644
--- a/themes/asf/index.html.xslt
+++ b/themes/asf/index.html.xslt
@@ -49,9 +49,9 @@
    Last updated:
    - + +
    Powered by:
    @@ -131,7 +131,7 @@ - +
      @@ -165,10 +165,12 @@ -

      - - , - +

      +

      @@ -231,9 +233,9 @@ at - + + diff --git a/themes/asf/personalize.js b/themes/asf/personalize.js index 2aa1dc8..4f36d8a 100644 --- a/themes/asf/personalize.js +++ b/themes/asf/personalize.js @@ -71,6 +71,7 @@ function createCookie(name,value,days) { // read a cookie function readCookie(name) { var nameEQ = name + "="; + if (!document.cookie) return; var ca = document.cookie.split(';'); for(var i=0;i < ca.length;i++) { var c = ca[i]; @@ -134,11 +135,27 @@ function addOption(event) { } } -// convert date to local time +// Parse an HTML5-liberalized version of RFC 3339 datetime values +Date.parseRFC3339 = function (string) { + var date=new Date(); + date.setTime(0); + var match = string.match(/(\d{4})-(\d\d)-(\d\d)\s*(?:[\sT]\s*(\d\d):(\d\d)(?::(\d\d))?(\.\d*)?\s*(Z|([-+])(\d\d):(\d\d))?)?/); + if (!match) return; + if (match[2]) match[2]--; + if (match[7]) match[7] = (match[7]+'000').substring(1,4); + var field = [null,'FullYear','Month','Date','Hours','Minutes','Seconds','Milliseconds']; + for (var i=1; i<=7; i++) if (match[i]) date['setUTC'+field[i]](match[i]); + if (match[9]) date.setTime(date.getTime()+ + (match[9]=='-'?1:-1)*(match[10]*3600000+match[11]*60000) ); + return date.getTime(); +} + +// convert datetime to local date var localere = /^(\w+) (\d+) (\w+) \d+ 0?(\d\d?:\d\d):\d\d ([AP]M) (EST|EDT|CST|CDT|MST|MDT|PST|PDT)/; function localizeDate(element) { var date = new Date(); - date.setTime(Date.parse(element.innerHTML + " GMT")); + date.setTime(Date.parseRFC3339(element.getAttribute('datetime'))); + if (!date.getTime()) return; var local = date.toLocaleString(); var match = local.match(localere); @@ -160,13 +177,13 @@ function localizeDate(element) { // find entries (and localizeDates) function findEntries() { - var span = document.getElementsByTagName('span'); + var times = document.getElementsByTagName('time'); - for (var i=0; i no - + diff --git a/themes/common/images/tcosm11.gif b/themes/common/images/tcosm11.gif new file mode 100644 index 0000000..548c998 Binary files /dev/null and b/themes/common/images/tcosm11.gif differ diff --git a/themes/genshi_fancy/config.ini b/themes/genshi_fancy/config.ini new file mode 100644 index 0000000..d5a127d --- /dev/null +++ b/themes/genshi_fancy/config.ini @@ -0,0 +1,20 @@ +# This theme reimplements the classic "fancy" htmltmpl using genshi + +[Planet] +template_files: + atom.xml.xslt + foafroll.xml.xslt + index.html.genshi + opml.xml.xslt + rss10.xml.tmpl + rss20.xml.tmpl + +template_directories: + ../common + ../classic_fancy + +bill_of_materials: + planet.css + images/feed-icon-10x10.png + images/logo.png + images/venus.png diff --git a/themes/genshi_fancy/index.html.genshi b/themes/genshi_fancy/index.html.genshi new file mode 100644 index 0000000..fe26934 --- /dev/null +++ b/themes/genshi_fancy/index.html.genshi @@ -0,0 +1,95 @@ + + + + + +$feed.config.name + + + + + + + +

      $feed.config.name

      + + + +
      +

      $entry.new_date

      + +
      +

      $entry.source.config.name

      + + + +

      $entry.title_detail.stream

      + +
      +
      +${entry.content[0].stream} +${entry.summary_detail.stream} +
      + +

      by $entry.author_detail.name at $entry.updated

      +
      + +
      +
      + +
      + + + + + diff --git a/themes/genshi_fancy/planet.css b/themes/genshi_fancy/planet.css new file mode 100644 index 0000000..05653c0 --- /dev/null +++ b/themes/genshi_fancy/planet.css @@ -0,0 +1,150 @@ +body { + border-right: 1px solid black; + margin-right: 200px; + + padding-left: 20px; + padding-right: 20px; +} + +h1 { + margin-top: 0px; + padding-top: 20px; + + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + letter-spacing: -2px; + text-transform: lowercase; + text-align: right; + + color: grey; +} + +.admin { + text-align: right; +} + +h2 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + color: #200080; + + margin-left: -20px; +} + +h3 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: normal; + + background-color: #a0c0ff; + border: 1px solid #5080b0; + + padding: 4px; +} + +h3 a { + text-decoration: none; + color: inherit; +} + +h4 { + font-family: "Bitstream Vera Sans", sans-serif; + font-weight: bold; +} + +h4 a { + text-decoration: none; + color: inherit; +} + +img.face { + float: right; + margin-top: -3em; +} + +.entry { + margin-bottom: 2em; +} + +.entry .date { + font-family: "Bitstream Vera Sans", sans-serif; + color: grey; +} + +.entry .date a { + text-decoration: none; + color: inherit; +} + +.sidebar { + position: absolute; + top: 0px; + right: 0px; + width: 200px; + + margin-left: 0px; + margin-right: 0px; + padding-right: 0px; + + padding-top: 20px; + padding-left: 0px; + + font-family: "Bitstream Vera Sans", sans-serif; + font-size: 85%; +} + +.sidebar h2 { + font-size: 110%; + font-weight: bold; + color: black; + + padding-left: 5px; + margin-left: 0px; +} + +.sidebar ul { + padding-left: 1em; + margin-left: 0px; + + list-style-type: none; +} + +.sidebar ul li:hover { + color: grey; +} + +.sidebar ul li a { + text-decoration: none; +} + +.sidebar ul li a:hover { + text-decoration: underline; +} + +.sidebar ul li a img { + border: 0; +} + +.sidebar p { + border-top: 1px solid grey; + margin-top: 30px; + padding-top: 10px; + + padding-left: 5px; +} + +.sidebar .message { + cursor: help; + border-bottom: 1px dashed red; +} + +.sidebar a.message:hover { + cursor: help; + background-color: #ff0000; + color: #ffffff !important; + text-decoration: none !important; +} + +a:hover { + text-decoration: underline !important; + color: blue !important; +}
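A final usage note on the runtests.py change above: verbosity flags and test-file patterns can now be combined on the command line, for example:

    python runtests.py -v test_filters.py test_filter_xslt.py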