planet/filters/mememe.plugin

#
# This Venus output filter will annotate an XHTML page with a list of
# "memes" (or most popular linked destinations, based on the last week
# of entries from the cache) and will update the subscription list with
# links to recent entries from each subscription.
#
# Templates that don't produce XHTML natively will need their output passed
# through html2xhtml.plugin first.
#
# Typical configuration (based on classic_fancy):
#
#   [index.html.tmpl]
#   filters:
#     html2xhtml.plugin
#     mememe.plugin
#
#   [mememe.plugin]
#   sidebar = @class='sidebar'
#
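# The filter reads its options from the [mememe.plugin] section of the
# configuration, writes a "memes.atom" feed into the output directory, and
# emits the annotated page on stdout.
#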
import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
from planet import config, feedparser
from planet.spider import filename
log = planet.logger
options = config.filter_options(sys.argv[0])
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
now = time.time()
week = 7 * 86400
week_ago = now - week
cache = config.cache_directory()
meme_cache = os.path.join(cache, 'memes')
if not os.path.exists(meme_cache): os.makedirs(meme_cache)
bom = config.bill_of_materials()
if not 'images/tcosm11.gif' in bom:
    bom.append('images/tcosm11.gif')
    config.parser.set('Planet', 'bill_of_materials', ' '.join(bom))
all_links = {}
feed_links = {}
def check_cache(url):
    try:
        file = open(filename(meme_cache, url))
        headers = eval(file.read())
        file.close()
        return headers or {}
    except:
        return {}

def cache_meme(url, headers):
    json = []
    for key, value in headers.items():
        json.append(' %s: %s' % (toj(key), toj(value)))
    file = open(filename(meme_cache, url), 'w')
    file.write('{\n' + ',\n'.join(json) + '\n}\n')
    file.close()
urlmap = {}
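# canonicalize() collapses trivial variations of the same URL into one key:
# the scheme and host are lowercased, a leading "www." is stripped, an empty
# path becomes "/", and the fragment is dropped.  For example, both
# 'HTTP://WWW.Example.com/post#comments' and 'http://example.com/post'
# reduce to 'http://example.com/post'.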
def canonicalize(url):
    url = urlmap.get(url, url)
    parts = list(urlparse.urlparse(url))

    parts[0] = parts[0].lower()
    parts[1] = parts[1].lower()
    if parts[1].startswith('www.'): parts[1] = parts[1][4:]
    if not parts[2]: parts[2] = '/'
    parts[-1] = ''

    return urlparse.urlunparse(parts)
log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
    # ensure that this is within the past week
    if os.path.isdir(name): continue
    mtime = os.stat(name).st_mtime
    if mtime < week_ago: continue

    # parse the file
    try:
        doc = libxml2.parseFile(name)
    except:
        continue
    xp = doc.xpathNewContext()
    xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
    xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")
    # determine the entry
    entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
    if not entry: continue
    entry = canonicalize(entry[0].prop("href"))

    # determine the title
    title = xp.xpathEval("/atom:entry/atom:title")
    if title:
        if title[0].prop('type') == 'html':
            title = re.sub('<.*?>', '', title[0].content)
        else:
            title = title[0].content
    title = str(title or '')
    # determine the feed id
    feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
    if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
    if not feed: continue
    feed = feed[0].content

    # determine the author
    author = xp.xpathEval("/atom:entry/atom:source/planet:name")
    if author:
        author = author[0].content
    else:
        author = ''

    # track the feed_links
    if author:
        if not feed_links.has_key(author): feed_links[author] = list()
        feed_links[author].append([mtime, entry, title])
    # identify the unique links (skipping anything inside a source element)
    entry_links = []
    for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
        parent = node.parent
        while parent:
            if parent.name == 'source': break
            parent = parent.parent
        else:
            link = canonicalize(node.prop('href'))
            if not link in entry_links:
                entry_links.append(link)
        if node.hasProp('title') and node.prop('title').startswith('http'):
            link = canonicalize(node.prop('title'))
            if not link in entry_links:
                entry_links.append(link)
    # add the votes
    weight = 1.0 - (now - mtime)**2 / week**2
    vote = [(weight, str(entry), str(feed), title, author, mtime)]
    for link in entry_links:
        all_links[link] = all_links.get(link, list()) + vote

    # free the entry
    doc.freeDoc()
# tally the votes
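# Each link's score is the sum, over all feeds that mention it, of that
# feed's strongest vote, so a single feed linking repeatedly only counts
# once.  A vote's weight decays quadratically with the age of the entry:
# weight = 1.0 - age**2 / week**2, so an entry from a day ago is worth
# roughly 0.98 and one from six days ago roughly 0.27.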
weighted_links = []
for link, votes in all_links.items():
    site = {}
    updated = 0
    for weight, entry, feed, title, author, mtime in votes:
        site[feed] = max(site.get(feed, 0), weight)
        if mtime > updated: updated = mtime
    weighted_links.append((sum(site.values()), link, updated))
weighted_links.sort()
weighted_links.reverse()
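# Map for bytes in the 0x80-0x9F range: pages that fail to decode as UTF-8
# are decoded as ISO-8859-1, and codepoints in this range are remapped to the
# characters Windows-1252 puts there (curly quotes, dashes, the euro sign and
# so on) when titles are cleaned up below.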
cp1252 = {
    128: 8364, # euro sign
    130: 8218, # single low-9 quotation mark
    131:  402, # latin small letter f with hook
    132: 8222, # double low-9 quotation mark
    133: 8230, # horizontal ellipsis
    134: 8224, # dagger
    135: 8225, # double dagger
    136:  710, # modifier letter circumflex accent
    137: 8240, # per mille sign
    138:  352, # latin capital letter s with caron
    139: 8249, # single left-pointing angle quotation mark
    140:  338, # latin capital ligature oe
    142:  381, # latin capital letter z with caron
    145: 8216, # left single quotation mark
    146: 8217, # right single quotation mark
    147: 8220, # left double quotation mark
    148: 8221, # right double quotation mark
    149: 8226, # bullet
    150: 8211, # en dash
    151: 8212, # em dash
    152:  732, # small tilde
    153: 8482, # trade mark sign
    154:  353, # latin small letter s with caron
    155: 8250, # single right-pointing angle quotation mark
    156:  339, # latin small ligature oe
    158:  382, # latin small letter z with caron
    159:  376} # latin capital letter y with diaeresis
# determine the title for a given url
class html(sgmllib.SGMLParser):
    def __init__(self, url):
        sgmllib.SGMLParser.__init__(self)
        self.title = ""
        self.feedurl = ""
        self.intitle = False

        headers = check_cache(url)

        try:
            # fetch the page
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Venus/MeMeme')
            if headers.has_key('etag'):
                request.add_header('If-None-Match', headers['etag'])
            if headers.has_key('last-modified'):
                request.add_header('If-Modified-Since', headers['last-modified'])
            response = urllib2.urlopen(request)
            self.feed(response.read())

            # ensure the data is in utf-8
            try:
                self.title = self.title.decode('utf-8')
            except:
                self.title = ''.join([unichr(cp1252.get(ord(c), ord(c)))
                                      for c in self.title.decode('iso-8859-1')])

            # cache the results
            headers = {}
            if self.feedurl: headers['feedurl'] = self.feedurl
            if self.title: headers['title'] = self.title
            headers.update(response.headers)
            cache_meme(url, headers)
        except:
            self.feedurl = headers.get('feedurl')
            if headers.has_key('title'):
                if isinstance(headers['title'], str):
                    self.title = eval('u'+repr(headers['title']).replace('\\\\','\\'))
                else:
                    self.title = headers['title']
        # if there is a feed, look for an entry that matches, and take that title
        if self.feedurl and not self.title:
            headers = check_cache(self.feedurl)
            data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
                                    modified=headers.get('last-modified'))

            if data.has_key('headers') and data.has_key('status') and \
                    data.status in [200, 301, 302]:
                titles = {}
                for entry in data.entries:
                    if entry.has_key('title_detail') and entry.has_key('link'):
                        titles[entry.link] = entry.title_detail.value
                        if entry.title_detail.type == 'text/plain':
                            titles[entry.link] = escape(titles[entry.link])

                if titles.has_key(url): self.title = titles[url]

                data.headers.update(titles)
                cache_meme(self.feedurl, data.headers)
            else:
                if headers.has_key(url):
                    if isinstance(headers[url], str):
                        self.title = eval('u'+repr(headers[url]).replace('\\\\','\\'))
                    else:
                        self.title = headers[url]

        # fallback is the basename of the URI
        if not self.title:
            self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])
    # parse out the first autodiscovery link
    def start_link(self, attrs):
        if self.feedurl: return
        attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
        if not 'rel' in attrs: return
        rels = attrs['rel'].split(' ')
        if 'alternate' not in rels: return
        if not 'type' in attrs or not attrs['type'].endswith('xml'): return
        if 'href' in attrs:
            self.feedurl = attrs['href']

    # parse the page title
    def start_title(self, attributes):
        if not self.title: self.intitle = True
    def end_title(self):
        self.intitle = False
    def handle_data(self, text):
        if self.intitle: self.title += escape(text)
# convert unicode string to a json string
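# For example, toj(42) yields '42', while toj(u'caf\xe9') yields the quoted
# string '"caf\u00e9"'; repr() supplies the escaping, and \xNN escapes are
# rewritten as JSON-style \u00NN.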
def toj(value):
    result = repr(value).replace(r'\x', r'\u00')
    if result[:1] == 'u': result = result[1:]
    if result.startswith("'"):
        result = '"%s"' % result.replace('"', r'\"').replace(r"\'", "'")[1:-1]
    return result
seenit = []
count = 0
# construct an empty feed
feed_doc = libxml2.newDoc("1.0")
meme_feed = feed_doc.newChild(None, "feed", None)
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
author = meme_feed.newChild(None, 'author', None)
author.newTextChild(None, 'name', config.owner_name())
if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
link = meme_feed.newChild(None, 'link', None)
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
link.setProp('rel', 'self')
meme_feed.newTextChild(None, 'updated',
                       time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
# parse the input
log.debug("Parse input")
doc=libxml2.parseDoc(sys.stdin.read())
# find the sidebar/footer
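# The 'sidebar' filter option is evaluated as an XPath expression against the
# page; by default any element with class="sidebar" is used.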
sidebar = options.get('sidebar','//*[@class="sidebar"]')
footer = doc.xpathEval(sidebar)
if not hasattr(footer, '__len__') or len(footer) == 0:
    raise Exception(sidebar + ' not found')
if len(footer) > 1:
    log.info("%d occurrences of %s found, taking first" % (len(footer), sidebar))
footer = footer[0]
# add up to 10 entry links to each subscription
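# The first <ul> under the sidebar is assumed to be the subscription list,
# with each <li> ending in a link whose text is the subscription's name.
# Feeds with recent titled entries get a nested <ul> of up to ten links;
# the rest have their link marked with class="inactive".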
subs_ul = footer.children
while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
child = subs_ul.children
while child:
    if child.name == 'li':
        if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
        link = child.lastChild()
        while link.isText(): link = link.prev
        author = link.getContent()

        state = 'inactive'
        if feed_links.has_key(author):
            ul2 = child.newChild(None, 'ul', None)
            feed_links[author].sort()
            feed_links[author].reverse()
            link_count = 0
            for mtime, entry, title in feed_links[author]:
                if not title: continue
                li2 = ul2.newChild(None, 'li', None)
                a = li2.newTextChild(None, 'a', title)
                a.setProp('href', entry)
                link_count = link_count + 1
                if link_count >= 10: break
            if link_count > 0: state = None

        if state:
            link.setProp('class', ((link.prop('class') or '') + ' ' + state).strip())

    child = child.next
# create a h2 and ul for the memes list
footer_top = footer.children
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
# create a header for the memes list
a = memes.newChild(None, 'a', None)
a.setProp('href', 'memes.atom')
img = a.newChild(None, 'img', None)
img.setProp('src', 'images/feed-icon-10x10.png')
# collect the results
log.debug("Fetch titles and collect the results")
from urllib import quote_plus
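# Walk the links in descending score order: skip any link that brings no new
# voter, resolve a title (entry cache, then the page itself, then the URL),
# and add up to ten items to the sidebar list.  Links spotted by three or
# more entries also get an entry in memes.atom.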
for i in range(0,len(weighted_links)):
    weight, link, updated = weighted_links[i]

    # ensure that somebody new points to this entry.  This guards against
    # groups of related links which several posts all point to.
    novel = False
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry not in seenit:
            seenit.append(entry)
            novel = True
    if not novel: continue

    all_links[link].sort()
    all_links[link].reverse()
    cache_file = filename(cache, link)
    title = None

    # when possible, take the title from the cache
    if os.path.exists(cache_file):
        entry = feedparser.parse(cache_file).entries[0]
        if entry.has_key('title_detail'):
            title = entry.title_detail.value
            if entry.title_detail.type == 'text/plain': title = escape(title)

    # otherwise, parse the html
    if not title:
        title = html(link).title
    # dehtmlize
    title = re.sub('&(\w+);',
        lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
    title = re.sub('&#(\d+);', lambda n: unichr(int(n.group(1))), title)
    title = re.sub('&#x(\w+);', lambda n: unichr(int(n.group(1), 16)), title)

    # title too long?  Insert zero width spaces where appropriate
    if max(map(len, title.split())) > 30:
        title = re.sub('(\W+)', u'\\1\u200b', title)

    # save the entry title (it is used later)
    entry_title = title.strip()
    # add to the memes list
    memes_ul.addContent('\n')
    li = memes_ul.newChild(None, 'li', None)
    memes_ul.addContent('\n')

    # technorati link
    a = li.newChild(None, 'a', None)
    tlink = 'http://technorati.com/cosmos/search.html?url='
    if link.startswith('http://'):
        a.setProp('href', tlink + quote_plus(link[7:]))
    else:
        a.setProp('href', tlink + quote_plus(link))
    a.setProp('title', 'cosmos')
    img = a.newChild(None, 'img', None)
    img.setProp('src', 'images/tcosm11.gif')

    # main link
    a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
    a.setProp('href', link)
    # mark the link as "rising" if it is at least as recently updated as its
    # neighbours in the ranking and falls in the top half by recency
    if (((i==0) or (updated>=weighted_links[i-1][2])) and
            (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
        rank = 0
        for j in range(0, len(weighted_links)):
            if updated < weighted_links[j][2]: rank = rank + 1
        if rank < len(weighted_links)/2:
            a.setProp('class', 'rising')
    # voters
    ul2 = li.newChild(None, 'ul', None)
    voters = []
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry in voters: continue
        li2 = ul2.newChild(None, 'li', None)
        a = li2.newTextChild(None, 'a', author)
        a.setProp('href', entry)
        if title: a.setProp('title', title)
        voters.append(entry)
    # add to the meme feed
    if len(all_links[link]) > 2:
        meme_feed.addContent('\n')
        entry = meme_feed.newChild(None, 'entry', None)
        meme_feed.addContent('\n')

        # entry
        tagbase = config.link().split('/')
        if not tagbase[-1]: tagbase = tagbase[:-1]
        tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2], '/'.join(tagbase[3:]))
        entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
        entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
        meme_link = entry.newTextChild(None, 'link', None)
        meme_link.setProp('href', link)
        entry.newTextChild(None, 'updated',
            time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))

        # voters
        content = entry.newChild(None, 'content', None)
        content.setProp('type', 'xhtml')
        div = content.newTextChild(None, 'div', 'Spotted by:')
        div.newNs('http://www.w3.org/1999/xhtml', None)
        content_ul = div.newChild(None, 'ul', None)
        for weight, entry, feed, title, author, mtime in all_links[link]:
            li2 = content_ul.newTextChild(None, 'li', author + ": ")
            a = li2.newTextChild(None, 'a', title or 'untitled')
            a.setProp('href', entry)

    count = count + 1
    if count >= 10: break
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
output.close()
sys.stdout.write(doc.serialize('utf-8'))