#
# This Venus output filter will annotate an XHTML page with a list of
# "memes" (or most popular linked destinations, based on the last week
# of entries from the cache) and will update the subscription list with
# links to recent entries from each subscription.
#
# Templates that don't produce XHTML natively will need their output passed
# through html2xhtml.plugin first.
#
# Typical configuration (based on classic_fancy):
#
# [index.html.tmpl]
# filters:
#   html2xhtml.plugin
#   mememe.plugin
#
# [mememe.plugin]
# sidebar = @class='sidebar'
#

import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re, md5
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs

import planet
from planet import config
from planet.spider import filename
import feedparser

log = planet.logger
options = config.filter_options(sys.argv[0])

MEMES_ATOM = os.path.join(config.output_dir(), 'memes.atom')

now = time.time()
week = 7 * 86400
week_ago = now - week

cache = config.cache_directory()
meme_cache = os.path.join(cache, 'memes')
if not os.path.exists(meme_cache):
    os.makedirs(meme_cache)

bom = config.bill_of_materials()
if not 'images/tcosm11.gif' in bom:
    bom.append('images/tcosm11.gif')
    config.parser.set('Planet', 'bill_of_materials', ' '.join(bom))

all_links = {}
feed_links = {}

# read cached headers (and titles) for a previously seen url
def check_cache(url):
    try:
        file = open(filename(meme_cache, url))
        headers = eval(file.read())
        file.close()
        return headers or {}
    except:
        return {}

# write headers (and titles) for a url back to the meme cache
def cache_meme(url, headers):
    json = []
    for key, value in headers.items():
        json.append(' %s: %s' % (toj(key), toj(value)))
    file = open(filename(meme_cache, url), 'w')
    file.write('{\n' + ',\n'.join(json) + '\n}\n')
    file.close()

urlmap = {}
revmap = {}

# normalize a url (lowercase scheme and host, strip "www.", default the
# path to "/", drop the fragment), remembering the original for display
def canonicalize(url):
    url = urlmap.get(url, url)
    parts = list(urlparse.urlparse(url))
    parts[0] = parts[0].lower()
    parts[1] = parts[1].lower()
    if parts[1].startswith('www.'):
        parts[1] = parts[1][4:]
    if not parts[2]:
        parts[2] = '/'
    parts[-1] = ''
    canonurl = urlparse.urlunparse(parts)
    revmap[canonurl] = url
    return canonurl

log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
    # ensure that this is within the past week
    if os.path.isdir(name):
        continue
    mtime = os.stat(name).st_mtime
    if mtime < week_ago:
        continue

    # parse the file
    try:
        doc = libxml2.parseFile(name)
    except:
        continue
    xp = doc.xpathNewContext()
    xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
    xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")

    # determine the entry
    entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
    if not entry:
        continue
    entry = canonicalize(entry[0].prop("href"))

    # determine the title
    title = xp.xpathEval("/atom:entry/atom:title")
    if title:
        if title[0].prop('type') == 'html':
            title = re.sub('<.*?>', '', title[0].content)
        else:
            title = title[0].content
    title = str(title or '')

    # determine the feed id
    feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
    if not feed:
        feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
    if not feed:
        continue
    feed = feed[0].content

    # determine the author
    author = xp.xpathEval("/atom:entry/atom:source/planet:name")
    if author:
        author = author[0].content
    else:
        author = ''

    # track the feed_links
    if author:
        if not feed_links.has_key(author):
            feed_links[author] = list()
        feed_links[author].append([mtime, entry, title])

    # identify the unique links, skipping any that appear inside the
    # entry's atom:source element
    entry_links = []
    for node in doc.xpathEval("//*[@href and not(@rel='source') and not(@rel='license')]"):
        parent = node.parent
        while parent:
            if parent.name == 'source':
                break
            parent = parent.parent
        else:
            link = canonicalize(node.prop('href'))
            if not link in entry_links:
                entry_links.append(link)
            if node.hasProp('title') and node.prop('title').startswith('http'):
                link = canonicalize(node.prop('title'))
                if not link in entry_links:
                    entry_links.append(link)
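    # weight each vote by recency: a quadratic decay from 1.0 for an entry
    # posted right now down to 0.0 for one that is a full week old (an entry
    # from three and a half days ago contributes 1 - 0.5**2 = 0.75 of a vote)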
    # add the votes
    weight = 1.0 - (now - mtime)**2 / week**2
    vote = [(weight, str(entry), str(feed), title, author, mtime)]
    for link in entry_links:
        all_links[link] = all_links.get(link, list()) + vote

    # free the entry
    doc.freeDoc()

# tally the votes: each feed contributes at most its single heaviest vote
# for a given link, so one subscription can't swamp a meme by linking to it
# repeatedly
weighted_links = []
for link, votes in all_links.items():
    site = {}
    updated = 0
    for weight, entry, feed, title, author, mtime in votes:
        site[feed] = max(site.get(feed, 0), weight)
        if mtime > updated:
            updated = mtime
    weighted_links.append((sum(site.values()), link, updated))
weighted_links.sort()
weighted_links.reverse()

cp1252 = {
    128: 8364,  # euro sign
    130: 8218,  # single low-9 quotation mark
    131:  402,  # latin small letter f with hook
    132: 8222,  # double low-9 quotation mark
    133: 8230,  # horizontal ellipsis
    134: 8224,  # dagger
    135: 8225,  # double dagger
    136:  710,  # modifier letter circumflex accent
    137: 8240,  # per mille sign
    138:  352,  # latin capital letter s with caron
    139: 8249,  # single left-pointing angle quotation mark
    140:  338,  # latin capital ligature oe
    142:  381,  # latin capital letter z with caron
    145: 8216,  # left single quotation mark
    146: 8217,  # right single quotation mark
    147: 8220,  # left double quotation mark
    148: 8221,  # right double quotation mark
    149: 8226,  # bullet
    150: 8211,  # en dash
    151: 8212,  # em dash
    152:  732,  # small tilde
    153: 8482,  # trade mark sign
    154:  353,  # latin small letter s with caron
    155: 8250,  # single right-pointing angle quotation mark
    156:  339,  # latin small ligature oe
    158:  382,  # latin small letter z with caron
    159:  376,  # latin capital letter y with diaeresis
}
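# Resolving a title for a linked page may require a live HTTP fetch.  The
# parser below keeps that cost down: it issues a conditional GET using the
# ETag/Last-Modified values cached under the memes directory, records the
# page <title> and the first feed autodiscovery link it finds, and caches
# the results.  When the fetch fails (or returns 304 Not Modified), the
# previously cached values are reused; if no title turns up but the page
# advertises a feed, the title of the matching entry in that feed is used;
# the last resort is the basename of the URL itself.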
# determine the title for a given url
class html(sgmllib.SGMLParser):
    def __init__(self, url):
        sgmllib.SGMLParser.__init__(self)
        self.title = ""
        self.feedurl = ""
        self.intitle = False

        headers = check_cache(url)
        try:
            # fetch the page
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Venus/MeMeme')
            if headers.has_key('etag'):
                request.add_header('If-None-Match', headers['etag'])
            if headers.has_key('last-modified'):
                request.add_header('If-Modified-Since', headers['last-modified'])
            response = urllib2.urlopen(request)
            self.feed(response.read())

            # ensure the data is in utf-8
            try:
                self.title = self.title.decode('utf-8')
            except:
                self.title = ''.join([unichr(cp1252.get(ord(c), ord(c)))
                                      for c in self.title.decode('iso-8859-1')])

            # cache the results
            headers = {}
            if self.feedurl:
                headers['feedurl'] = self.feedurl
            if self.title:
                headers['title'] = self.title
            headers.update(response.headers)
            cache_meme(url, headers)
        except:
            self.feedurl = headers.get('feedurl')
            if headers.has_key('title'):
                if isinstance(headers['title'], str):
                    self.title = eval('u' + repr(headers['title']).replace('\\\\', '\\'))
                else:
                    self.title = headers['title']

        # if there is a feed, look for an entry that matches, and take that title
        if self.feedurl and not self.title:
            headers = check_cache(self.feedurl)
            data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
                                    modified=headers.get('last-modified'))
            if data.has_key('headers') and data.has_key('status') and \
               data.status in [200, 301, 302]:
                titles = {}
                for entry in data.entries:
                    if entry.has_key('title_detail') and entry.has_key('link'):
                        titles[entry.link] = entry.title_detail.value
                        if entry.title_detail.type == 'text/plain':
                            titles[entry.link] = escape(titles[entry.link])
                if titles.has_key(url):
                    self.title = titles[url]
                data.headers.update(titles)
                cache_meme(self.feedurl, data.headers)
            else:
                if headers.has_key(url):
                    if isinstance(headers[url], str):
                        self.title = eval('u' + repr(headers[url]).replace('\\\\', '\\'))
                    else:
                        self.title = headers[url]

        # fallback is the basename of the URI
        if not self.title:
            self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])

    # parse out the first autodiscovery link
    def start_link(self, attrs):
        if self.feedurl:
            return
        attrs = dict(map(lambda (k, v): (k.lower(), v), attrs))
        if not 'rel' in attrs:
            return
        rels = attrs['rel'].split(' ')
        if 'alternate' not in rels:
            return
        if not 'type' in attrs or not attrs['type'].endswith('xml'):
            return
        if 'href' in attrs:
            self.feedurl = attrs['href']

    # parse the page title
    def start_title(self, attributes):
        if not self.title:
            self.intitle = True

    def end_title(self):
        self.intitle = False

    def handle_data(self, text):
        if self.intitle:
            self.title += escape(text)

# convert unicode string to a json string
def toj(value):
    result = repr(value).replace(r'\x', r'\u00')
    if result[:1] == 'u':
        result = result[1:]
    if result.startswith("'"):
        result = '"%s"' % result.replace('"', r'\"').replace(r"\'", "'")[1:-1]
    return result

seenit = []
count = 0

# construct an empty feed
feed_doc = libxml2.newDoc("1.0")
meme_feed = feed_doc.newChild(None, "feed", None)
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
author = meme_feed.newChild(None, 'author', None)
author.newTextChild(None, 'name', config.owner_name())
if config.owner_email():
    author.newTextChild(None, 'email', config.owner_email())
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
link = meme_feed.newChild(None, 'link', None)
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
link.setProp('rel', 'self')
meme_feed.newTextChild(None, 'updated',
                       time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))

# parse the input
log.debug("Parse input")
doc = libxml2.readDoc(sys.stdin.read(), '', 'utf-8', libxml2.XML_PARSE_NONET)

# find the sidebar/footer
sidebar = options.get('sidebar', '//*[@class="sidebar"]')
footer = doc.xpathEval(sidebar)
if not hasattr(footer, '__len__') or len(footer) == 0:
    raise Exception(sidebar + ' not found')
if len(footer) > 1:
    log.info("%d occurrences of %s found, taking first" % (len(footer), sidebar))
footer = footer[0]

# add up to 10 entry links to each subscription
subs_ul = footer.children
while subs_ul.isText() or subs_ul.name != 'ul':
    subs_ul = subs_ul.next
child = subs_ul.children
while child:
    if child.name == 'li':
        if child.lastChild().name == 'ul':
            child.lastChild().unlinkNode()
        link = child.lastChild()
        while link.isText():
            link = link.prev
        author = link.getContent()

        state = 'inactive'
        if feed_links.has_key(author):
            ul2 = child.newChild(None, 'ul', None)
            feed_links[author].sort()
            feed_links[author].reverse()
            link_count = 0
            for mtime, entry, title in feed_links[author]:
                if not title:
                    continue
                li2 = ul2.newChild(None, 'li', None)
                a = li2.newTextChild(None, 'a', title)
                a.setProp('href', revmap.get(entry, entry))
                link_count = link_count + 1
                if link_count >= 10:
                    break
            if link_count > 0:
                state = None

        if state:
            link.setProp('class', ((link.prop('class') or '') + ' ' + state).strip())

    child = child.next

# create a h2 and ul for the memes list
footer_top = footer.children
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))

# create a header for the memes list
a = memes.newChild(None, 'a', None)
a.setProp('href', 'memes.atom')
img = a.newChild(None, 'img', None)
img.setProp('src', 'images/feed-icon-10x10.png')
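# The loop below walks the candidate links in descending order of weighted
# votes.  A link is skipped unless it brings at least one voter not already
# seen; its title is resolved from the entry cache when possible, otherwise
# by fetching the page; and it is rendered as a sidebar list item.  Links
# with more than two votes also become entries in memes.atom, and processing
# stops once ten memes have been emitted.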
# collect the results
log.debug("Fetch titles and collect the results")
from urllib import quote_plus
for i in range(0, len(weighted_links)):
    weight, link, updated = weighted_links[i]

    # ensure that somebody new points to this entry.  This guards against
    # groups of related links that the same set of posts all point to.
    novel = False
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry not in seenit:
            seenit.append(entry)
            novel = True
    if not novel:
        continue

    all_links[link].sort()
    all_links[link].reverse()
    cache_file = filename(cache, link)
    title = None

    # when possible, take the title from the cache
    if os.path.exists(cache_file):
        entry = feedparser.parse(cache_file).entries[0]
        if entry.has_key('title_detail'):
            title = entry.title_detail.value
            if entry.title_detail.type == 'text/plain':
                title = escape(title)

    # otherwise, parse the html
    if not title:
        title = html(revmap.get(link, link)).title

    # dehtmlize
    title = re.sub('&(\w+);',
                   lambda n: entitydefs.get(n.group(1), '&' + n.group(1) + ';'), title)
    title = re.sub('&#(\d+);', lambda n: unichr(int(n.group(1))), title)
    title = re.sub('&#x(\w+);', lambda n: unichr(int(n.group(1), 16)), title)

    # title too long?  Insert zero width spaces where appropriate
    if max(map(len, title.split())) > 30:
        title = re.sub('(\W+)', u'\\1\u200b', title)

    # save the entry title (it is used later)
    entry_title = title.strip()

    # add to the memes list
    memes_ul.addContent('\n')
    li = memes_ul.newChild(None, 'li', None)
    memes_ul.addContent('\n')

    # technorati link
    a = li.newChild(None, 'a', None)
    tlink = 'http://technorati.com/cosmos/search.html?url='
    if link.startswith('http://'):
        a.setProp('href', tlink + quote_plus(link[7:]))
    else:
        a.setProp('href', tlink + quote_plus(link))
    a.setProp('title', 'cosmos')
    img = a.newChild(None, 'img', None)
    img.setProp('src', 'images/tcosm11.gif')

    # main link
    a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
    a.setProp('href', revmap.get(link, link))
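    # mark the meme as "rising" when its newest vote is at least as recent
    # as those of its neighbours in the ranking and it falls within the more
    # recently updated half of all candidate links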
    if (((i == 0) or (updated >= weighted_links[i-1][2])) and
        (i+1 == len(weighted_links) or (updated >= weighted_links[i+1][2]))):
        rank = 0
        for j in range(0, len(weighted_links)):
            if updated < weighted_links[j][2]:
                rank = rank + 1
        if rank < len(weighted_links)/2:
            a.setProp('class', 'rising')

    # voters
    ul2 = li.newChild(None, 'ul', None)
    voters = []
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry in voters:
            continue
        li2 = ul2.newChild(None, 'li', None)
        a = li2.newTextChild(None, 'a', author)
        a.setProp('href', revmap.get(entry, entry))
        if title:
            a.setProp('title', title)
        voters.append(entry)

    # add to the meme feed
    if len(all_links[link]) > 2:
        meme_feed.addContent('\n')
        entry = meme_feed.newChild(None, 'entry', None)
        meme_feed.addContent('\n')

        # entry
        tagbase = config.link().split('/')
        if not tagbase[-1]:
            tagbase = tagbase[:-1]
        tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2], '/'.join(tagbase[3:]))
        entry.newTextChild(None, 'id', tagbase % md5.new(link).hexdigest())
        entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
        meme_link = entry.newTextChild(None, 'link', None)
        meme_link.setProp('href', link)
        entry.newTextChild(None, 'updated',
                           time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))

        # voters
        content = entry.newChild(None, 'content', None)
        content.setProp('type', 'xhtml')
        div = content.newTextChild(None, 'div', 'Spotted by:')
        div.newNs('http://www.w3.org/1999/xhtml', None)
        content_ul = div.newChild(None, 'ul', None)
        for weight, entry, feed, title, author, mtime in all_links[link]:
            li2 = content_ul.newTextChild(None, 'li', author + ": ")
            a = li2.newTextChild(None, 'a', title or 'untitled')
            a.setProp('href', entry)

    count = count + 1
    if count >= 10:
        break

log.info("Writing " + MEMES_ATOM)
output = open(MEMES_ATOM, 'w')
output.write(feed_doc.serialize('utf-8'))
output.close()

sys.stdout.write(doc.serialize('utf-8'))