# planet/filters/mememe.plugin

#
# This Venus output filter will annotate an XHTML page with a list of
# "memes" (or most popular linked destinations, based on the last week
# of entries from the cache) and will update the subscription list with
# links to recent entries from each subscription.
#
# Templates that don't produce XHTML natively will need their output passed
# through html2xhtml.plugin first.
#
# Typical configuration (based on classic_fancy):
#
# [index.html.tmpl]
# filters:
#   html2xhtml.plugin
#   mememe.plugin
#
# [mememe.plugin]
# sidebar = @class='sidebar'
#
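# The filter also writes a "memes.atom" feed of the most popular links
# (those spotted by three or more distinct subscriptions) to the output
# directory.
#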
import glob, libxml2, os, time, sys, sgmllib, urllib2, urlparse, re
from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
try:
    from hashlib import md5
except:
    from md5 import new as md5
import planet
from planet import config
from planet.spider import filename
import feedparser
log = planet.logger
options = config.filter_options(sys.argv[0])
spam = options.get('spam', '').split()
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
now = time.time()
week = 7 * 86400
week_ago = now - week
cache = config.cache_directory()
meme_cache = os.path.join(cache, 'memes')
if not os.path.exists(meme_cache): os.makedirs(meme_cache)
bom = config.bill_of_materials()
if not 'images/tcosm11.gif' in bom:
    bom.append('images/tcosm11.gif')
    config.parser.set('Planet', 'bill_of_materials', ' '.join(bom))
all_links = {}
feed_links = {}
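
# headers (plus any discovered feed URL and page title) for each linked page
# are cached as small JSON-style files under cache/memes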
def check_cache(url):
    try:
        file = open(filename(meme_cache, url))
        headers = eval(file.read())
        file.close()
        return headers or {}
    except:
        return {}

def cache_meme(url, headers):
    json = []
    for key,value in headers.items():
        json.append(' %s: %s' % (toj(key), toj(value)))
    file = open(filename(meme_cache, url),'w')
    file.write('{\n' + ',\n'.join(json) + '\n}\n')
    file.close()

urlmap = {}
revmap = {}
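
# map a URL to a canonical form: lowercase the scheme and host, strip any
# leading "www.", default an empty path to "/", and drop the fragment;
# revmap remembers the original spelling for display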
def canonicalize(url):
    url = urlmap.get(url,url)
    parts = list(urlparse.urlparse(url))
    parts[0] = parts[0].lower()
    parts[1] = parts[1].lower()
    if parts[1].startswith('www.'): parts[1]=parts[1][4:]
    if not parts[2]: parts[2] = '/'
    parts[-1] = ''
    canonurl = urlparse.urlunparse(parts)
    revmap[canonurl] = url
    return canonurl
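
# count how many distinct feeds cast a vote for a link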
def unique_votes(links):
    voters = []
    for weight, entry, feed, title, author, mtime in links:
        if feed not in voters: voters.append(feed)
    return len(voters)

log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
    # ensure that this is within the past week
    if os.path.isdir(name): continue
    mtime = os.stat(name).st_mtime
    if mtime < week_ago: continue

    # parse the file
    try:
        doc = libxml2.parseFile(name)
    except:
        continue
    xp = doc.xpathNewContext()
    xp.xpathRegisterNs("atom", "http://www.w3.org/2005/Atom")
    xp.xpathRegisterNs("planet", "http://planet.intertwingly.net/")

    # determine the entry
    entry = xp.xpathEval("/atom:entry/atom:link[@rel='alternate']")
    if not entry: continue
    entry = canonicalize(entry[0].prop("href"))

    # determine the title
    title = xp.xpathEval("/atom:entry/atom:title")
    if title:
        if title[0].prop('type') == 'html':
            title = re.sub('<.*?>','',title[0].content)
        else:
            title = title[0].content
    title = str(title or '')

    # determine the feed id
    feed = xp.xpathEval("/atom:entry/atom:source/planet:memegroup")
    if not feed: feed = xp.xpathEval("/atom:entry/atom:source/atom:id")
    if not feed: continue
    feed = feed[0].content

    # determine the author
    author = xp.xpathEval("/atom:entry/atom:source/planet:name")
    if author:
        author = author[0].content
    else:
        author = ''

    # track the feed_links
    if author:
        if not feed_links.has_key(author): feed_links[author] = list()
        feed_links[author].append([mtime, entry, title])

    # identify the unique links
    entry_links = []
    for node in doc.xpathEval("//*[@href and not(@rel='source') and not(@rel='license')]"):
        parent = node.parent
        while parent:
            if parent.name == 'source': break
            parent = parent.parent
        else:
            link = canonicalize(node.prop('href'))
            if not link in entry_links:
                entry_links.append(link)
        if node.hasProp('title') and node.prop('title').startswith('http'):
            link = canonicalize(node.prop('title'))
            if not link in entry_links:
                entry_links.append(link)

    # add the votes
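    # weight each vote by recency: 1.0 for an entry posted right now,
    # decaying quadratically to 0.0 at one week old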
    weight = 1.0 - (now - mtime)**2 / week**2
    vote = [(weight, str(entry), str(feed), title, author, mtime)]
    for link in entry_links:
        all_links[link] = all_links.get(link,list()) + vote

    # free the entry
    doc.freeDoc()

# tally the votes
weighted_links = []
for link, votes in all_links.items():
    site = {}
    updated = 0
    for weight, entry, feed, title, author, mtime in votes:
        site[feed] = max(site.get(feed,0), weight)
        if mtime > updated: updated=mtime
    weighted_links.append((sum(site.values()), link, updated))
weighted_links.sort()
weighted_links.reverse()
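
# Windows-1252 code points 0x80-0x9F mapped to their Unicode equivalents,
# used to repair titles from pages that are really cp1252 but were decoded
# as ISO-8859-1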
cp1252 = {
    128: 8364, # euro sign
    130: 8218, # single low-9 quotation mark
    131: 402,  # latin small letter f with hook
    132: 8222, # double low-9 quotation mark
    133: 8230, # horizontal ellipsis
    134: 8224, # dagger
    135: 8225, # double dagger
    136: 710,  # modifier letter circumflex accent
    137: 8240, # per mille sign
    138: 352,  # latin capital letter s with caron
    139: 8249, # single left-pointing angle quotation mark
    140: 338,  # latin capital ligature oe
    142: 381,  # latin capital letter z with caron
    145: 8216, # left single quotation mark
    146: 8217, # right single quotation mark
    147: 8220, # left double quotation mark
    148: 8221, # right double quotation mark
    149: 8226, # bullet
    150: 8211, # en dash
    151: 8212, # em dash
    152: 732,  # small tilde
    153: 8482, # trade mark sign
    154: 353,  # latin small letter s with caron
    155: 8250, # single right-pointing angle quotation mark
    156: 339,  # latin small ligature oe
    158: 382,  # latin small letter z with caron
    159: 376}  # latin capital letter y with diaeresis

# determine the title for a given url
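# (strategy: try a conditional GET using the cached headers; on failure fall
# back to the cached title; if the page advertises a feed, prefer the title
# of the matching feed entry; as a last resort use the URL's final path
# segment)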
class html(sgmllib.SGMLParser):
    def __init__(self, url):
        sgmllib.SGMLParser.__init__(self)
        self.title = ""
        self.feedurl = ""
        self.intitle = False

        headers = check_cache(url)
        try:
            # fetch the page
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Venus/MeMeme')
            if headers.has_key('etag'):
                request.add_header('If-None-Match', headers['etag'])
            if headers.has_key('last-modified'):
                request.add_header('If-Modified-Since', headers['last-modified'])
            response = urllib2.urlopen(request)
            self.feed(response.read())

            # ensure the data is in utf-8
            try:
                self.title = self.title.decode('utf-8')
            except:
                self.title = ''.join([unichr(cp1252.get(ord(c),ord(c)))
                    for c in self.title.decode('iso-8859-1')])

            # cache the results
            headers = {}
            if self.feedurl: headers['feedurl'] = self.feedurl
            if self.title: headers['title'] = self.title
            headers.update(response.headers)
            cache_meme(url, headers)
        except:
            self.feedurl = headers.get('feedurl')
            if headers.has_key('title'):
                if isinstance(headers['title'],str):
                    self.title=eval('u'+repr(headers['title']).replace('\\\\','\\'))
                else:
                    self.title=headers['title']

        # if there is a feed, look for an entry that matches, and take that title
        if self.feedurl and not self.title:
            headers = check_cache(self.feedurl)
            data = feedparser.parse(self.feedurl, etag=headers.get('etag'),
                modified=headers.get('last-modified'))
            if data.has_key('headers') and data.has_key('status') and \
                    data.status in [200, 301, 302]:
                titles = {}
                for entry in data.entries:
                    if entry.has_key('title_detail') and entry.has_key('link'):
                        titles[entry.link] = entry.title_detail.value
                        if entry.title_detail.type == 'text/plain':
                            titles[entry.link] = escape(titles[entry.link])
                if titles.has_key(url): self.title = titles[url]
                data.headers.update(titles)
                cache_meme(self.feedurl, data.headers)
            else:
                if headers.has_key(url):
                    if isinstance(headers[url],str):
                        self.title=eval('u'+repr(headers[url]).replace('\\\\','\\'))
                    else:
                        self.title=headers[url]

        # fallback is the basename of the URI
        if not self.title:
            self.title = escape(url.rstrip('/').split('/')[-1].split('?')[0])

    # parse out the first autodiscovery link
    def start_link(self, attrs):
        if self.feedurl: return
        attrs = dict(map(lambda (k,v): (k.lower(),v), attrs))
        if not 'rel' in attrs: return
        rels = attrs['rel'].split(' ')
        if 'alternate' not in rels: return
        if not 'type' in attrs or not attrs['type'].endswith('xml'): return
        if 'href' in attrs:
            self.feedurl = attrs['href']

    # parse the page title
    def start_title(self, attributes):
        if not self.title: self.intitle = True

    def end_title(self):
        self.intitle = False

    def handle_data(self, text):
        if self.intitle: self.title += escape(text)

# convert unicode string to a json string
def toj(value):
    result = repr(value).replace(r'\x',r'\u00')
    if result[:1] == 'u': result=result[1:]
    if result.startswith("'"):
        result = '"%s"' % result.replace('"',r'\"').replace(r"\'","'")[1:-1]
    return result
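
# seenit tracks entries that have already voted for an earlier meme, so a
# cluster of closely related links only surfaces once; count caps the
# sidebar at ten memes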
seenit = []
count = 0
# construct an empty feed
feed_doc = libxml2.newDoc("1.0")
meme_feed = feed_doc.newChild(None, "feed", None)
meme_feed.newNs('http://www.w3.org/2005/Atom', None)
meme_feed.newTextChild(None, 'title', config.name() + ': Memes')
author = meme_feed.newChild(None, 'author', None)
author.newTextChild(None, 'name', config.owner_name())
if config.owner_email(): author.newTextChild(None, 'email', config.owner_email())
meme_feed.newTextChild(None, 'id', os.path.join(config.link(), 'memes.atom'))
link = meme_feed.newChild(None, 'link', None)
link.setProp('href', os.path.join(config.link(), 'memes.atom'))
link.setProp('rel', 'self')
meme_feed.newTextChild(None, 'updated',
    time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()))
# parse the input
log.debug("Parse input")
doc=libxml2.readDoc(sys.stdin.read(), '', 'utf-8', libxml2.XML_PARSE_NONET)
# find the sidebar/footer
sidebar = options.get('sidebar','//*[@class="sidebar"]')
footer = doc.xpathEval(sidebar)
if not hasattr(footer,'__len__') or len(footer) == 0:
    raise Exception(sidebar + ' not found')
if len(footer) > 1:
    log.info("%d occurrences of %s found, taking last" % (len(footer),sidebar))
    if '@id' in sidebar:
        for element in footer[:-1]:
            element.unsetProp('id')
footer = footer[-1]
# add up to 10 entry links to each subscription
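# (each subscription's <li> gets a nested <ul> of its recent entries; feeds
# with nothing recent in the cache are given the class "inactive")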
subs_ul = footer.children
while subs_ul.isText() or subs_ul.name != 'ul': subs_ul = subs_ul.next
child = subs_ul.children
while child:
    if child.name == 'li':
        if child.lastChild().name == 'ul': child.lastChild().unlinkNode()
        link = child.lastChild()
        while link.isText(): link=link.prev
        author = link.getContent()

        state = 'inactive'
        if feed_links.has_key(author):
            ul2 = child.newChild(None, 'ul', None)
            feed_links[author].sort()
            feed_links[author].reverse()
            link_count = 0
            for mtime, entry, title in feed_links[author]:
                if not title: continue
                li2 = ul2.newChild(None, 'li', None)
                a = li2.newTextChild(None, 'a', title)
                a.setProp('href', revmap.get(entry,entry))
                link_count = link_count + 1
                if link_count >= 10: break
            if link_count > 0: state = None

        if state:
            link.setProp('class',((link.prop('class') or '') + ' ' + state).strip())
    child=child.next
# create a h2 and ul for the memes list
footer_top = footer.children
memes = footer_top.addPrevSibling(footer.newTextChild(None, 'h2', 'Memes '))
memes_ul = footer_top.addPrevSibling(footer.newChild(None, 'ul', None))
# create a header for the memes list
a = memes.newChild(None, 'a', None)
a.setProp('href', 'memes.atom')
img = a.newChild(None, 'img', None)
img.setProp('src', 'images/feed-icon-10x10.png')
# collect the results
log.debug("Fetch titles and collect the results")
from urllib import quote_plus
for i in range(0,len(weighted_links)):
    weight, link, updated = weighted_links[i]
    if link in spam: continue

    # ensure that somebody new points to this entry. This guards against
    # groups of closely related links that several posts all point to.
    novel = False
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry not in seenit:
            seenit.append(entry)
            novel = True
    if not novel: continue

    all_links[link].sort()
    all_links[link].reverse()
    cache_file = filename(cache, link)
    title = None

    # when possible, take the title from the cache
    if os.path.exists(cache_file):
        entry = feedparser.parse(cache_file).entries[0]
        if entry.has_key('title_detail'):
            title = entry.title_detail.value
            if entry.title_detail.type == 'text/plain': title = escape(title)

    # otherwise, parse the html
    if not title:
        title = html(revmap.get(link,link)).title

    # dehtmlize
    title = re.sub('&(\w+);',
        lambda n: entitydefs.get(n.group(1), '&'+n.group(1)+';'), title)
    title = re.sub('&#(\d+);',lambda n: unichr(int(n.group(1))), title)
    title = re.sub('&#x(\w+);',lambda n: unichr(int(n.group(1),16)), title)

    # title too long? Insert zero width spaces where appropriate
    if max(map(len,title.split())) > 30:
        title=re.sub('(\W+)',u'\\1\u200b',title)

    # save the entry title (it is used later)
    entry_title = title.strip()

    # add to the memes list
    memes_ul.addContent('\n')
    li = memes_ul.newChild(None, 'li', None)
    memes_ul.addContent('\n')

    # technorati link
    a = li.newChild(None, 'a', None)
    tlink = 'http://technorati.com/search/'
    if link.startswith('http://'):
        a.setProp('href',tlink + quote_plus(link[7:]))
    else:
        a.setProp('href',tlink + quote_plus(link))
    a.setProp('title','cosmos')
    img = a.newChild(None, 'img', None)
    img.setProp('src','images/tcosm11.gif')

    # main link
    a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
    a.setProp('href',revmap.get(link,link))
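
    # mark the link as "rising" when it is at least as recently updated as
    # its neighbours in the ranking and fewer than half of all memes were
    # updated more recently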
    if (((i==0) or (updated>=weighted_links[i-1][2])) and
            (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
        rank = 0
        for j in range(0,len(weighted_links)):
            if updated < weighted_links[j][2]: rank = rank + 1
        if rank < len(weighted_links)/2:
            a.setProp('class','rising')
    # voters
    ul2 = li.newChild(None, 'ul', None)
    voters = []
    for weight, entry, feed, title, author, mtime in all_links[link]:
        if entry in voters: continue
        li2 = ul2.newChild(None, 'li', None)
        a = li2.newTextChild(None, 'a' , author)
        a.setProp('href',revmap.get(entry,entry))
        if title: a.setProp('title',title)
        voters.append(entry)

    # add to the meme feed (only when at least three distinct feeds voted)
    if unique_votes(all_links[link]) > 2:
        meme_feed.addContent('\n')
        entry = meme_feed.newChild(None, 'entry', None)
        meme_feed.addContent('\n')

        # entry
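        # the entry id is a tag: URI (RFC 4151) derived from the planet's own
        # link, keyed on the md5 of the meme URL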
        tagbase = config.link().split('/')
        if not tagbase[-1]: tagbase = tagbase[:-1]
        tagbase = 'tag:%s,2007:%smeme/%%s' % (tagbase[2],'/'.join(tagbase[3:]))
        entry.newTextChild(None, 'id', tagbase % md5(link).hexdigest())
        entry.newTextChild(None, 'title', entry_title.encode('utf-8'))
        meme_link = entry.newTextChild(None, 'link', None)
        meme_link.setProp('href', link)
        entry.newTextChild(None, 'updated',
            time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(updated)))

        # voters
        content = entry.newChild(None, 'content', None)
        content.setProp('type', 'xhtml')
        div = content.newTextChild(None, 'div', 'Spotted by:')
        div.newNs('http://www.w3.org/1999/xhtml', None)
        content_ul = div.newChild(None, 'ul', None)
        for weight, entry, feed, title, author, mtime in all_links[link]:
            li2 = content_ul.newTextChild(None, 'li', author + ": ")
            a = li2.newTextChild(None, 'a' , title or 'untitled')
            a.setProp('href',entry)

    count = count + 1
    if count >= 10: break
log.info("Writing " + MEMES_ATOM)
output=open(MEMES_ATOM,'w')
output.write(feed_doc.serialize('utf-8'))
output.close()
sys.stdout.write(doc.serialize('utf-8'))