# favicon.py -- scan subscriptions for favicons
#
# Recovered from the patch "Add a script to scan for favicons"
# Author: Sam Ruby
# Date:   Thu, 10 Feb 2011 18:27:36 -0500
# Commit: 68d8233010efd671f4c94b128b761132fac6a366
"""Scan planet subscriptions for favicons.

Usage: favicon.py [config.ini ...]

For every subscription whose cached feed has no icon but does link to an
HTML page, fetch that page and look for a favicon.  The result is written
to standard output in ConfigParser (INI) format: one section per
subscription, each with a single ``favicon`` option.
"""
import sys, socket
from planet import config, feedparser
from planet.spider import filename
from urllib2 import urlopen
from urlparse import urljoin
from html5lib import html5parser, treebuilders
from ConfigParser import ConfigParser

# load config files (default: config.ini)
for arg in sys.argv[1:]:
    config.load(arg)
if len(sys.argv) == 1:
    config.load('config.ini')

from Queue import Queue
from threading import Thread

# determine which subscriptions have no icon but do have a html page
fetch_queue = Queue()
html = ['text/html', 'application/xhtml+xml']
sources = config.cache_sources_directory()
for sub in config.subscriptions():
    data = feedparser.parse(filename(sources, sub))
    if data.feed.get('icon'):
        continue
    for link in data.feed.get('links') or []:
        # use .get() so a link lacking rel/type/href is skipped instead of
        # aborting the whole scan
        href = link.get('href')
        if not href:
            # an empty page URL must never be queued: the workers treat it
            # as the stop sentinel
            continue
        if link.get('rel') == 'alternate' and link.get('type') in html:
            fetch_queue.put((sub, href))
            break


def favicon(page):
    """Return the favicon URL for the web page at *page*, or None.

    The page is fetched and parsed; the first <link> element whose rel
    attribute contains the token "icon" wins, with its href resolved
    against *page*.  Otherwise /favicon.ico on the same site is requested
    and returned unless the server reports a Content-Length of zero.

    Errors raised while fetching or parsing propagate to the caller.
    """
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    response = urlopen(page)
    try:
        doc = parser.parse(response)
    finally:
        response.close()

    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            # rel is a whitespace separated, case-insensitive token list
            rel_tokens = link.attributes['rel'].value.lower().split()
            if 'icon' in rel_tokens:
                return urljoin(page, link.attributes['href'].value)

    fallback = urljoin(page, '/favicon.ico')
    response = urlopen(fallback)
    try:
        # the header may be absent (e.g. chunked responses); only an
        # explicit zero length marks the icon as empty
        length = response.info().get('content-length')
    finally:
        response.close()
    if length != '0':
        return fallback
    return None


# thread worker that fills in the dictionary which maps subs to favicon
icons = {}


def fetch(thread_index, fetch_queue, icons):
    """Consume (subscription, page URL) pairs from *fetch_queue*.

    Every favicon found is stored in *icons* under its subscription.  The
    worker returns when it dequeues a pair whose page URL is empty, such
    as the (None, None) sentinel.  *thread_index* is accepted but unused.
    """
    while True:
        sub, page = fetch_queue.get()
        if not page:
            break
        try:
            icon = favicon(page)
        except Exception:
            # best effort: unreachable pages, HTTP errors and parse
            # failures simply leave this subscription without an icon
            continue
        if icon:
            icons[sub] = icon


# set timeout
try:
    socket.setdefaulttimeout(float(config.feed_timeout()))
except Exception:
    # no usable timeout configured; keep the socket module default
    pass

# (optionally) spawn threads, fetch pages
thread_count = int(config.spider_threads())
if thread_count:
    threads = []
    for i in range(thread_count):
        worker = Thread(target=fetch, args=(i, fetch_queue, icons))
        # one sentinel per worker so that each of them terminates
        fetch_queue.put((None, None))
        worker.start()
        threads.append(worker)
    for worker in threads:
        worker.join()
else:
    fetch_queue.put((None, None))
    fetch(0, fetch_queue, icons)

# produce config file
output = ConfigParser()
for sub, icon in icons.items():
    output.add_section(sub)
    output.set(sub, 'favicon', icon)
output.write(sys.stdout)