some sites *require* the www... go figure.

Sam Ruby 2007-10-26 21:44:20 -04:00
parent 6826ee28f7
commit d73e98e874


@@ -65,6 +65,7 @@ def cache_meme(url, headers):
     file.close()
 
 urlmap = {}
+revmap = {}
 def canonicalize(url):
     url = urlmap.get(url,url)
     parts = list(urlparse.urlparse(url))
@@ -74,7 +75,10 @@ def canonicalize(url):
     if parts[1].startswith('www.'): parts[1]=parts[1][4:]
     if not parts[2]: parts[2] = '/'
     parts[-1] = ''
-    return urlparse.urlunparse(parts)
+
+    canonurl = urlparse.urlunparse(parts)
+    revmap[canonurl] = url
+    return canonurl
 
 log.debug("Loading cached data")
 for name in glob.glob(os.path.join(cache, '*')):
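
For context, here is a minimal standalone sketch of the pattern the hunk above introduces: canonicalize() normalizes a URL (lowercased scheme and host, leading "www." stripped, empty path defaulted to "/", fragment dropped) and records the original spelling in revmap, keyed by the canonical form, so the original can be restored on output. The sketch assumes Python 3's urllib.parse instead of the Python 2 urlparse module the plugin uses, and it is an illustration, not the plugin code itself.

    from urllib.parse import urlparse, urlunparse

    urlmap = {}   # manual aliases, as in the plugin
    revmap = {}   # canonical URL -> original URL, for display

    def canonicalize(url):
        url = urlmap.get(url, url)
        parts = list(urlparse(url))
        parts[0] = parts[0].lower()            # scheme
        parts[1] = parts[1].lower()            # host
        if parts[1].startswith('www.'):
            parts[1] = parts[1][4:]            # fold www.example.com into example.com
        if not parts[2]:
            parts[2] = '/'                     # empty path becomes "/"
        parts[-1] = ''                         # drop the fragment
        canonurl = urlunparse(parts)
        revmap[canonurl] = url                 # remember how the site actually spells it
        return canonurl

Round-tripping an address shows why the reverse map matters: the canonical form is what links are grouped and counted under, while revmap keeps the host spelling the site actually requires.

    print(canonicalize('http://WWW.Example.COM'))    # 'http://example.com/'
    print(revmap['http://example.com/'])             # 'http://WWW.Example.COM'
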
@@ -341,7 +345,7 @@ while child:
         if not title: continue
         li2 = ul2.newChild(None, 'li', None)
         a = li2.newTextChild(None, 'a', title)
-        a.setProp('href', entry)
+        a.setProp('href', revmap.get(entry,entry))
         link_count = link_count + 1
         if link_count >= 10: break
     if link_count > 0: state = None
@@ -389,7 +393,7 @@ for i in range(0,len(weighted_links)):
     # otherwise, parse the html
     if not title:
-        title = html(link).title
+        title = html(revmap.get(link,link)).title
 
     # dehtmlize
     title = re.sub('&(\w+);',
@@ -422,7 +426,7 @@ for i in range(0,len(weighted_links)):
     # main link
     a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
-    a.setProp('href',link)
+    a.setProp('href',revmap.get(link,link))
     if (((i==0) or (updated>=weighted_links[i-1][2])) and
         (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
         rank = 0
@@ -438,7 +442,7 @@ for i in range(0,len(weighted_links)):
         if entry in voters: continue
         li2 = ul2.newChild(None, 'li', None)
         a = li2.newTextChild(None, 'a' , author)
-        a.setProp('href',entry)
+        a.setProp('href',revmap.get(entry,entry))
         if title: a.setProp('title',title)
         voters.append(entry)
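
The output-side hunks all apply the same change: wherever an href was set straight from the canonical URL, it now goes through revmap.get(x, x), which returns the recorded original spelling when one exists and falls back to the canonical form otherwise. A small illustration of that fallback, using made-up URLs and a hypothetical display_url() helper:

    revmap = {'http://example.com/': 'http://www.example.com/'}

    def display_url(link):
        # prefer the spelling the site itself uses; fall back to the canonical form
        return revmap.get(link, link)

    print(display_url('http://example.com/'))   # 'http://www.example.com/'
    print(display_url('http://other.org/'))     # 'http://other.org/'  (nothing recorded)
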