Select latest entry by id if ids are duplicated in a feed

Sam Ruby 2007-08-19 12:46:51 -04:00
parent 597f0ea2fe
commit 4d603b6bf7
3 changed files with 65 additions and 3 deletions


@@ -154,15 +154,28 @@ def writeCache(feed_uri, feed_info, data):
     from planet import idindex
     global index
     if index != None: index = idindex.open()
-    # write each entry to the cache
-    cache = config.cache_directory()
+    # select latest entry for each unique id
+    ids = {}
     for entry in data.entries:
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
             if not entry['id']: continue
+        # determine updated date for purposes of selection
+        updated = ''
+        if entry.has_key('published'): updated=entry.published
+        if entry.has_key('updated'): updated=entry.updated
+        # if not seen or newer than last seen, select it
+        if updated >= ids.get(entry.id,('',))[0]:
+            ids[entry.id] = (updated, entry)
+    # write each entry to the cache
+    cache = config.cache_directory()
+    for updated, entry in ids.values():
         # compute cache file name based on the id
         cache_file = filename(cache, entry.id)
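Taken on its own, the selection step added above amounts to: key entries by id, use updated (falling back to published) as a string-comparable timestamp, and keep the newest entry per key. A minimal standalone sketch of that technique, with plain dicts standing in for feedparser entries (select_latest is an illustrative name, not part of the commit):

def select_latest(entries):
    """Keep only the newest entry for each id (illustrative sketch of the
    selection step added to writeCache above; not the project's own code)."""
    ids = {}
    for entry in entries:
        # the real code reconstitutes a missing id first; here we just skip
        if not entry.get('id'):
            continue
        # prefer 'updated', fall back to 'published'; ISO 8601 timestamps in a
        # single timezone compare correctly as plain strings
        updated = entry.get('updated') or entry.get('published') or ''
        if updated >= ids.get(entry['id'], ('',))[0]:
            ids[entry['id']] = (updated, entry)
    return [entry for updated, entry in ids.values()]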


@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <link rel="self" href="http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom"/>
+  <id>tag:planet.intertwingly.net,2006:testfeed1</id>
+  <title>Sam Ruby</title>
+  <subtitle>It’s just data</subtitle>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <updated>2006-06-16T20:15:18-04:00</updated>
+  <link href="http://www.intertwingly.net/blog/"/>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/1"/>
+    <title>Mercury</title>
+    <content>one</content>
+    <updated>2006-01-01T00:00:00Z</updated>
+  </entry>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/3"/>
+    <title>Earth</title>
+    <content>three</content>
+    <updated>2006-01-03T00:00:00Z</updated>
+  </entry>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/2"/>
+    <title>Venus</title>
+    <content>two</content>
+    <updated>2006-01-02T00:00:00Z</updated>
+  </entry>
+</feed>
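The three entries in this fixture deliberately share one id while their updated dates are out of document order (Mercury 2006-01-01, Earth 2006-01-03, Venus 2006-01-02), so the newest entry is the middle one. Run through the select_latest sketch shown earlier (again with plain dicts standing in for parsed entries), only Earth should survive:

entries = [
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Mercury', 'content': 'one', 'updated': '2006-01-01T00:00:00Z'},
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Earth', 'content': 'three', 'updated': '2006-01-03T00:00:00Z'},
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Venus', 'content': 'two', 'updated': '2006-01-02T00:00:00Z'},
]
survivors = select_latest(entries)
print([e['title'] for e in survivors])   # ['Earth']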


@@ -88,6 +88,14 @@ class SpiderTest(unittest.TestCase):
         self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
+    def test_spiderFeedUpdatedEntries(self):
+        config.load(configfile)
+        self.spiderFeed(testfeed % '4')
+        self.assertEqual(2, len(glob.glob(workdir+"/*")))
+        data = feedparser.parse(workdir +
+            '/planet.intertwingly.net,2006,testfeed4')
+        self.assertEqual(u'three', data.entries[0].content[0].value)
+
     def verify_spiderPlanet(self):
         files = glob.glob(workdir+"/*")
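The new test checks that only two items land in the cache working directory and then parses the surviving entry's cache file directly. The path it hard-codes hints at how an Atom id is turned into a cache file name: tag:planet.intertwingly.net,2006:testfeed4 becomes planet.intertwingly.net,2006,testfeed4. A rough, hypothetical approximation of that mapping, inferred only from this path (the real filename() helper in spider.py handles many more cases and characters):

import os
import re

def cache_filename(cache_dir, entry_id):
    # Hypothetical sketch only: drop the URI scheme and substitute characters
    # that are unsafe in file names; the project's filename() is more thorough.
    name = re.sub(r'^\w+:/*', '', entry_id)   # strip 'tag:' or 'http://' prefix
    name = re.sub(r'[^\w.,-]', ',', name)     # e.g. ':' becomes ','
    return os.path.join(cache_dir, name)

print(cache_filename('cache', 'tag:planet.intertwingly.net,2006:testfeed4'))
# cache/planet.intertwingly.net,2006,testfeed4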