diff --git a/planet/spider.py b/planet/spider.py index 0ad530d..454ef5c 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -14,8 +14,8 @@ except: PrettyPrint = None # Regular expressions to sanitise cache filenames -re_url_scheme = re.compile(r'^[^:]*://') -re_slash = re.compile(r'[?/]+') +re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?') +re_slash = re.compile(r'[?/:]+') re_initial_cruft = re.compile(r'^[,.]*') re_final_cruft = re.compile(r'[,.]*$') @@ -68,9 +68,13 @@ def spiderFeed(feed): xml = reconstitute.reconstitute(data, entry) file = open(out,'w') - if PrettyPrint: + try: PrettyPrint(reconstitute.reconstitute(data, entry), file) - else: + except: + # known reasons for failure include no pretty printer installed, + # and absurdly high levels of markup nesting causing Python to + # declare infinite recursion. + file.seek(0) file.write(reconstitute.reconstitute(data, entry).toxml('utf-8')) file.close() diff --git a/tests/test_spider.py b/tests/test_spider.py index 0954539..3bd9284 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -24,8 +24,12 @@ class SpiderTest(unittest.TestCase): def test_filename(self): self.assertEqual('./example.com,index.html', filename('.', 'http://example.com/index.html')) - self.assertEqual('./www.xn--8ws00zhy3a.com', + self.assertEqual('./xn--8ws00zhy3a.com', filename('.', u'http://www.\u8a79\u59c6\u65af.com/')) + self.assertEqual('./planet.intertwingly.net,2006,testfeed1,1', + filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1')) + self.assertEqual('./00000000-0000-0000-0000-000000000000', + filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000')) def test_spiderFeed(self): config.load(configfile) @@ -37,7 +41,7 @@ class SpiderTest(unittest.TestCase): # verify that the file names are as expected self.assertTrue(workdir + - '/tag:planet.intertwingly.net,2006:testfeed1,1' in files) + '/planet.intertwingly.net,2006,testfeed1,1' in files) # verify that the file timestamps match atom:updated for file in files: @@ -59,7 +63,7 @@ class SpiderTest(unittest.TestCase): # verify that the file names are as expected self.assertTrue(workdir + - '/tag:planet.intertwingly.net,2006:testfeed1,1' in files) + '/planet.intertwingly.net,2006,testfeed1,1' in files) self.assertTrue(workdir + - '/tag:planet.intertwingly.net,2006:testfeed2,1' in files) + '/planet.intertwingly.net,2006,testfeed2,1' in files)