From 552dae3c1431ac87985078c1aa5064ea3d835950 Mon Sep 17 00:00:00 2001
From: Mikael Nilsson
Date: Tue, 1 Jun 2010 16:55:51 +0200
Subject: [PATCH] First attempt at blacklisting

---
 planet/config.py | 13 ++++++++++---
 planet/spider.py |  8 ++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/planet/config.py b/planet/config.py
index e1325d1..0b7f5a4 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -303,7 +303,7 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
 
 def http_cache_directory():
     if parser.has_option('Planet', 'http_cache_directory'):
-        os.path.join(cache_directory(),
+        return os.path.join(cache_directory(),
             parser.get('Planet', 'http_cache_directory'))
     else:
         return os.path.join(cache_directory(), "cache")
@@ -315,9 +315,16 @@ def cache_sources_directory():
     else:
         return os.path.join(cache_directory(), 'sources')
 
+def cache_blacklist_directory():
+    if parser.has_option('Planet', 'cache_blacklist_directory'):
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'cache_blacklist_directory'))
+    else:
+        return os.path.join(cache_directory(), 'blacklist')
+
 def cache_lists_directory():
     if parser.has_option('Planet', 'cache_lists_directory'):
-        parser.get('Planet', 'cache_lists_directory')
+        return parser.get('Planet', 'cache_lists_directory')
     else:
         return os.path.join(cache_directory(), 'lists')
 
@@ -332,7 +339,7 @@ def feed():
 
 def feedtype():
     if parser.has_option('Planet', 'feedtype'):
-        parser.get('Planet', 'feedtype')
+        return parser.get('Planet', 'feedtype')
     elif feed() and feed().find('atom')>=0:
         return 'atom'
     elif feed() and feed().find('rss')>=0:
diff --git a/planet/spider.py b/planet/spider.py
index 71de07b..034b312 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -69,6 +69,7 @@ def _is_http_uri(uri):
 def writeCache(feed_uri, feed_info, data):
     log = planet.logger
     sources = config.cache_sources_directory()
+    blacklist = config.cache_blacklist_directory()
 
     # capture http status
     if not data.has_key("status"):
@@ -190,6 +191,13 @@ def writeCache(feed_uri, feed_info, data):
 
     cache = config.cache_directory()
     for updated, entry in ids.values():
+        # compute blacklist file name based on the id
+        blacklist_file = filename(blacklist, entry.id)
+
+        # check if blacklist file exists. If so, skip it.
+        if os.path.exists(blacklist_file):
+            continue
+
         # compute cache file name based on the id
         cache_file = filename(cache, entry.id)
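
For reference, a minimal sketch of how an entry could be blacklisted once this
patch is applied: writeCache() skips any entry whose id maps, via the existing
filename() helper in planet/spider.py, to an existing file in the blacklist
directory, so an empty marker file there is enough. The blacklist_entry()
helper and the 'config.ini' path below are illustrative assumptions, not part
of the patch.

    # Minimal sketch (assumes this patch is applied to a Planet/Venus tree).
    # blacklist_entry() is a hypothetical helper, not part of the patch.
    import os

    from planet import config
    from planet.spider import filename

    config.load('config.ini')  # assumed path to the planet's config file

    def blacklist_entry(entry_id):
        # writeCache() skips any entry whose id maps to an existing file
        # in the blacklist directory, so an empty marker file suffices.
        blacklist = config.cache_blacklist_directory()
        if not os.path.exists(blacklist):
            os.makedirs(blacklist)
        open(filename(blacklist, entry_id), 'w').close()

    blacklist_entry('http://example.com/blog/unwanted-entry')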