From acad3937f8101809b259baf3ff6a1d1910209c37 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 24 Oct 2007 11:17:59 -0400 Subject: [PATCH] Only look for DOCTYPE and ENTITY declarations at the beginning of the doc http://xn--8ws00zhy3a.com/blog/2007/10/obfuscated-atom --- planet/vendor/feedparser.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/planet/vendor/feedparser.py b/planet/vendor/feedparser.py index ec46598..9961c34 100755 --- a/planet/vendor/feedparser.py +++ b/planet/vendor/feedparser.py @@ -11,7 +11,7 @@ Recommended: Python 2.3 or later Recommended: CJKCodecs and iconv_codec """ -__version__ = "4.2-pre-" + "$Revision: 265 $"[11:14] + "-svn" +__version__ = "4.2-pre-" + "$Revision: 266 $"[11:14] + "-svn" __license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -3297,11 +3297,15 @@ def _stripDoctype(data): rss_version may be 'rss091n' or None stripped_data is the same XML document, minus the DOCTYPE ''' - entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE) - entity_results=entity_pattern.findall(data) - data = entity_pattern.sub('', data) - doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE) - doctype_results = doctype_pattern.findall(data) + start = re.search('<\w',data) + start = start and start.start() or -1 + head,data = data[:start+1], data[start+1:] + + entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) + entity_results=entity_pattern.findall(head) + head = entity_pattern.sub('', head) + doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) + doctype_results = doctype_pattern.findall(head) doctype = doctype_results and doctype_results[0] or '' if doctype.lower().count('netscape'): version = 'rss091n' @@ -3315,7 +3319,7 @@ def _stripDoctype(data): safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) if safe_entities: replacement='\n]>' % '>\n