Only look for DOCTYPE and ENTITY declarations at the beginning of the doc

http://xn--8ws00zhy3a.com/blog/2007/10/obfuscated-atom
2007-10-24 11:17:59 -04:00 · 2007-10-24 11:17:59 -04:00 · acad3937f8
commit acad3937f8
parent d90070f0de
1 changed file with 11 additions and 7 deletions
--- a/planet/vendor/feedparser.py
+++ b/planet/vendor/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 265 $"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 266 $"[11:14] + "-svn"
 __license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@@ -3297,11 +3297,15 @@ def _stripDoctype(data):
    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
-    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
-    entity_results=entity_pattern.findall(data)
-    data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
-    doctype_results = doctype_pattern.findall(data)
+    start = re.search('<\w',data)
+    start = start and start.start() or -1
+    head,data = data[:start+1], data[start+1:]
+    
+    entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
+    entity_results=entity_pattern.findall(head)
+    head = entity_pattern.sub('', head)
+    doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
+    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
@@ -3315,7 +3319,7 @@ def _stripDoctype(data):
       safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
       if safe_entities:
           replacement='<!DOCTYPE feed [\n  <!ENTITY %s>\n]>' % '>\n  <!ENTITY '.join(safe_entities)
-    data = doctype_pattern.sub(replacement, data)
+    data = doctype_pattern.sub(replacement, head) + data

    return version, data, dict(replacement and safe_pattern.findall(replacement))