Only look for DOCTYPE and ENTITY declarations at the beginning of the doc

http://xn--8ws00zhy3a.com/blog/2007/10/obfuscated-atom
This commit is contained in:
Sam Ruby 2007-10-24 11:17:59 -04:00
parent d90070f0de
commit acad3937f8

View File

@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 265 $"[11:14] + "-svn"
__version__ = "4.2-pre-" + "$Revision: 266 $"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@@ -3297,11 +3297,15 @@ def _stripDoctype(data):
rss_version may be 'rss091n' or None
stripped_data is the same XML document, minus the DOCTYPE
'''
entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
entity_results=entity_pattern.findall(data)
data = entity_pattern.sub('', data)
doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
doctype_results = doctype_pattern.findall(data)
start = re.search('<\w',data)
start = start and start.start() or -1
head,data = data[:start+1], data[start+1:]
entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
entity_results=entity_pattern.findall(head)
head = entity_pattern.sub('', head)
doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
doctype_results = doctype_pattern.findall(head)
doctype = doctype_results and doctype_results[0] or ''
if doctype.lower().count('netscape'):
version = 'rss091n'
@@ -3315,7 +3319,7 @@ def _stripDoctype(data):
safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
if safe_entities:
replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities)
data = doctype_pattern.sub(replacement, data)
data = doctype_pattern.sub(replacement, head) + data
return version, data, dict(replacement and safe_pattern.findall(replacement))