Only look for DOCTYPE and ENTITY declarations at the beginning of the doc

http://xn--8ws00zhy3a.com/blog/2007/10/obfuscated-atom
This commit is contained in:
Sam Ruby 2007-10-24 11:17:59 -04:00
parent d90070f0de
commit acad3937f8

View File

@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
""" """
__version__ = "4.2-pre-" + "$Revision: 265 $"[11:14] + "-svn" __version__ = "4.2-pre-" + "$Revision: 266 $"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved. __license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification, Redistribution and use in source and binary forms, with or without modification,
@@ -3297,11 +3297,15 @@ def _stripDoctype(data):
rss_version may be 'rss091n' or None rss_version may be 'rss091n' or None
stripped_data is the same XML document, minus the DOCTYPE stripped_data is the same XML document, minus the DOCTYPE
''' '''
entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE) start = re.search('<\w',data)
entity_results=entity_pattern.findall(data) start = start and start.start() or -1
data = entity_pattern.sub('', data) head,data = data[:start+1], data[start+1:]
doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
doctype_results = doctype_pattern.findall(data) entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
entity_results=entity_pattern.findall(head)
head = entity_pattern.sub('', head)
doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
doctype_results = doctype_pattern.findall(head)
doctype = doctype_results and doctype_results[0] or '' doctype = doctype_results and doctype_results[0] or ''
if doctype.lower().count('netscape'): if doctype.lower().count('netscape'):
version = 'rss091n' version = 'rss091n'
@@ -3315,7 +3319,7 @@ def _stripDoctype(data):
safe_entities=filter(lambda e: safe_pattern.match(e),entity_results) safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
if safe_entities: if safe_entities:
replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities) replacement='<!DOCTYPE feed [\n <!ENTITY %s>\n]>' % '>\n <!ENTITY '.join(safe_entities)
data = doctype_pattern.sub(replacement, data) data = doctype_pattern.sub(replacement, head) + data
return version, data, dict(replacement and safe_pattern.findall(replacement)) return version, data, dict(replacement and safe_pattern.findall(replacement))