filters/xhtml2html =~ s/Genshi/html5lib/
This commit is contained in:
parent
1fcfbe35c0
commit
4b1e0da922
@ -84,8 +84,8 @@ then the output stream is
|
|||||||
through the specified filter and the output is placed into the named file; the
|
through the specified filter and the output is placed into the named file; the
|
||||||
other unmodified branch continues onto the next filter, if any.
|
other unmodified branch continues onto the next filter, if any.
|
||||||
One use case for this function is to use
|
One use case for this function is to use
|
||||||
<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
|
<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
|
||||||
an HTML output stream from one source.</li>
|
and an HTML output stream from one source.</li>
|
||||||
|
|
||||||
<li>Templates written using htmltmpl or django currently only have access to a
|
<li>Templates written using htmltmpl or django currently only have access to a
|
||||||
fixed set of fields, whereas XSLT and genshi templates have access to
|
fixed set of fields, whereas XSLT and genshi templates have access to
|
||||||
|
21
filters/xhtml2html.plugin
Normal file
21
filters/xhtml2html.plugin
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
# Example usages:
|
||||||
|
#
|
||||||
|
# filters:
|
||||||
|
# xhtml2html.plugin?quote_attr_values=True&quote_char="'"
|
||||||
|
#
|
||||||
|
# -- or --
|
||||||
|
#
|
||||||
|
# [xhtml2html.plugin]
|
||||||
|
# quote_attr_values=True
|
||||||
|
# quote_char="'"
|
||||||
|
|
||||||
|
import sys
|
||||||
|
opts = zip(sys.argv[1::2],sys.argv[2::2])
|
||||||
|
opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
|
||||||
|
|
||||||
|
from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
|
||||||
|
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
|
||||||
|
tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
|
||||||
|
serializer = serializer.HTMLSerializer(**dict(opts))
|
||||||
|
for text in serializer.serialize(tokens, encoding='utf-8'):
|
||||||
|
sys.stdout.write(text)
|
@ -1,5 +0,0 @@
|
|||||||
import sys
|
|
||||||
from genshi.input import XMLParser
|
|
||||||
from genshi.output import HTMLSerializer
|
|
||||||
|
|
||||||
print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
|
|
35
planet/vendor/html5lib/html5parser.py
vendored
35
planet/vendor/html5lib/html5parser.py
vendored
@ -1,16 +1,10 @@
|
|||||||
|
|
||||||
# Differences from the current specification (23 December 2006) are as follows:
|
# Differences from the current specification (23 December 2006) are as follows:
|
||||||
# * Phases and insertion modes are one concept in parser.py.
|
# * Phases and insertion modes are one concept in parser.py.
|
||||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||||
# always exist.
|
# always exist.
|
||||||
# * </br> creates a <br> element.
|
|
||||||
#
|
#
|
||||||
# We haven't updated DOCTYPE handling yet
|
# We haven't updated DOCTYPE handling yet
|
||||||
#
|
|
||||||
# It should be trivial to add the following cases. However, we should probably
|
|
||||||
# also look into comment handling and such then...
|
|
||||||
# * A <p> element end tag creates an empty <p> element when there's no <p>
|
|
||||||
# element in scope.
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
frozenset
|
frozenset
|
||||||
@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
|
|||||||
self.startTagHandler.default = self.startTagOther
|
self.startTagHandler.default = self.startTagOther
|
||||||
|
|
||||||
self.endTagHandler = utils.MethodDispatcher([
|
self.endTagHandler = utils.MethodDispatcher([
|
||||||
(("html", "head", "body", "br"), self.endTagImplyHead)
|
(("html", "head", "body", "br", "p"), self.endTagImplyHead)
|
||||||
])
|
])
|
||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
||||||
@ -530,7 +524,7 @@ class InHeadPhase(Phase):
|
|||||||
|
|
||||||
self. endTagHandler = utils.MethodDispatcher([
|
self. endTagHandler = utils.MethodDispatcher([
|
||||||
("head", self.endTagHead),
|
("head", self.endTagHead),
|
||||||
(("html", "body", "br"), self.endTagImplyAfterHead),
|
(("html", "body", "br", "p"), self.endTagImplyAfterHead),
|
||||||
(("title", "style", "script"), self.endTagTitleStyleScript)
|
(("title", "style", "script"), self.endTagTitleStyleScript)
|
||||||
])
|
])
|
||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
@ -994,9 +988,13 @@ class InBodyPhase(Phase):
|
|||||||
if self.tree.elementInScope("p"):
|
if self.tree.elementInScope("p"):
|
||||||
self.tree.generateImpliedEndTags("p")
|
self.tree.generateImpliedEndTags("p")
|
||||||
if self.tree.openElements[-1].name != "p":
|
if self.tree.openElements[-1].name != "p":
|
||||||
self.parser.parseError("Unexpected end tag (p).")
|
self.parser.parseError(_("Unexpected end tag (p)."))
|
||||||
while self.tree.elementInScope("p"):
|
if self.tree.elementInScope("p"):
|
||||||
self.tree.openElements.pop()
|
while self.tree.elementInScope("p"):
|
||||||
|
self.tree.openElements.pop()
|
||||||
|
else:
|
||||||
|
self.startTagCloseP("p", {})
|
||||||
|
self.endTagP("p")
|
||||||
|
|
||||||
def endTagBody(self, name):
|
def endTagBody(self, name):
|
||||||
# XXX Need to take open <p> tags into account here. We shouldn't imply
|
# XXX Need to take open <p> tags into account here. We shouldn't imply
|
||||||
@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
|
|||||||
if inScope:
|
if inScope:
|
||||||
self.tree.generateImpliedEndTags()
|
self.tree.generateImpliedEndTags()
|
||||||
if self.tree.openElements[-1].name != name:
|
if self.tree.openElements[-1].name != name:
|
||||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||||
u"early. Expected other end tag."))
|
u"early. Expected other end tag."))
|
||||||
if inScope:
|
if inScope:
|
||||||
node = self.tree.openElements.pop()
|
node = self.tree.openElements.pop()
|
||||||
@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
|
|||||||
node = self.tree.openElements.pop()
|
node = self.tree.openElements.pop()
|
||||||
|
|
||||||
def endTagForm(self, name):
|
def endTagForm(self, name):
|
||||||
self.endTagBlock(name)
|
if self.tree.elementInScope(name):
|
||||||
|
self.tree.generateImpliedEndTags()
|
||||||
|
if self.tree.openElements[-1].name != name:
|
||||||
|
self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
|
||||||
|
else:
|
||||||
|
self.tree.openElements.pop()
|
||||||
self.tree.formPointer = None
|
self.tree.formPointer = None
|
||||||
|
|
||||||
def endTagListItem(self, name):
|
def endTagListItem(self, name):
|
||||||
@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
|
|||||||
if self.tree.elementInScope(name):
|
if self.tree.elementInScope(name):
|
||||||
self.tree.generateImpliedEndTags(name)
|
self.tree.generateImpliedEndTags(name)
|
||||||
if self.tree.openElements[-1].name != name:
|
if self.tree.openElements[-1].name != name:
|
||||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||||
u"early. Expected other end tag."))
|
u"early. Expected other end tag."))
|
||||||
|
|
||||||
if self.tree.elementInScope(name):
|
if self.tree.elementInScope(name):
|
||||||
@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
|
|||||||
self.tree.generateImpliedEndTags()
|
self.tree.generateImpliedEndTags()
|
||||||
break
|
break
|
||||||
if self.tree.openElements[-1].name != name:
|
if self.tree.openElements[-1].name != name:
|
||||||
self.parser.parseError((u"Unexpected end tag (" + name + "). "
|
self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
|
||||||
u"Expected other end tag."))
|
u"Expected other end tag."))
|
||||||
|
|
||||||
for item in headingElements:
|
for item in headingElements:
|
||||||
|
6
planet/vendor/html5lib/inputstream.py
vendored
6
planet/vendor/html5lib/inputstream.py
vendored
@ -53,6 +53,7 @@ class HTMLInputStream(object):
|
|||||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
||||||
|
|
||||||
self.queue = []
|
self.queue = []
|
||||||
|
self.errors = []
|
||||||
|
|
||||||
self.line = self.col = 0
|
self.line = self.col = 0
|
||||||
self.lineLengths = []
|
self.lineLengths = []
|
||||||
@ -214,7 +215,10 @@ class HTMLInputStream(object):
|
|||||||
return EOF
|
return EOF
|
||||||
|
|
||||||
# Normalize newlines and null characters
|
# Normalize newlines and null characters
|
||||||
if c == '\x00': c = u'\uFFFD'
|
if c == '\x00':
|
||||||
|
self.errors.append('null character found in input stream, '
|
||||||
|
'replaced with U+FFFD')
|
||||||
|
c = u'\uFFFD'
|
||||||
if c == '\r':
|
if c == '\r':
|
||||||
c = self.dataStream.read(1, 1)
|
c = self.dataStream.read(1, 1)
|
||||||
if c != '\n':
|
if c != '\n':
|
||||||
|
15
planet/vendor/html5lib/liberalxmlparser.py
vendored
15
planet/vendor/html5lib/liberalxmlparser.py
vendored
@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
|
|||||||
|
|
||||||
# For EmptyTags, process both a Start and an End tag
|
# For EmptyTags, process both a Start and an End tag
|
||||||
if token["type"] == "EmptyTag":
|
if token["type"] == "EmptyTag":
|
||||||
|
save = self.tokenizer.contentModelFlag
|
||||||
self.phase.processStartTag(token["name"], token["data"])
|
self.phase.processStartTag(token["name"], token["data"])
|
||||||
|
self.tokenizer.contentModelFlag = save
|
||||||
token["data"] = {}
|
token["data"] = {}
|
||||||
token["type"] = "EndTag"
|
token["type"] = "EndTag"
|
||||||
|
|
||||||
@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
|
|||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
self.phases["initial"] = XmlInitialPhase(self, self.tree)
|
||||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
def normalizeToken(self, token):
|
||||||
@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
|
|||||||
self.tree.document.appendChild(element)
|
self.tree.document.appendChild(element)
|
||||||
self.parser.phase = self.parser.phases["beforeHead"]
|
self.parser.phase = self.parser.phases["beforeHead"]
|
||||||
|
|
||||||
|
class XmlInitialPhase(html5parser.InitialPhase):
|
||||||
|
""" Consume XML Prologs """
|
||||||
|
def processComment(self, data):
|
||||||
|
if not data.startswith('?xml') or not data.endswith('?'):
|
||||||
|
html5parser.InitialPhase.processComment(self, data)
|
||||||
|
|
||||||
class XmlRootPhase(html5parser.Phase):
|
class XmlRootPhase(html5parser.Phase):
|
||||||
|
""" Consume XML Prologs """
|
||||||
|
def processComment(self, data):
|
||||||
|
print repr(data)
|
||||||
|
if not data.startswith('?xml') or not data.endswith('?'):
|
||||||
|
html5parser.InitialPhase.processComment(self, data)
|
||||||
|
|
||||||
""" Prime the Xml parser """
|
""" Prime the Xml parser """
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
self.tree.openElements.append(self.tree.document)
|
self.tree.openElements.append(self.tree.document)
|
||||||
|
69
planet/vendor/html5lib/sanitizer.py
vendored
69
planet/vendor/html5lib/sanitizer.py
vendored
@ -2,7 +2,7 @@ import re
|
|||||||
from xml.sax.saxutils import escape, unescape
|
from xml.sax.saxutils import escape, unescape
|
||||||
from tokenizer import HTMLTokenizer
|
from tokenizer import HTMLTokenizer
|
||||||
|
|
||||||
class HTMLSanitizer(HTMLTokenizer):
|
class HTMLSanitizerMixin:
|
||||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||||
|
|
||||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||||
@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
|
|||||||
# => <script> do_nasty_stuff() </script>
|
# => <script> do_nasty_stuff() </script>
|
||||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||||
# => <a>Click here for $100</a>
|
# => <a>Click here for $100</a>
|
||||||
def __iter__(self):
|
def sanitize_token(self, token):
|
||||||
for token in HTMLTokenizer.__iter__(self):
|
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
if token["name"] in self.allowed_elements:
|
||||||
if token["name"] in self.allowed_elements:
|
if token.has_key("data"):
|
||||||
if token.has_key("data"):
|
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
for attr in self.attr_val_is_uri:
|
||||||
for attr in self.attr_val_is_uri:
|
if not attrs.has_key(attr): continue
|
||||||
if not attrs.has_key(attr): continue
|
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
del attrs[attr]
|
||||||
del attrs[attr]
|
if attrs.has_key('style'):
|
||||||
if attrs.has_key('style'):
|
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
return token
|
||||||
yield token
|
|
||||||
else:
|
|
||||||
if token["type"] == "EndTag":
|
|
||||||
token["data"] = "</%s>" % token["name"]
|
|
||||||
elif token["data"]:
|
|
||||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
|
||||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
|
||||||
else:
|
|
||||||
token["data"] = "<%s>" % token["name"]
|
|
||||||
if token["type"] == "EmptyTag":
|
|
||||||
token["data"]=token["data"][:-1] + "/>"
|
|
||||||
token["type"] = "Characters"
|
|
||||||
del token["name"]
|
|
||||||
yield token
|
|
||||||
elif token["type"] == "Comment":
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
yield token
|
if token["type"] == "EndTag":
|
||||||
|
token["data"] = "</%s>" % token["name"]
|
||||||
|
elif token["data"]:
|
||||||
|
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||||
|
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||||
|
else:
|
||||||
|
token["data"] = "<%s>" % token["name"]
|
||||||
|
if token["type"] == "EmptyTag":
|
||||||
|
token["data"]=token["data"][:-1] + "/>"
|
||||||
|
token["type"] = "Characters"
|
||||||
|
del token["name"]
|
||||||
|
return token
|
||||||
|
elif token["type"] == "Comment":
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
return token
|
||||||
|
|
||||||
def sanitize_css(self, style):
|
def sanitize_css(self, style):
|
||||||
# disallow urls
|
# disallow urls
|
||||||
@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
|
|||||||
clean.append(prop + ': ' + value + ';')
|
clean.append(prop + ': ' + value + ';')
|
||||||
|
|
||||||
return ' '.join(clean)
|
return ' '.join(clean)
|
||||||
|
|
||||||
|
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||||
|
def __iter__(self):
|
||||||
|
for token in HTMLTokenizer.__iter__(self):
|
||||||
|
token = self.sanitize_token(token)
|
||||||
|
if token: yield token
|
||||||
|
@ -7,10 +7,6 @@ except NameError:
|
|||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
from html5lib.filters.whitespace import Filter as WhitespaceFilter
|
|
||||||
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
|
|
||||||
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
|
|
||||||
|
|
||||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||||
from html5lib.constants import rcdataElements
|
from html5lib.constants import rcdataElements
|
||||||
|
|
||||||
@ -67,17 +63,16 @@ class HTMLSerializer(object):
|
|||||||
escape_lt_in_attrs = False
|
escape_lt_in_attrs = False
|
||||||
escape_rcdata = False
|
escape_rcdata = False
|
||||||
|
|
||||||
omit_optional_tags = True
|
|
||||||
|
|
||||||
strip_whitespace = False
|
|
||||||
|
|
||||||
inject_meta_charset = True
|
inject_meta_charset = True
|
||||||
|
strip_whitespace = False
|
||||||
|
sanitize = False
|
||||||
|
omit_optional_tags = True
|
||||||
|
|
||||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||||
"space_before_trailing_solidus", "omit_optional_tags",
|
"space_before_trailing_solidus", "omit_optional_tags",
|
||||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||||
"escape_rcdata")
|
"escape_rcdata", 'use_trailing_solidus', "sanitize")
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
if kwargs.has_key('quote_char'):
|
if kwargs.has_key('quote_char'):
|
||||||
@ -91,13 +86,19 @@ class HTMLSerializer(object):
|
|||||||
in_cdata = False
|
in_cdata = False
|
||||||
self.errors = []
|
self.errors = []
|
||||||
if encoding and self.inject_meta_charset:
|
if encoding and self.inject_meta_charset:
|
||||||
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
|
from html5lib.filters.inject_meta_charset import Filter
|
||||||
|
treewalker = Filter(treewalker, encoding)
|
||||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||||
# for maximum efficiency of this latter filter
|
# for maximum efficiency of this latter filter
|
||||||
if self.strip_whitespace:
|
if self.strip_whitespace:
|
||||||
treewalker = WhitespaceFilter(treewalker)
|
from html5lib.filters.whitespace import Filter
|
||||||
|
treewalker = Filter(treewalker)
|
||||||
|
if self.sanitize:
|
||||||
|
from html5lib.filters.sanitizer import Filter
|
||||||
|
treewalker = Filter(treewalker)
|
||||||
if self.omit_optional_tags:
|
if self.omit_optional_tags:
|
||||||
treewalker = OptionalTagFilter(treewalker)
|
from html5lib.filters.optionaltags import Filter
|
||||||
|
treewalker = Filter(treewalker)
|
||||||
for token in treewalker:
|
for token in treewalker:
|
||||||
type = token["type"]
|
type = token["type"]
|
||||||
if type == "Doctype":
|
if type == "Doctype":
|
||||||
|
13
planet/vendor/html5lib/tokenizer.py
vendored
13
planet/vendor/html5lib/tokenizer.py
vendored
@ -93,6 +93,8 @@ class HTMLTokenizer(object):
|
|||||||
# Start processing. When EOF is reached self.state will return False
|
# Start processing. When EOF is reached self.state will return False
|
||||||
# instead of True and the loop will terminate.
|
# instead of True and the loop will terminate.
|
||||||
while self.state():
|
while self.state():
|
||||||
|
while self.stream.errors:
|
||||||
|
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
|
||||||
while self.tokenQueue:
|
while self.tokenQueue:
|
||||||
yield self.tokenQueue.pop(0)
|
yield self.tokenQueue.pop(0)
|
||||||
|
|
||||||
@ -130,7 +132,6 @@ class HTMLTokenizer(object):
|
|||||||
allowed = hexDigits
|
allowed = hexDigits
|
||||||
radix = 16
|
radix = 16
|
||||||
|
|
||||||
char = u"\uFFFD"
|
|
||||||
charStack = []
|
charStack = []
|
||||||
|
|
||||||
# Consume all the characters that are in range while making sure we
|
# Consume all the characters that are in range while making sure we
|
||||||
@ -155,8 +156,8 @@ class HTMLTokenizer(object):
|
|||||||
|
|
||||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||||
|
|
||||||
# 0 is not a good number, neither are illegal Unicode code points.
|
# 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
|
||||||
if charAsInt > 0 and charAsInt <= 1114111:
|
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
|
||||||
try:
|
try:
|
||||||
# XXX We should have a separate function that does "int" to
|
# XXX We should have a separate function that does "int" to
|
||||||
# "unicodestring" conversion since this doesn't always work
|
# "unicodestring" conversion since this doesn't always work
|
||||||
@ -167,7 +168,11 @@ class HTMLTokenizer(object):
|
|||||||
char = eval("u'\\U%08x'" % charAsInt)
|
char = eval("u'\\U%08x'" % charAsInt)
|
||||||
except:
|
except:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Numeric entity couldn't be converted to character.")})
|
_("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
|
||||||
|
else:
|
||||||
|
char = u"\uFFFD"
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
|
||||||
|
|
||||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||||
# invoke parseError on parser.
|
# invoke parseError on parser.
|
||||||
|
4
planet/vendor/html5lib/treebuilders/dom.py
vendored
4
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
|
|||||||
class TreeBuilder(_base.TreeBuilder):
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||||
def hilite(self, encoding):
|
|
||||||
print 'foo'
|
|
||||||
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
|
|
||||||
setattr(self.dom, 'hilite', method)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def insertDoctype(self, name):
|
def insertDoctype(self, name):
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
[Planet]
|
[Planet]
|
||||||
output_theme = genshi_fancy
|
output_theme = asf
|
||||||
output_dir = tests/work/apply
|
output_dir = tests/work/apply
|
||||||
name = test planet
|
name = test planet
|
||||||
cache_directory = tests/work/spider/cache
|
cache_directory = tests/work/spider/cache
|
||||||
@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
|
|||||||
bill_of_materials:
|
bill_of_materials:
|
||||||
images/#{face}
|
images/#{face}
|
||||||
|
|
||||||
[index.html.genshi]
|
[index.html.xslt]
|
||||||
filters:
|
filters:
|
||||||
xhtml2html.py>index.html4
|
xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4
|
||||||
|
|
||||||
[tests/data/spider/testfeed0.atom]
|
[tests/data/spider/testfeed0.atom]
|
||||||
name = not found
|
name = not found
|
||||||
|
@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
|
|||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
shutil.rmtree(os.path.split(workdir)[0])
|
shutil.rmtree(os.path.split(workdir)[0])
|
||||||
|
|
||||||
def test_apply_asf(self):
|
def apply_asf(self):
|
||||||
config.load(configfile % 'asf')
|
|
||||||
splice.apply(self.feeddata)
|
splice.apply(self.feeddata)
|
||||||
|
|
||||||
# verify that selected files are there
|
# verify that selected files are there
|
||||||
@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
|
|||||||
self.assertEqual(12, content)
|
self.assertEqual(12, content)
|
||||||
self.assertEqual(3, lang)
|
self.assertEqual(3, lang)
|
||||||
|
|
||||||
|
def test_apply_asf(self):
|
||||||
|
config.load(configfile % 'asf')
|
||||||
|
self.apply_asf()
|
||||||
|
|
||||||
def test_apply_classic_fancy(self):
|
def test_apply_classic_fancy(self):
|
||||||
config.load(configfile % 'fancy')
|
config.load(configfile % 'fancy')
|
||||||
self.apply_fancy()
|
self.apply_fancy()
|
||||||
@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
|
|||||||
|
|
||||||
def test_apply_filter_html(self):
|
def test_apply_filter_html(self):
|
||||||
config.load(configfile % 'html')
|
config.load(configfile % 'html')
|
||||||
self.apply_fancy()
|
self.apply_asf()
|
||||||
|
|
||||||
output = open(os.path.join(workdir, 'index.html')).read()
|
output = open(os.path.join(workdir, 'index.html')).read()
|
||||||
self.assertTrue(output.find('/>')>=0)
|
self.assertTrue(output.find('/>')>=0)
|
||||||
@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
|
|||||||
if method.startswith('test_'): break
|
if method.startswith('test_'): break
|
||||||
else:
|
else:
|
||||||
delattr(ApplyTest,'test_apply_genshi_fancy')
|
delattr(ApplyTest,'test_apply_genshi_fancy')
|
||||||
delattr(ApplyTest,'test_apply_filter_html')
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import libxml2
|
import libxml2
|
||||||
|
@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
|
|||||||
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
|
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
|
||||||
self.assertTrue(output.find('</script>')>=0)
|
self.assertTrue(output.find('</script>')>=0)
|
||||||
|
|
||||||
def test_xhtml2html_filter(self):
|
|
||||||
testfile = 'tests/data/filter/index.html'
|
|
||||||
filter = 'xhtml2html.py'
|
|
||||||
output = shell.run(filter, open(testfile).read(), mode="filter")
|
|
||||||
self.assertTrue(output.find('/>')<0)
|
|
||||||
self.assertTrue(output.find('</script>')>=0)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import genshi
|
import genshi
|
||||||
except:
|
except:
|
||||||
|
@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertEqual('', output)
|
self.assertEqual('', output)
|
||||||
|
|
||||||
|
def test_xhtml2html_filter(self):
|
||||||
|
testfile = 'tests/data/filter/index.html'
|
||||||
|
filter = 'xhtml2html.plugin?quote_attr_values=True'
|
||||||
|
output = shell.run(filter, open(testfile).read(), mode="filter")
|
||||||
|
self.assertTrue(output.find('/>')<0)
|
||||||
|
self.assertTrue(output.find('</script>')>=0)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user