Update to the latest html5lib; replace feedparser's sanitizer with
html5lib's
This commit is contained in:
parent
63fa05e556
commit
6f0f23dd36
@ -16,7 +16,7 @@ Todo:
|
||||
import re, time, sgmllib
|
||||
from xml.sax.saxutils import escape
|
||||
from xml.dom import minidom, Node
|
||||
from html5lib import liberalxmlparser
|
||||
from html5lib import html5parser
|
||||
from html5lib.treebuilders import dom
|
||||
import planet, config
|
||||
|
||||
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
|
||||
bozo=1
|
||||
|
||||
if detail.type.find('xhtml')<0 or bozo:
|
||||
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
|
||||
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
|
||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||
for body in html.documentElement.childNodes:
|
||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||
|
@ -128,5 +128,11 @@ def scrub(feed_uri, data):
|
||||
node['value'] = feedparser._resolveRelativeURIs(
|
||||
node.value, node.base, 'utf-8', node.type)
|
||||
|
||||
node['value'] = feedparser._sanitizeHTML(
|
||||
node.value, 'utf-8', node.type)
|
||||
# Run this through HTML5's serializer
|
||||
from html5lib import html5parser, sanitizer, treewalkers, serializer
|
||||
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
|
||||
doc = p.parseFragment(node.value, encoding='utf-8')
|
||||
walker = treewalkers.getTreeWalker('simpletree')
|
||||
xhtml = serializer.XHTMLSerializer()
|
||||
tree = xhtml.serialize(walker(doc), encoding='utf-8')
|
||||
node['value'] = ''.join([n for n in tree])
|
||||
|
5
planet/vendor/html5lib/__init__.py
vendored
5
planet/vendor/html5lib/__init__.py
vendored
@ -11,5 +11,6 @@ f = open("my_document.html")
|
||||
p = html5lib.HTMLParser()
|
||||
tree = p.parse(f)
|
||||
"""
|
||||
from html5parser import HTMLParser
|
||||
from liberalxmlparser import XMLParser, XHTMLParser
|
||||
from html5parser import HTMLParser, parse
|
||||
from treebuilders import getTreeBuilder
|
||||
from serializer import serialize
|
||||
|
928
planet/vendor/html5lib/constants.py
vendored
928
planet/vendor/html5lib/constants.py
vendored
@ -1,4 +1,5 @@
|
||||
import string
|
||||
import string, gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
try:
|
||||
frozenset
|
||||
@ -9,6 +10,260 @@ except NameError:
|
||||
|
||||
EOF = None
|
||||
|
||||
# Parse-error messages, keyed by the error code that the tokenizer and parser
# report.  Every message goes through gettext (``_``) so it can be localised.
# Messages containing ``%(...)s`` / ``%(...)x`` placeholders are meant to be
# interpolated with a dict of the named values by whoever formats the error.
#
# NOTE: the keys are identifiers used by other modules, so they are kept
# exactly as-is even where they are misspelled (e.g. "...-soldius-...").
# Only the human-readable message text is corrected here.
E = {
    "null-character":
        _(u"Null character in input stream, replaced with U+FFFD."),
    "invalid-character":
        _(u"Invalid codepoint in stream."),
    "incorrectly-placed-solidus":
        _(u"Solidus (/) incorrectly placed in tag."),
    "incorrect-cr-newline-entity":
        _(u"Incorrect CR newline entity, replaced with LF."),
    "illegal-windows-1252-entity":
        _(u"Entity used with illegal number (windows-1252 reference)."),
    "cant-convert-numeric-entity":
        _(u"Numeric entity couldn't be converted to character "
          u"(codepoint U+%(charAsInt)08x)."),
    "illegal-codepoint-for-numeric-entity":
        _(u"Numeric entity represents an illegal codepoint: "
          u"U+%(charAsInt)08x."),
    "numeric-entity-without-semicolon":
        _(u"Numeric entity didn't end with ';'."),
    "expected-numeric-entity-but-got-eof":
        _(u"Numeric entity expected. Got end of file instead."),
    "expected-numeric-entity":
        _(u"Numeric entity expected but none found."),
    "named-entity-without-semicolon":
        _(u"Named entity didn't end with ';'."),
    "expected-named-entity":
        _(u"Named entity expected. Got none."),
    "attributes-in-end-tag":
        _(u"End tag contains unexpected attributes."),
    "expected-tag-name-but-got-right-bracket":
        _(u"Expected tag name. Got '>' instead."),
    "expected-tag-name-but-got-question-mark":
        _(u"Expected tag name. Got '?' instead. (HTML doesn't "
          u"support processing instructions.)"),
    "expected-tag-name":
        _(u"Expected tag name. Got something else instead"),
    "expected-closing-tag-but-got-right-bracket":
        _(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
    "expected-closing-tag-but-got-eof":
        _(u"Expected closing tag. Unexpected end of file."),
    "expected-closing-tag-but-got-char":
        _(u"Expected closing tag. Unexpected character '%(data)s' found."),
    "eof-in-tag-name":
        _(u"Unexpected end of file in the tag name."),
    "expected-attribute-name-but-got-eof":
        _(u"Unexpected end of file. Expected attribute name instead."),
    "eof-in-attribute-name":
        _(u"Unexpected end of file in attribute name."),
    "invalid-character-in-attribute-name":
        _(u"Invalid character in attribute name"),
    "duplicate-attribute":
        _(u"Dropped duplicate attribute on tag."),
    "expected-end-of-tag-name-but-got-eof":
        _(u"Unexpected end of file. Expected = or end of tag."),
    "expected-attribute-value-but-got-eof":
        _(u"Unexpected end of file. Expected attribute value."),
    "expected-attribute-value-but-got-right-bracket":
        _(u"Expected attribute value. Got '>' instead."),
    "eof-in-attribute-value-double-quote":
        _(u"Unexpected end of file in attribute value (\")."),
    "eof-in-attribute-value-single-quote":
        _(u"Unexpected end of file in attribute value (')."),
    "eof-in-attribute-value-no-quotes":
        _(u"Unexpected end of file in attribute value."),
    "unexpected-EOF-after-solidus-in-tag":
        _(u"Unexpected end of file in tag. Expected >"),
    # Key spelling ("soldius") is intentional-by-compatibility; see NOTE above.
    "unexpected-character-after-soldius-in-tag":
        _(u"Unexpected character after / in tag. Expected >"),
    "expected-dashes-or-doctype":
        _(u"Expected '--' or 'DOCTYPE'. Not found."),
    "incorrect-comment":
        _(u"Incorrect comment."),
    "eof-in-comment":
        _(u"Unexpected end of file in comment."),
    "eof-in-comment-end-dash":
        _(u"Unexpected end of file in comment (-)"),
    "unexpected-dash-after-double-dash-in-comment":
        _(u"Unexpected '-' after '--' found in comment."),
    "eof-in-comment-double-dash":
        _(u"Unexpected end of file in comment (--)."),
    "unexpected-char-in-comment":
        _(u"Unexpected character in comment found."),
    "need-space-after-doctype":
        _(u"No space after literal string 'DOCTYPE'."),
    "expected-doctype-name-but-got-right-bracket":
        _(u"Unexpected > character. Expected DOCTYPE name."),
    "expected-doctype-name-but-got-eof":
        _(u"Unexpected end of file. Expected DOCTYPE name."),
    "eof-in-doctype-name":
        _(u"Unexpected end of file in DOCTYPE name."),
    "eof-in-doctype":
        _(u"Unexpected end of file in DOCTYPE."),
    "expected-space-or-right-bracket-in-doctype":
        _(u"Expected space or '>'. Got '%(data)s'"),
    "unexpected-end-of-doctype":
        _(u"Unexpected end of DOCTYPE."),
    "unexpected-char-in-doctype":
        _(u"Unexpected character in DOCTYPE."),
    "eof-in-innerhtml":
        _(u"XXX innerHTML EOF"),
    "unexpected-doctype":
        _(u"Unexpected DOCTYPE. Ignored."),
    "non-html-root":
        _(u"html needs to be the first start tag."),
    "expected-doctype-but-got-eof":
        _(u"Unexpected End of file. Expected DOCTYPE."),
    "unknown-doctype":
        _(u"Erroneous DOCTYPE."),
    "expected-doctype-but-got-chars":
        _(u"Unexpected non-space characters. Expected DOCTYPE."),
    "expected-doctype-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
    "expected-doctype-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
    "end-tag-after-implied-root":
        _(u"Unexpected end tag (%(name)s) after the (implied) root element."),
    "expected-named-closing-tag-but-got-eof":
        _(u"Unexpected end of file. Expected end tag (%(name)s)."),
    "two-heads-are-not-better-than-one":
        _(u"Unexpected start tag head in existing head. Ignored."),
    "unexpected-end-tag":
        _(u"Unexpected end tag (%(name)s). Ignored."),
    "unexpected-start-tag-out-of-my-head":
        _(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
    "unexpected-start-tag":
        _(u"Unexpected start tag (%(name)s)."),
    "missing-end-tag":
        _(u"Missing end tag (%(name)s)."),
    "missing-end-tags":
        _(u"Missing end tags (%(name)s)."),
    "unexpected-start-tag-implies-end-tag":
        _(u"Unexpected start tag (%(startName)s) "
          u"implies end tag (%(endName)s)."),
    "unexpected-start-tag-treated-as":
        _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
    "deprecated-tag":
        _(u"Unexpected start tag %(name)s. Don't use it!"),
    "unexpected-start-tag-ignored":
        _(u"Unexpected start tag %(name)s. Ignored."),
    "expected-one-end-tag-but-got-another":
        _(u"Unexpected end tag (%(gotName)s). "
          u"Missing end tag (%(expectedName)s)."),
    "end-tag-too-early":
        _(u"End tag (%(name)s) seen too early. Expected other end tag."),
    "end-tag-too-early-named":
        _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
    "end-tag-too-early-ignored":
        _(u"End tag (%(name)s) seen too early. Ignored."),
    "adoption-agency-1.1":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 1 of the adoption agency algorithm."),
    "adoption-agency-1.2":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 2 of the adoption agency algorithm."),
    "adoption-agency-1.3":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 3 of the adoption agency algorithm."),
    "unexpected-end-tag-treated-as":
        _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
    "no-end-tag":
        _(u"This element (%(name)s) has no end tag."),
    "unexpected-implied-end-tag-in-table":
        _(u"Unexpected implied end tag (%(name)s) in the table phase."),
    "unexpected-implied-end-tag-in-table-body":
        _(u"Unexpected implied end tag (%(name)s) in the table body phase."),
    "unexpected-char-implies-table-voodoo":
        _(u"Unexpected non-space characters in "
          u"table context caused voodoo mode."),
    "unexpected-hidden-input-in-table":
        _(u"Unexpected input with type hidden in table context."),
    "unexpected-start-tag-implies-table-voodoo":
        _(u"Unexpected start tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-end-tag-implies-table-voodoo":
        _(u"Unexpected end tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-cell-in-table-body":
        _(u"Unexpected table cell start tag (%(name)s) "
          u"in the table body phase."),
    "unexpected-cell-end-tag":
        _(u"Got table cell end tag (%(name)s) "
          u"while required end tags are missing."),
    "unexpected-end-tag-in-table-body":
        _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
    "unexpected-implied-end-tag-in-table-row":
        _(u"Unexpected implied end tag (%(name)s) in the table row phase."),
    "unexpected-end-tag-in-table-row":
        _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
    "unexpected-select-in-select":
        _(u"Unexpected select start tag in the select phase "
          u"treated as select end tag."),
    "unexpected-input-in-select":
        _(u"Unexpected input start tag in the select phase."),
    "unexpected-start-tag-in-select":
        _(u"Unexpected start tag token (%(name)s) in the select phase. "
          u"Ignored."),
    "unexpected-end-tag-in-select":
        _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
    "unexpected-table-element-start-tag-in-select-in-table":
        _(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
    "unexpected-table-element-end-tag-in-select-in-table":
        _(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
    "unexpected-char-after-body":
        _(u"Unexpected non-space characters in the after body phase."),
    "unexpected-start-tag-after-body":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-end-tag-after-body":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-char-in-frameset":
        _(u"Unexpected characters in the frameset phase. Characters ignored."),
    "unexpected-start-tag-in-frameset":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-frameset-in-frameset-innerhtml":
        _(u"Unexpected end tag token (frameset) "
          u"in the frameset phase (innerHTML)."),
    "unexpected-end-tag-in-frameset":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-char-after-frameset":
        _(u"Unexpected non-space characters in the "
          u"after frameset phase. Ignored."),
    "unexpected-start-tag-after-frameset":
        _(u"Unexpected start tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-frameset":
        _(u"Unexpected end tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-body-innerhtml":
        _(u"Unexpected end tag after body(innerHtml)"),
    "expected-eof-but-got-char":
        _(u"Unexpected non-space characters. Expected end of file."),
    "expected-eof-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s)"
          u". Expected end of file."),
    "expected-eof-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s)"
          u". Expected end of file."),
    "eof-in-table":
        _(u"Unexpected end of file. Expected table content."),
    "eof-in-select":
        _(u"Unexpected end of file. Expected select content."),
    "eof-in-frameset":
        _(u"Unexpected end of file. Expected frameset content."),
    "non-void-element-with-trailing-solidus":
        _(u"Trailing solidus not allowed on element %(name)s"),
    "unexpected-html-element-in-foreign-content":
        _(u"Element %(name)s not allowed in a non-html context"),
    # Wrapped in _() like every other entry so it is localisable too.
    "XXX-undefined-error":
        _(u"Undefined error (this sucks and should be fixed)"),
}
|
||||
|
||||
contentModelFlags = {
|
||||
"PCDATA":0,
|
||||
"RCDATA":1,
|
||||
@ -16,101 +271,126 @@ contentModelFlags = {
|
||||
"PLAINTEXT":3
|
||||
}
|
||||
|
||||
# Short prefix -> namespace URI for the XML namespaces referenced by the
# element tables in this module (e.g. ``namespaces["html"]``).
namespaces = {
    "html": "http://www.w3.org/1999/xhtml",
    "mathml": "http://www.w3.org/1998/Math/MathML",
    "svg": "http://www.w3.org/2000/svg",
    "xlink": "http://www.w3.org/1999/xlink",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "xmlns": "http://www.w3.org/2000/xmlns/",
}
|
||||
|
||||
scopingElements = frozenset((
|
||||
"button",
|
||||
"caption",
|
||||
"html",
|
||||
"marquee",
|
||||
"object",
|
||||
"table",
|
||||
"td",
|
||||
"th"
|
||||
(namespaces["html"], "applet"),
|
||||
(namespaces["html"], "button"),
|
||||
(namespaces["html"], "caption"),
|
||||
(namespaces["html"], "html"),
|
||||
(namespaces["html"], "marquee"),
|
||||
(namespaces["html"], "object"),
|
||||
(namespaces["html"], "table"),
|
||||
(namespaces["html"], "td"),
|
||||
(namespaces["html"], "th"),
|
||||
(namespaces["svg"], "foreignObject")
|
||||
))
|
||||
|
||||
formattingElements = frozenset((
|
||||
"a",
|
||||
"b",
|
||||
"big",
|
||||
"em",
|
||||
"font",
|
||||
"i",
|
||||
"nobr",
|
||||
"s",
|
||||
"small",
|
||||
"strike",
|
||||
"strong",
|
||||
"tt",
|
||||
"u"
|
||||
(namespaces["html"], "a"),
|
||||
(namespaces["html"], "b"),
|
||||
(namespaces["html"], "big"),
|
||||
(namespaces["html"], "code"),
|
||||
(namespaces["html"], "em"),
|
||||
(namespaces["html"], "font"),
|
||||
(namespaces["html"], "i"),
|
||||
(namespaces["html"], "nobr"),
|
||||
(namespaces["html"], "s"),
|
||||
(namespaces["html"], "small"),
|
||||
(namespaces["html"], "strike"),
|
||||
(namespaces["html"], "strong"),
|
||||
(namespaces["html"], "tt"),
|
||||
(namespaces["html"], "u")
|
||||
))
|
||||
|
||||
specialElements = frozenset((
|
||||
"address",
|
||||
"area",
|
||||
"base",
|
||||
"basefont",
|
||||
"bgsound",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"center",
|
||||
"col",
|
||||
"colgroup",
|
||||
"dd",
|
||||
"dir",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"form",
|
||||
"frame",
|
||||
"frameset",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"hr",
|
||||
"iframe",
|
||||
"image",
|
||||
"img",
|
||||
"input",
|
||||
"isindex",
|
||||
"li",
|
||||
"link",
|
||||
"listing",
|
||||
"menu",
|
||||
"meta",
|
||||
"noembed",
|
||||
"noframes",
|
||||
"noscript",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"p",
|
||||
"param",
|
||||
"plaintext",
|
||||
"pre",
|
||||
"script",
|
||||
"select",
|
||||
"spacer",
|
||||
"style",
|
||||
"tbody",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"title",
|
||||
"tr",
|
||||
"ul",
|
||||
"wbr"
|
||||
(namespaces["html"], "address"),
|
||||
(namespaces["html"], "area"),
|
||||
(namespaces["html"], "article"),
|
||||
(namespaces["html"], "aside"),
|
||||
(namespaces["html"], "base"),
|
||||
(namespaces["html"], "basefont"),
|
||||
(namespaces["html"], "bgsound"),
|
||||
(namespaces["html"], "blockquote"),
|
||||
(namespaces["html"], "body"),
|
||||
(namespaces["html"], "br"),
|
||||
(namespaces["html"], "center"),
|
||||
(namespaces["html"], "col"),
|
||||
(namespaces["html"], "colgroup"),
|
||||
(namespaces["html"], "command"),
|
||||
(namespaces["html"], "datagrid"),
|
||||
(namespaces["html"], "dd"),
|
||||
(namespaces["html"], "details"),
|
||||
(namespaces["html"], "dialog"),
|
||||
(namespaces["html"], "dir"),
|
||||
(namespaces["html"], "div"),
|
||||
(namespaces["html"], "dl"),
|
||||
(namespaces["html"], "dt"),
|
||||
(namespaces["html"], "embed"),
|
||||
(namespaces["html"], "event-source"),
|
||||
(namespaces["html"], "fieldset"),
|
||||
(namespaces["html"], "figure"),
|
||||
(namespaces["html"], "footer"),
|
||||
(namespaces["html"], "form"),
|
||||
(namespaces["html"], "frame"),
|
||||
(namespaces["html"], "frameset"),
|
||||
(namespaces["html"], "h1"),
|
||||
(namespaces["html"], "h2"),
|
||||
(namespaces["html"], "h3"),
|
||||
(namespaces["html"], "h4"),
|
||||
(namespaces["html"], "h5"),
|
||||
(namespaces["html"], "h6"),
|
||||
(namespaces["html"], "head"),
|
||||
(namespaces["html"], "header"),
|
||||
(namespaces["html"], "hr"),
|
||||
(namespaces["html"], "iframe"),
|
||||
# Note that image is commented out in the spec as "this isn't an
|
||||
# element that can end up on the stack, so it doesn't matter,"
|
||||
(namespaces["html"], "image"),
|
||||
(namespaces["html"], "img"),
|
||||
(namespaces["html"], "input"),
|
||||
(namespaces["html"], "isindex"),
|
||||
(namespaces["html"], "li"),
|
||||
(namespaces["html"], "link"),
|
||||
(namespaces["html"], "listing"),
|
||||
(namespaces["html"], "menu"),
|
||||
(namespaces["html"], "meta"),
|
||||
(namespaces["html"], "nav"),
|
||||
(namespaces["html"], "noembed"),
|
||||
(namespaces["html"], "noframes"),
|
||||
(namespaces["html"], "noscript"),
|
||||
(namespaces["html"], "ol"),
|
||||
(namespaces["html"], "optgroup"),
|
||||
(namespaces["html"], "option"),
|
||||
(namespaces["html"], "p"),
|
||||
(namespaces["html"], "param"),
|
||||
(namespaces["html"], "plaintext"),
|
||||
(namespaces["html"], "pre"),
|
||||
(namespaces["html"], "script"),
|
||||
(namespaces["html"], "section"),
|
||||
(namespaces["html"], "select"),
|
||||
(namespaces["html"], "spacer"),
|
||||
(namespaces["html"], "style"),
|
||||
(namespaces["html"], "tbody"),
|
||||
(namespaces["html"], "textarea"),
|
||||
(namespaces["html"], "tfoot"),
|
||||
(namespaces["html"], "thead"),
|
||||
(namespaces["html"], "title"),
|
||||
(namespaces["html"], "tr"),
|
||||
(namespaces["html"], "ul"),
|
||||
(namespaces["html"], "wbr")
|
||||
))
|
||||
|
||||
spaceCharacters = frozenset((
|
||||
u"\t",
|
||||
u"\n",
|
||||
u"\u000B",
|
||||
u"\u000C",
|
||||
u" ",
|
||||
u"\r"
|
||||
@ -143,9 +423,10 @@ headingElements = (
|
||||
"h6"
|
||||
)
|
||||
|
||||
# XXX What about event-source and command?
|
||||
voidElements = frozenset((
|
||||
"base",
|
||||
"command",
|
||||
"event-source",
|
||||
"link",
|
||||
"meta",
|
||||
"hr",
|
||||
@ -155,7 +436,8 @@ voidElements = frozenset((
|
||||
"param",
|
||||
"area",
|
||||
"col",
|
||||
"input"
|
||||
"input",
|
||||
"source"
|
||||
))
|
||||
|
||||
cdataElements = frozenset(('title', 'textarea'))
|
||||
@ -440,7 +722,7 @@ entities = {
|
||||
"kappa;": u"\u03BA",
|
||||
"lArr;": u"\u21D0",
|
||||
"lambda;": u"\u03BB",
|
||||
"lang;": u"\u3008",
|
||||
"lang;": u"\u27E8",
|
||||
"laquo;": u"\u00AB",
|
||||
"laquo": u"\u00AB",
|
||||
"larr;": u"\u2190",
|
||||
@ -520,7 +802,7 @@ entities = {
|
||||
"quot": u"\u0022",
|
||||
"rArr;": u"\u21D2",
|
||||
"radic;": u"\u221A",
|
||||
"rang;": u"\u3009",
|
||||
"rang;": u"\u27E9",
|
||||
"raquo;": u"\u00BB",
|
||||
"raquo": u"\u00BB",
|
||||
"rarr;": u"\u2192",
|
||||
@ -596,221 +878,255 @@ entities = {
|
||||
"zwnj;": u"\u200C"
|
||||
}
|
||||
|
||||
encodings = frozenset((
|
||||
"ansi_x3.4-1968",
|
||||
"iso-ir-6",
|
||||
"ansi_x3.4-1986",
|
||||
"iso_646.irv:1991",
|
||||
"ascii",
|
||||
"iso646-us",
|
||||
"us-ascii",
|
||||
"us",
|
||||
"ibm367",
|
||||
"cp367",
|
||||
"csascii",
|
||||
"ks_c_5601-1987",
|
||||
"korean",
|
||||
"iso-2022-kr",
|
||||
"csiso2022kr",
|
||||
"euc-kr",
|
||||
"iso-2022-jp",
|
||||
"csiso2022jp",
|
||||
"iso-2022-jp-2",
|
||||
"iso-ir-58",
|
||||
"chinese",
|
||||
"csiso58gb231280",
|
||||
"iso_8859-1:1987",
|
||||
"iso-ir-100",
|
||||
"iso_8859-1",
|
||||
"iso-8859-1",
|
||||
"latin1",
|
||||
"l1",
|
||||
"ibm819",
|
||||
"cp819",
|
||||
"csisolatin1",
|
||||
"iso_8859-2:1987",
|
||||
"iso-ir-101",
|
||||
"iso_8859-2",
|
||||
"iso-8859-2",
|
||||
"latin2",
|
||||
"l2",
|
||||
"csisolatin2",
|
||||
"iso_8859-3:1988",
|
||||
"iso-ir-109",
|
||||
"iso_8859-3",
|
||||
"iso-8859-3",
|
||||
"latin3",
|
||||
"l3",
|
||||
"csisolatin3",
|
||||
"iso_8859-4:1988",
|
||||
"iso-ir-110",
|
||||
"iso_8859-4",
|
||||
"iso-8859-4",
|
||||
"latin4",
|
||||
"l4",
|
||||
"csisolatin4",
|
||||
"iso_8859-6:1987",
|
||||
"iso-ir-127",
|
||||
"iso_8859-6",
|
||||
"iso-8859-6",
|
||||
"ecma-114",
|
||||
"asmo-708",
|
||||
"arabic",
|
||||
"csisolatinarabic",
|
||||
"iso_8859-7:1987",
|
||||
"iso-ir-126",
|
||||
"iso_8859-7",
|
||||
"iso-8859-7",
|
||||
"elot_928",
|
||||
"ecma-118",
|
||||
"greek",
|
||||
"greek8",
|
||||
"csisolatingreek",
|
||||
"iso_8859-8:1988",
|
||||
"iso-ir-138",
|
||||
"iso_8859-8",
|
||||
"iso-8859-8",
|
||||
"hebrew",
|
||||
"csisolatinhebrew",
|
||||
"iso_8859-5:1988",
|
||||
"iso-ir-144",
|
||||
"iso_8859-5",
|
||||
"iso-8859-5",
|
||||
"cyrillic",
|
||||
"csisolatincyrillic",
|
||||
"iso_8859-9:1989",
|
||||
"iso-ir-148",
|
||||
"iso_8859-9",
|
||||
"iso-8859-9",
|
||||
"latin5",
|
||||
"l5",
|
||||
"csisolatin5",
|
||||
"iso-8859-10",
|
||||
"iso-ir-157",
|
||||
"l6",
|
||||
"iso_8859-10:1992",
|
||||
"csisolatin6",
|
||||
"latin6",
|
||||
"hp-roman8",
|
||||
"roman8",
|
||||
"r8",
|
||||
"ibm037",
|
||||
"cp037",
|
||||
"csibm037",
|
||||
"ibm424",
|
||||
"cp424",
|
||||
"csibm424",
|
||||
"ibm437",
|
||||
"cp437",
|
||||
"437",
|
||||
"cspc8codepage437",
|
||||
"ibm500",
|
||||
"cp500",
|
||||
"csibm500",
|
||||
"ibm775",
|
||||
"cp775",
|
||||
"cspc775baltic",
|
||||
"ibm850",
|
||||
"cp850",
|
||||
"850",
|
||||
"cspc850multilingual",
|
||||
"ibm852",
|
||||
"cp852",
|
||||
"852",
|
||||
"cspcp852",
|
||||
"ibm855",
|
||||
"cp855",
|
||||
"855",
|
||||
"csibm855",
|
||||
"ibm857",
|
||||
"cp857",
|
||||
"857",
|
||||
"csibm857",
|
||||
"ibm860",
|
||||
"cp860",
|
||||
"860",
|
||||
"csibm860",
|
||||
"ibm861",
|
||||
"cp861",
|
||||
"861",
|
||||
"cp-is",
|
||||
"csibm861",
|
||||
"ibm862",
|
||||
"cp862",
|
||||
"862",
|
||||
"cspc862latinhebrew",
|
||||
"ibm863",
|
||||
"cp863",
|
||||
"863",
|
||||
"csibm863",
|
||||
"ibm864",
|
||||
"cp864",
|
||||
"csibm864",
|
||||
"ibm865",
|
||||
"cp865",
|
||||
"865",
|
||||
"csibm865",
|
||||
"ibm866",
|
||||
"cp866",
|
||||
"866",
|
||||
"csibm866",
|
||||
"ibm869",
|
||||
"cp869",
|
||||
"869",
|
||||
"cp-gr",
|
||||
"csibm869",
|
||||
"ibm1026",
|
||||
"cp1026",
|
||||
"csibm1026",
|
||||
"koi8-r",
|
||||
"cskoi8r",
|
||||
"koi8-u",
|
||||
"big5-hkscs",
|
||||
"ptcp154",
|
||||
"csptcp154",
|
||||
"pt154",
|
||||
"cp154",
|
||||
"utf-7",
|
||||
"utf-16be",
|
||||
"utf-16le",
|
||||
"utf-16",
|
||||
"utf-8",
|
||||
"iso-8859-13",
|
||||
"iso-8859-14",
|
||||
"iso-ir-199",
|
||||
"iso_8859-14:1998",
|
||||
"iso_8859-14",
|
||||
"latin8",
|
||||
"iso-celtic",
|
||||
"l8",
|
||||
"iso-8859-15",
|
||||
"iso_8859-15",
|
||||
"iso-8859-16",
|
||||
"iso-ir-226",
|
||||
"iso_8859-16:2001",
|
||||
"iso_8859-16",
|
||||
"latin10",
|
||||
"l10",
|
||||
"gbk",
|
||||
"cp936",
|
||||
"ms936",
|
||||
"gb18030",
|
||||
"shift_jis",
|
||||
"ms_kanji",
|
||||
"csshiftjis",
|
||||
"euc-jp",
|
||||
"gb2312",
|
||||
"big5",
|
||||
"csbig5",
|
||||
"windows-1250",
|
||||
"windows-1251",
|
||||
"windows-1252",
|
||||
"windows-1253",
|
||||
"windows-1254",
|
||||
"windows-1255",
|
||||
"windows-1256",
|
||||
"windows-1257",
|
||||
"windows-1258",
|
||||
"tis-620",
|
||||
"hz-gb-2312",
|
||||
))
|
||||
encodings = {
|
||||
'437': 'cp437',
|
||||
'850': 'cp850',
|
||||
'852': 'cp852',
|
||||
'855': 'cp855',
|
||||
'857': 'cp857',
|
||||
'860': 'cp860',
|
||||
'861': 'cp861',
|
||||
'862': 'cp862',
|
||||
'863': 'cp863',
|
||||
'865': 'cp865',
|
||||
'866': 'cp866',
|
||||
'869': 'cp869',
|
||||
'ansix341968': 'ascii',
|
||||
'ansix341986': 'ascii',
|
||||
'arabic': 'iso8859-6',
|
||||
'ascii': 'ascii',
|
||||
'asmo708': 'iso8859-6',
|
||||
'big5': 'big5',
|
||||
'big5hkscs': 'big5hkscs',
|
||||
'chinese': 'gbk',
|
||||
'cp037': 'cp037',
|
||||
'cp1026': 'cp1026',
|
||||
'cp154': 'ptcp154',
|
||||
'cp367': 'ascii',
|
||||
'cp424': 'cp424',
|
||||
'cp437': 'cp437',
|
||||
'cp500': 'cp500',
|
||||
'cp775': 'cp775',
|
||||
'cp819': 'windows-1252',
|
||||
'cp850': 'cp850',
|
||||
'cp852': 'cp852',
|
||||
'cp855': 'cp855',
|
||||
'cp857': 'cp857',
|
||||
'cp860': 'cp860',
|
||||
'cp861': 'cp861',
|
||||
'cp862': 'cp862',
|
||||
'cp863': 'cp863',
|
||||
'cp864': 'cp864',
|
||||
'cp865': 'cp865',
|
||||
'cp866': 'cp866',
|
||||
'cp869': 'cp869',
|
||||
'cp936': 'gbk',
|
||||
'cpgr': 'cp869',
|
||||
'cpis': 'cp861',
|
||||
'csascii': 'ascii',
|
||||
'csbig5': 'big5',
|
||||
'cseuckr': 'cp949',
|
||||
'cseucpkdfmtjapanese': 'euc_jp',
|
||||
'csgb2312': 'gbk',
|
||||
'cshproman8': 'hp-roman8',
|
||||
'csibm037': 'cp037',
|
||||
'csibm1026': 'cp1026',
|
||||
'csibm424': 'cp424',
|
||||
'csibm500': 'cp500',
|
||||
'csibm855': 'cp855',
|
||||
'csibm857': 'cp857',
|
||||
'csibm860': 'cp860',
|
||||
'csibm861': 'cp861',
|
||||
'csibm863': 'cp863',
|
||||
'csibm864': 'cp864',
|
||||
'csibm865': 'cp865',
|
||||
'csibm866': 'cp866',
|
||||
'csibm869': 'cp869',
|
||||
'csiso2022jp': 'iso2022_jp',
|
||||
'csiso2022jp2': 'iso2022_jp_2',
|
||||
'csiso2022kr': 'iso2022_kr',
|
||||
'csiso58gb231280': 'gbk',
|
||||
'csisolatin1': 'windows-1252',
|
||||
'csisolatin2': 'iso8859-2',
|
||||
'csisolatin3': 'iso8859-3',
|
||||
'csisolatin4': 'iso8859-4',
|
||||
'csisolatin5': 'windows-1254',
|
||||
'csisolatin6': 'iso8859-10',
|
||||
'csisolatinarabic': 'iso8859-6',
|
||||
'csisolatincyrillic': 'iso8859-5',
|
||||
'csisolatingreek': 'iso8859-7',
|
||||
'csisolatinhebrew': 'iso8859-8',
|
||||
'cskoi8r': 'koi8-r',
|
||||
'csksc56011987': 'cp949',
|
||||
'cspc775baltic': 'cp775',
|
||||
'cspc850multilingual': 'cp850',
|
||||
'cspc862latinhebrew': 'cp862',
|
||||
'cspc8codepage437': 'cp437',
|
||||
'cspcp852': 'cp852',
|
||||
'csptcp154': 'ptcp154',
|
||||
'csshiftjis': 'shift_jis',
|
||||
'csunicode11utf7': 'utf-7',
|
||||
'cyrillic': 'iso8859-5',
|
||||
'cyrillicasian': 'ptcp154',
|
||||
'ebcdiccpbe': 'cp500',
|
||||
'ebcdiccpca': 'cp037',
|
||||
'ebcdiccpch': 'cp500',
|
||||
'ebcdiccphe': 'cp424',
|
||||
'ebcdiccpnl': 'cp037',
|
||||
'ebcdiccpus': 'cp037',
|
||||
'ebcdiccpwt': 'cp037',
|
||||
'ecma114': 'iso8859-6',
|
||||
'ecma118': 'iso8859-7',
|
||||
'elot928': 'iso8859-7',
|
||||
'eucjp': 'euc_jp',
|
||||
'euckr': 'cp949',
|
||||
'extendedunixcodepackedformatforjapanese': 'euc_jp',
|
||||
'gb18030': 'gb18030',
|
||||
'gb2312': 'gbk',
|
||||
'gb231280': 'gbk',
|
||||
'gbk': 'gbk',
|
||||
'greek': 'iso8859-7',
|
||||
'greek8': 'iso8859-7',
|
||||
'hebrew': 'iso8859-8',
|
||||
'hproman8': 'hp-roman8',
|
||||
'hzgb2312': 'hz',
|
||||
'ibm037': 'cp037',
|
||||
'ibm1026': 'cp1026',
|
||||
'ibm367': 'ascii',
|
||||
'ibm424': 'cp424',
|
||||
'ibm437': 'cp437',
|
||||
'ibm500': 'cp500',
|
||||
'ibm775': 'cp775',
|
||||
'ibm819': 'windows-1252',
|
||||
'ibm850': 'cp850',
|
||||
'ibm852': 'cp852',
|
||||
'ibm855': 'cp855',
|
||||
'ibm857': 'cp857',
|
||||
'ibm860': 'cp860',
|
||||
'ibm861': 'cp861',
|
||||
'ibm862': 'cp862',
|
||||
'ibm863': 'cp863',
|
||||
'ibm864': 'cp864',
|
||||
'ibm865': 'cp865',
|
||||
'ibm866': 'cp866',
|
||||
'ibm869': 'cp869',
|
||||
'iso2022jp': 'iso2022_jp',
|
||||
'iso2022jp2': 'iso2022_jp_2',
|
||||
'iso2022kr': 'iso2022_kr',
|
||||
'iso646irv1991': 'ascii',
|
||||
'iso646us': 'ascii',
|
||||
'iso88591': 'windows-1252',
|
||||
'iso885910': 'iso8859-10',
|
||||
'iso8859101992': 'iso8859-10',
|
||||
'iso885911987': 'windows-1252',
|
||||
'iso885913': 'iso8859-13',
|
||||
'iso885914': 'iso8859-14',
|
||||
'iso8859141998': 'iso8859-14',
|
||||
'iso885915': 'iso8859-15',
|
||||
'iso885916': 'iso8859-16',
|
||||
'iso8859162001': 'iso8859-16',
|
||||
'iso88592': 'iso8859-2',
|
||||
'iso885921987': 'iso8859-2',
|
||||
'iso88593': 'iso8859-3',
|
||||
'iso885931988': 'iso8859-3',
|
||||
'iso88594': 'iso8859-4',
|
||||
'iso885941988': 'iso8859-4',
|
||||
'iso88595': 'iso8859-5',
|
||||
'iso885951988': 'iso8859-5',
|
||||
'iso88596': 'iso8859-6',
|
||||
'iso885961987': 'iso8859-6',
|
||||
'iso88597': 'iso8859-7',
|
||||
'iso885971987': 'iso8859-7',
|
||||
'iso88598': 'iso8859-8',
|
||||
'iso885981988': 'iso8859-8',
|
||||
'iso88599': 'windows-1254',
|
||||
'iso885991989': 'windows-1254',
|
||||
'isoceltic': 'iso8859-14',
|
||||
'isoir100': 'windows-1252',
|
||||
'isoir101': 'iso8859-2',
|
||||
'isoir109': 'iso8859-3',
|
||||
'isoir110': 'iso8859-4',
|
||||
'isoir126': 'iso8859-7',
|
||||
'isoir127': 'iso8859-6',
|
||||
'isoir138': 'iso8859-8',
|
||||
'isoir144': 'iso8859-5',
|
||||
'isoir148': 'windows-1254',
|
||||
'isoir149': 'cp949',
|
||||
'isoir157': 'iso8859-10',
|
||||
'isoir199': 'iso8859-14',
|
||||
'isoir226': 'iso8859-16',
|
||||
'isoir58': 'gbk',
|
||||
'isoir6': 'ascii',
|
||||
'koi8r': 'koi8-r',
|
||||
'koi8u': 'koi8-u',
|
||||
'korean': 'cp949',
|
||||
'ksc5601': 'cp949',
|
||||
'ksc56011987': 'cp949',
|
||||
'ksc56011989': 'cp949',
|
||||
'l1': 'windows-1252',
|
||||
'l10': 'iso8859-16',
|
||||
'l2': 'iso8859-2',
|
||||
'l3': 'iso8859-3',
|
||||
'l4': 'iso8859-4',
|
||||
'l5': 'windows-1254',
|
||||
'l6': 'iso8859-10',
|
||||
'l8': 'iso8859-14',
|
||||
'latin1': 'windows-1252',
|
||||
'latin10': 'iso8859-16',
|
||||
'latin2': 'iso8859-2',
|
||||
'latin3': 'iso8859-3',
|
||||
'latin4': 'iso8859-4',
|
||||
'latin5': 'windows-1254',
|
||||
'latin6': 'iso8859-10',
|
||||
'latin8': 'iso8859-14',
|
||||
'latin9': 'iso8859-15',
|
||||
'ms936': 'gbk',
|
||||
'mskanji': 'shift_jis',
|
||||
'pt154': 'ptcp154',
|
||||
'ptcp154': 'ptcp154',
|
||||
'r8': 'hp-roman8',
|
||||
'roman8': 'hp-roman8',
|
||||
'shiftjis': 'shift_jis',
|
||||
'tis620': 'cp874',
|
||||
'unicode11utf7': 'utf-7',
|
||||
'us': 'ascii',
|
||||
'usascii': 'ascii',
|
||||
'utf16': 'utf-16',
|
||||
'utf16be': 'utf-16-be',
|
||||
'utf16le': 'utf-16-le',
|
||||
'utf8': 'utf-8',
|
||||
'windows1250': 'cp1250',
|
||||
'windows1251': 'cp1251',
|
||||
'windows1252': 'cp1252',
|
||||
'windows1253': 'cp1253',
|
||||
'windows1254': 'cp1254',
|
||||
'windows1255': 'cp1255',
|
||||
'windows1256': 'cp1256',
|
||||
'windows1257': 'cp1257',
|
||||
'windows1258': 'cp1258',
|
||||
'windows936': 'gbk',
|
||||
'x-x-big5': 'big5'}
|
||||
|
||||
# Integer codes for each kind of token, keyed by token-type name.
tokenTypes = {
    "Doctype": 0,
    "Characters": 1,
    "SpaceCharacters": 2,
    "StartTag": 3,
    "EndTag": 4,
    "EmptyTag": 5,
    "Comment": 6,
    "ParseError": 7,
}

# The subset of token-type codes that represent tags (start, end, empty),
# collected in a frozenset for cheap membership tests.
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]))
|
||||
|
||||
|
||||
# Reverse lookup of ``namespaces``: namespace URI -> prefix.
# ``items()`` (rather than the Python-2-only ``iteritems()``) keeps this line
# working on both Python 2 and Python 3; the list-of-pairs form is kept so no
# newer syntax is required.
prefixes = dict([(uri, prefix) for prefix, uri in namespaces.items()])
# ``namespaces`` registers MathML under the key "mathml"; the reverse map
# deliberately reports the shorter "math" prefix for that URI instead.
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
|
||||
|
||||
class DataLossWarning(UserWarning):
    """Warning category for operations that discard part of the input.

    NOTE(review): no emitter is visible in this part of the file;
    presumably used where a coercion drops data -- confirm against callers.
    """
|
||||
|
||||
class ReparseException(Exception):
    """Signals that the input has to be processed again from the start.

    NOTE(review): presumably raised when the detected character encoding
    changes part-way through a parse -- confirm against the raiser.
    """
|
||||
|
127
planet/vendor/html5lib/filters/formfiller.py
vendored
Normal file
127
planet/vendor/html5lib/filters/formfiller.py
vendored
Normal file
@ -0,0 +1,127 @@
|
||||
#
|
||||
# The goal is to finally have a form filler where you pass data for
|
||||
# each form, using the algorithm for "Seeding a form with initial values"
|
||||
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
|
||||
#
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class SimpleFilter(_base.Filter):
    """Token-stream filter that seeds form controls with supplied values.

    Walks the tokens of ``source`` and rewrites ``input``, ``textarea`` and
    ``select``/``option`` tokens so that they carry the values found in
    ``fieldStorage``.  When a field name has several values they are
    consumed in document order, one per matching control.

    ``fieldStorage`` only needs a ``getlist(name)`` method returning the
    list of values for ``name`` (presumably a ``cgi.FieldStorage``-like
    object -- confirm against callers).
    """

    def __init__(self, source, fieldStorage):
        _base.Filter.__init__(self, source)
        self.fieldStorage = fieldStorage

    def __iter__(self):
        # Next unused value index for each field name.
        field_indices = {}
        # Kind/name of the <textarea>/<select> currently open, if any.
        # Both must exist before the first token is examined, because the
        # branches below read them for every token.
        field_type = None
        field_name = None
        for token in _base.Filter.__iter__(self):
            token_type = token["type"]
            if token_type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    input_value_index = -1
                    input_checked_index = -1
                    for i, (n, v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i

                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""

                    if field_type in (u"checkbox", u"radio"):
                        if value_list:
                            # Only compare when a value= attribute really
                            # exists; index -1 would silently pick the
                            # last, unrelated attribute.
                            matches = (
                                input_value_index >= 0 and
                                token["data"][input_value_index][1] == value)
                            if matches:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                del token["data"][input_checked_index]

                    elif field_type not in (u"button", u"submit", u"reset"):
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1

                    # <input> has no content, so nothing stays open.
                    field_type = None
                    field_name = None

                elif name == "textarea":
                    field_type = "textarea"
                    # Reversed so that the first occurrence of a duplicated
                    # attribute wins; a nameless textarea is left alone.
                    field_name = dict((token["data"])[::-1]).get("name")

                elif name == "select":
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False

                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i, (n, v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]

            elif field_type is not None and field_name and token_type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # Emit the seeded text just before </textarea>.
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1

                    # The control is closed: forget both name and kind so
                    # that later tokens are not mistaken for its content.
                    field_name = None
                    field_type = None

                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing

            elif field_type == "textarea" and field_name:
                continue # ignore token: original content of a named textarea

            yield token
|
53
planet/vendor/html5lib/filters/optionaltags.py
vendored
53
planet/vendor/html5lib/filters/optionaltags.py
vendored
@ -14,7 +14,8 @@ class Filter(_base.Filter):
|
||||
for previous, token, next in self.slider():
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["data"] or not self.is_optional_start(token["name"], previous, next):
|
||||
if (token["data"] or
|
||||
not self.is_optional_start(token["name"], previous, next)):
|
||||
yield token
|
||||
elif type == "EndTag":
|
||||
if not self.is_optional_end(token["name"], next):
|
||||
@ -31,7 +32,11 @@ class Filter(_base.Filter):
|
||||
elif tagname == 'head':
|
||||
# A head element's start tag may be omitted if the first thing
|
||||
# inside the head element is an element.
|
||||
return type == "StartTag"
|
||||
# XXX: we also omit the start tag if the head element is empty
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return True
|
||||
elif type == "EndTag":
|
||||
return next["name"] == "head"
|
||||
elif tagname == 'body':
|
||||
# A body element's start tag may be omitted if the first thing
|
||||
# inside the body element is not a space character or a comment,
|
||||
@ -52,7 +57,7 @@ class Filter(_base.Filter):
|
||||
# inside the colgroup element is a col element, and if the element
|
||||
# is not immediately preceeded by another colgroup element whose
|
||||
# end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
# XXX: we do not look at the preceding event, so instead we never
|
||||
# omit the colgroup element's end tag when it is immediately
|
||||
# followed by another colgroup element. See is_optional_end.
|
||||
@ -81,16 +86,13 @@ class Filter(_base.Filter):
|
||||
# An html element's end tag may be omitted if the html element
|
||||
# is not immediately followed by a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname in ('li', 'optgroup', 'option', 'tr'):
|
||||
elif tagname in ('li', 'optgroup', 'tr'):
|
||||
# A li element's end tag may be omitted if the li element is
|
||||
# immediately followed by another li element or if there is
|
||||
# no more content in the parent element.
|
||||
# An optgroup element's end tag may be omitted if the optgroup
|
||||
# element is immediately followed by another optgroup element,
|
||||
# or if there is no more content in the parent element.
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if there is no more content in the parent element.
|
||||
# A tr element's end tag may be omitted if the tr element is
|
||||
# immediately followed by another tr element, or if there is
|
||||
# no more content in the parent element.
|
||||
@ -112,14 +114,39 @@ class Filter(_base.Filter):
|
||||
return False
|
||||
elif tagname == 'p':
|
||||
# A p element's end tag may be omitted if the p element is
|
||||
# immediately followed by an address, blockquote, dl, fieldset,
|
||||
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
|
||||
# or ul element, or if there is no more content in the parent
|
||||
# immediately followed by an address, article, aside,
|
||||
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
||||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
||||
# nav, ol, p, pre, section, table, or ul, element, or if
|
||||
# there is no more content in the parent element.
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
return next["name"] in ('address', 'article', 'aside',
|
||||
'blockquote', 'datagrid', 'dialog',
|
||||
'dir', 'div', 'dl', 'fieldset', 'footer',
|
||||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'header', 'hr', 'menu', 'nav', 'ol',
|
||||
'p', 'pre', 'section', 'table', 'ul')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'option':
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if it is immediately followed by an <code>optgroup</code>
|
||||
# element, or if there is no more content in the parent
|
||||
# element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('address', 'blockquote', \
|
||||
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
|
||||
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
|
||||
return next["name"] in ('option', 'optgroup')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('rt', 'rp'):
|
||||
# An rt element's end tag may be omitted if the rt element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
# An rp element's end tag may be omitted if the rp element is
|
||||
# immediately followed by an rt or rp element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('rt', 'rp')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'colgroup':
|
||||
|
8
planet/vendor/html5lib/filters/sanitizer.py
vendored
Normal file
8
planet/vendor/html5lib/filters/sanitizer.py
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
import _base
|
||||
from html5lib.sanitizer import HTMLSanitizerMixin
|
||||
|
||||
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Token-stream filter that runs every token through ``sanitize_token``."""

    def __iter__(self):
        # Tokens for which sanitize_token returns a false value are dropped.
        for raw_token in _base.Filter.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
|
2155
planet/vendor/html5lib/html5parser.py
vendored
2155
planet/vendor/html5lib/html5parser.py
vendored
File diff suppressed because it is too large
Load Diff
170
planet/vendor/html5lib/ihatexml.py
vendored
Normal file
170
planet/vendor/html5lib/ihatexml.py
vendored
Normal file
@ -0,0 +1,170 @@
|
||||
import re
|
||||
|
||||
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||
|
||||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||
|
||||
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
|
||||
|
||||
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||
|
||||
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||
|
||||
letter = " | ".join([baseChar, ideographic])
|
||||
|
||||
#Without the colon (":"), which the XML Name production also permits
|
||||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||
extender])
|
||||
nameFirst = " | ".join([letter, "_"])
|
||||
|
||||
# A single code point written as "#xHHHH" (four upper-case hex digits).
# The character class deliberately has no "|": inside [...] a bar is a
# literal character, not alternation.
reChar = re.compile(r"#x([\dA-F]{4})")

# An inclusive range written as "[#xHHHH-#xHHHH]".
reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
|
||||
|
||||
def charStringToList(chars):
    """Parse a " | "-separated character-class description.

    Each item is a "#xHHHH" code point, a "[#xHHHH-#xHHHH]" range or a
    single literal character.  Returns the normalised list of
    ``[first, last]`` code-point pairs produced by ``normaliseCharList``.
    """
    rv = []
    for piece in chars.split(" | "):
        piece = piece.strip()
        # Try the single-code-point form first, then the range form.
        match = reChar.match(piece)
        if match is None:
            match = reCharRange.match(piece)
        if match is None:
            # Not hex notation: must be one literal character.
            assert len(piece) == 1
            rv.append([ord(piece)] * 2)
            continue
        bounds = [hexToInt(group) for group in match.groups()]
        if len(bounds) == 1:
            # A lone code point is the range [c, c].
            bounds = bounds * 2
        rv.append(bounds)
    return normaliseCharList(rv)
|
||||
|
||||
def normaliseCharList(charList):
    """Return ``charList`` sorted with overlapping/adjacent ranges merged.

    ``charList`` is a sequence of ``[first, last]`` inclusive code-point
    pairs.  The result is a new list of new two-element lists; the input
    is not modified.  Raises AssertionError if a pair has ``last < first``.
    """
    ordered = sorted(charList)
    for item in ordered:
        assert item[1] >= item[0]
    rv = []
    for item in ordered:
        first, last = item[0], item[1]
        if rv and first <= rv[-1][1] + 1:
            # Overlaps or touches the previous range: extend it, but never
            # shrink it when this range is fully contained in the previous.
            if last > rv[-1][1]:
                rv[-1][1] = last
        else:
            rv.append([first, last])
    return rv
|
||||
|
||||
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)

def missingRanges(charList):
    """Return the ranges in 0..max_unicode that ``charList`` leaves out.

    ``charList`` must be a sorted list of non-overlapping, non-adjacent
    ``[first, last]`` pairs (the shape ``normaliseCharList`` returns).
    An empty ``charList`` yields the whole ``[0, max_unicode]`` range.
    """
    if not charList:
        return [[0, max_unicode]]
    rv = []
    # Gap before the first range, if it does not start at code point 0.
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    # Gaps between consecutive ranges.
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1]+1, charList[i+1][0] - 1])
    # Gap after the last range, if it stops short of max_unicode.
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
|
||||
|
||||
def listToRegexpStr(charList):
    """Build a regexp character class ("[...]") from ``[first, last]`` pairs.

    Single-character ranges are emitted as the character itself, others as
    "first-last".  The pieces are concatenated directly: inside a character
    class "|" is a literal, so using it as a separator would add "|" to
    the set of matched characters.
    """
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(intToUnicodeStr(item[0]))
        else:
            rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
    return "[%s]" % "".join(rv)
|
||||
|
||||
def hexToInt(hex_str):
    """Return the integer value of the hexadecimal string ``hex_str``."""
    value = int(hex_str, 16)
    return value
|
||||
|
||||
def intToUnicodeStr(intValue):
    """Return the character for code point ``intValue``, regexp-escaped.

    Uses ``unichr`` directly rather than eval()-ing a generated unicode
    literal; the result is passed through ``escapeRegexp``.
    """
    return escapeRegexp(unichr(intValue))
|
||||
|
||||
def escapeRegexp(string):
    """Return ``string`` with regexp metacharacters backslash-escaped.

    Each special character is prefixed with exactly one backslash.
    """
    # Backslash is handled first so that the backslashes inserted for the
    # other characters are not themselves escaped a second time.
    specialCharacters = ("\\", ".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)

    return string
|
||||
|
||||
#output from the above
|
||||
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
|
||||
|
||||
class InfosetFilter(object):
    """Coerces names, comments and character data into XML-safe forms.

    Characters matched by ``replaceCharsRegexp`` in a name are replaced by
    an escape of the form "U" + five upper-case hex digits (see
    ``escapeChar``); ``fromXmlName`` reverses that substitution.
    """

    # Matches the escapes produced by escapeChar.
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self, replaceChars = None,
                 replaceRanges = None,
                 dropXmlnsLocalName = False,
                 dropXmlnsAttrNs = False,
                 preventDoubleDashComments = False,
                 preventDashAtCommentEnd = False,
                 replaceFormFeedCharacters = True):
        """Configure the filter.

        replaceChars / replaceRanges -- custom replacement sets; not
            supported, passing either raises NotImplementedError.
        dropXmlnsLocalName -- drop attributes whose name starts "xmlns:".
        dropXmlnsAttrNs -- drop attributes in the xmlns namespace.
        preventDoubleDashComments -- break up "--" inside comments.
        preventDashAtCommentEnd -- keep comments from ending in "-".
        replaceFormFeedCharacters -- turn U+000C in text into a space.
        """
        if replaceRanges is not None or replaceChars is not None:
            raise NotImplementedError
        else:
            self.replaceCharsRegexp = nonXmlBMPRegexp

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        # Maps an offending character to its "UXXXXX" escape.
        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        """Return the XML-safe attribute name, or None to drop the attribute."""
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            #Need a datalosswarning here
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name, namespace=None):
        """Return the XML-safe form of an element name."""
        return self.toXmlName(name)

    def coerceComment(self, data):
        """Return comment text adjusted according to the comment options."""
        if self.preventDoubleDashComments:
            # Repeat until stable: "---" needs more than one pass.
            while "--" in data:
                data = data.replace("--", "- -")
        if self.preventDashAtCommentEnd and data.endswith("-"):
            # A trailing "-" would run into the closing "-->".
            data += " "
        return data

    def coerceCharacters(self, data):
        """Return character data with form feeds replaced, if enabled."""
        if self.replaceFormFeedCharacters:
            data = data.replace("\x0C", " ")
        #Other non-xml characters
        return data

    def toXmlName(self, name):
        """Replace every character matched by replaceCharsRegexp in ``name``."""
        replaceChars = set(self.replaceCharsRegexp.findall(name))
        for char in replaceChars:
            if char in self.replaceCache:
                replacement = self.replaceCache[char]
            else:
                replacement = self.escapeChar(char)
            name = name.replace(char, replacement)
        return name

    def fromXmlName(self, name):
        """Undo toXmlName: turn each "UXXXXX" escape back into its character."""
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        """Return (and cache) the "UXXXXX" escape for ``char``."""
        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        """Return the character encoded by a "UXXXXX" escape."""
        return unichr(int(charcode[1:], 16))
|
616
planet/vendor/html5lib/inputstream.py
vendored
616
planet/vendor/html5lib/inputstream.py
vendored
@ -1,15 +1,109 @@
|
||||
import codecs
|
||||
import re
|
||||
import types
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
import sys
|
||||
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings
|
||||
from utils import MethodDispatcher
|
||||
from constants import encodings, ReparseException
|
||||
|
||||
class HTMLInputStream(object):
|
||||
#Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
|
||||
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
|
||||
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
|
||||
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
|
||||
|
||||
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||
|
||||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
||||
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
||||
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||
0x10FFFE, 0x10FFFF])
|
||||
|
||||
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
||||
|
||||
# Cache for charsUntil()
|
||||
charsUntilRegEx = {}
|
||||
|
||||
class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)

    Everything read from the wrapped stream is kept, so ``seek`` can move
    back to any position that has already been read.
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1,0] #chunk number, offset

    def tell(self):
        """Return the current position as an offset from the stream start."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Move to absolute offset ``pos`` within the already-buffered data.

        ``pos`` may be anything from 0 up to and including the number of
        bytes buffered so far; anything beyond that fails the assertion.
        """
        assert pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing has been read yet, so the only valid target is the
            # initial position.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            # Skip one whole chunk: subtract that chunk's length.
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to ``bytes`` items, from the buffer first, then the stream."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total length of all buffered chunks."""
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read from the wrapped stream, remember the chunk, and return it."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request ends inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and move on to the next one.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
|
||||
|
||||
|
||||
|
||||
class HTMLInputStream:
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
@ -17,11 +111,13 @@ class HTMLInputStream(object):
|
||||
|
||||
"""
|
||||
|
||||
_defaultChunkSize = 10240
|
||||
|
||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by the HTML5Lib.
|
||||
for use by html5lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
@ -33,10 +129,17 @@ class HTMLInputStream(object):
|
||||
parseMeta - Look for a <meta> element containing encoding information
|
||||
|
||||
"""
|
||||
|
||||
#Craziness
|
||||
if len(u"\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
self.charEncoding = encoding
|
||||
self.charEncoding = (codecName(encoding), "certain")
|
||||
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
@ -52,17 +155,25 @@ class HTMLInputStream(object):
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
|
||||
if (self.charEncoding[0] is None):
|
||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||
|
||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
||||
'replace')
|
||||
|
||||
self.queue = []
|
||||
self.chunk = u""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
self.errors = []
|
||||
|
||||
self.line = self.col = 0
|
||||
self.lineLengths = []
|
||||
# number of (complete) lines in previous chunks
|
||||
self.prevNumLines = 0
|
||||
# number of columns in the last line of the previous chunk
|
||||
self.prevNumCols = 0
|
||||
|
||||
#Flag to indicate we may have a CR LF broken across a data chunk
|
||||
self._lastChunkEndsWithCR = False
|
||||
@ -80,22 +191,29 @@ class HTMLInputStream(object):
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
if isinstance(source, unicode):
|
||||
source = source.encode('utf-8')
|
||||
self.charEncoding = "utf-8"
|
||||
self.charEncoding = ("utf-8", "certain")
|
||||
import cStringIO
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
|
||||
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
|
||||
stream is sys.stdin):
|
||||
stream = BufferedStream(stream)
|
||||
|
||||
return stream
|
||||
|
||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||
|
||||
#First look for a BOM
|
||||
#This will also read past the BOM if present
|
||||
encoding = self.detectBOM()
|
||||
confidence = "certain"
|
||||
#If there is no BOM need to look for meta elements with encoding
|
||||
#information
|
||||
if encoding is None and parseMeta:
|
||||
encoding = self.detectEncodingMeta()
|
||||
confidence = "tentative"
|
||||
#Guess with chardet, if avaliable
|
||||
if encoding is None and chardet:
|
||||
confidence = "tentative"
|
||||
try:
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
buffers = []
|
||||
@ -108,11 +226,12 @@ class HTMLInputStream(object):
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = detector.result['encoding']
|
||||
self.seek("".join(buffers), 0)
|
||||
self.rawStream.seek(0)
|
||||
except ImportError:
|
||||
pass
|
||||
# If all else fails use the default encoding
|
||||
if encoding is None:
|
||||
confidence="tentative"
|
||||
encoding = self.defaultEncoding
|
||||
|
||||
#Substitute for equivalent encodings:
|
||||
@ -121,7 +240,21 @@ class HTMLInputStream(object):
|
||||
if encoding.lower() in encodingSub:
|
||||
encoding = encodingSub[encoding.lower()]
|
||||
|
||||
return encoding
|
||||
return encoding, confidence
|
||||
|
||||
def changeEncoding(self, newEncoding):
|
||||
newEncoding = codecName(newEncoding)
|
||||
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
newEncoding = "utf-8"
|
||||
if newEncoding is None:
|
||||
return
|
||||
elif newEncoding == self.charEncoding[0]:
|
||||
self.charEncoding = (self.charEncoding[0], "certain")
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
self.reset()
|
||||
self.charEncoding = (newEncoding, "certain")
|
||||
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
|
||||
|
||||
def detectBOM(self):
|
||||
"""Attempts to detect at BOM at the start of the stream. If
|
||||
@ -149,198 +282,219 @@ class HTMLInputStream(object):
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.seek(string, encoding and seek or 0)
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def seek(self, buffer, n):
|
||||
"""Unget buffer[n:]"""
|
||||
if hasattr(self.rawStream, 'unget'):
|
||||
self.rawStream.unget(buffer[n:])
|
||||
return
|
||||
|
||||
if hasattr(self.rawStream, 'seek'):
|
||||
try:
|
||||
self.rawStream.seek(n)
|
||||
return
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
class BufferedStream:
|
||||
def __init__(self, data, stream):
|
||||
self.data = data
|
||||
self.stream = stream
|
||||
def read(self, chars=-1):
|
||||
if chars == -1 or chars > len(self.data):
|
||||
result = self.data
|
||||
self.data = ''
|
||||
if chars == -1:
|
||||
return result + self.stream.read()
|
||||
else:
|
||||
return result + self.stream.read(chars-len(result))
|
||||
elif not self.data:
|
||||
return self.stream.read(chars)
|
||||
else:
|
||||
result = self.data[:chars]
|
||||
self.data = self.data[chars:]
|
||||
return result
|
||||
def unget(self, data):
|
||||
if self.data:
|
||||
self.data += data
|
||||
else:
|
||||
self.data = data
|
||||
|
||||
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
"""
|
||||
buffer = self.rawStream.read(self.numBytesMeta)
|
||||
parser = EncodingParser(buffer)
|
||||
self.seek(buffer, 0)
|
||||
return parser.getEncoding()
|
||||
self.rawStream.seek(0)
|
||||
encoding = parser.getEncoding()
|
||||
|
||||
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
encoding = "utf-8"
|
||||
|
||||
return encoding
|
||||
|
||||
def _position(self, offset):
|
||||
chunk = self.chunk
|
||||
nLines = chunk.count(u'\n', 0, offset)
|
||||
positionLine = self.prevNumLines + nLines
|
||||
lastLinePos = chunk.rfind(u'\n', 0, offset)
|
||||
if lastLinePos == -1:
|
||||
positionColumn = self.prevNumCols + offset
|
||||
else:
|
||||
positionColumn = offset - (lastLinePos + 1)
|
||||
return (positionLine, positionColumn)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
line, col = self.line, self.col
|
||||
line, col = self._position(self.chunkOffset)
|
||||
return (line+1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
if not self.queue:
|
||||
self.readChunk()
|
||||
#If we still don't have a character we have reached EOF
|
||||
if not self.queue:
|
||||
# Read a new chunk from the input stream if necessary
|
||||
if self.chunkOffset >= self.chunkSize:
|
||||
if not self.readChunk():
|
||||
return EOF
|
||||
|
||||
char = self.queue.pop(0)
|
||||
chunkOffset = self.chunkOffset
|
||||
char = self.chunk[chunkOffset]
|
||||
self.chunkOffset = chunkOffset + 1
|
||||
|
||||
# update position in stream
|
||||
if char == '\n':
|
||||
self.lineLengths.append(self.col)
|
||||
self.line += 1
|
||||
self.col = 0
|
||||
else:
|
||||
self.col += 1
|
||||
return char
|
||||
|
||||
def readChunk(self, chunkSize=10240):
|
||||
def readChunk(self, chunkSize=None):
|
||||
if chunkSize is None:
|
||||
chunkSize = self._defaultChunkSize
|
||||
|
||||
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
||||
|
||||
self.chunk = u""
|
||||
self.chunkSize = 0
|
||||
self.chunkOffset = 0
|
||||
|
||||
data = self.dataStream.read(chunkSize)
|
||||
|
||||
if not data:
|
||||
return
|
||||
#Replace null characters
|
||||
for i in xrange(data.count(u"\u0000")):
|
||||
self.errors.append(_('null character found in input stream, '
|
||||
'replaced with U+FFFD'))
|
||||
return False
|
||||
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
data = data.replace(u"\u0000", u"\ufffd")
|
||||
#Check for CR LF broken across chunks
|
||||
if (self._lastChunkEndsWithCR and data[0] == "\n"):
|
||||
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
|
||||
data = data[1:]
|
||||
self._lastChunkEndsWithCR = data[-1] == "\r"
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
# Stop if the chunk is now empty
|
||||
if not data:
|
||||
return False
|
||||
self._lastChunkEndsWithCR = data[-1] == u"\r"
|
||||
data = data.replace(u"\r\n", u"\n")
|
||||
data = data.replace(u"\r", u"\n")
|
||||
|
||||
data = unicode(data)
|
||||
self.queue.extend([char for char in data])
|
||||
self.chunk = data
|
||||
self.chunkSize = len(data)
|
||||
|
||||
return True
|
||||
|
||||
def characterErrorsUCS4(self, data):
|
||||
for i in xrange(data.count(u"\u0000")):
|
||||
self.errors.append("null-character")
|
||||
for i in xrange(len(invalid_unicode_re.findall(data))):
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def characterErrorsUCS2(self, data):
|
||||
#Someone picked the wrong compile option
|
||||
#You lose
|
||||
for i in xrange(data.count(u"\u0000")):
|
||||
self.errors.append("null-character")
|
||||
skip = False
|
||||
import sys
|
||||
for match in invalid_unicode_re.finditer(data):
|
||||
if skip:
|
||||
continue
|
||||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
#Pretty sure there should be endianness issues here
|
||||
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
|
||||
pos < len(data) - 1 and
|
||||
ord(data[pos + 1]) >= 0xDC00 and
|
||||
ord(data[pos + 1]) <= 0xDFFF):
|
||||
#We have a surrogate pair!
|
||||
#From a perl manpage
|
||||
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
|
||||
(ord(data[pos + 1]) - 0xDC00))
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
|
||||
pos == len(data) - 1):
|
||||
self.errors.append("invalid-codepoint")
|
||||
else:
|
||||
skip = False
|
||||
self.errors.append("invalid-codepoint")
|
||||
#This is still wrong if it is possible for a surrogate pair to break a
|
||||
#chunk boundary
|
||||
|
||||
def charsUntil(self, characters, opposite = False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in characters or EOF. characters can be
|
||||
any container that supports the in method being called on it.
|
||||
including any character in 'characters' or EOF. 'characters' must be
|
||||
a container that supports the 'in' method and iteration over its
|
||||
characters.
|
||||
"""
|
||||
|
||||
#This method is currently 40-50% of our total runtime and badly needs
|
||||
#optimizing
|
||||
#Possible improvements:
|
||||
# - use regexp to find characters that match the required character set
|
||||
# (with regexp cache since we do the same searches many many times)
|
||||
# - improve EOF handling for fewer if statements
|
||||
# Use a cache of regexps to find the required characters
|
||||
try:
|
||||
chars = charsUntilRegEx[(characters, opposite)]
|
||||
except KeyError:
|
||||
if __debug__:
|
||||
for c in characters:
|
||||
assert(ord(c) < 128)
|
||||
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
|
||||
if not opposite:
|
||||
regex = u"^%s" % regex
|
||||
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
|
||||
|
||||
if not self.queue:
|
||||
self.readChunk()
|
||||
#Break if we have reached EOF
|
||||
if not self.queue or self.queue[0] == None:
|
||||
return u""
|
||||
rv = []
|
||||
|
||||
i = 0
|
||||
while (self.queue[i] in characters) == opposite:
|
||||
i += 1
|
||||
if i == len(self.queue):
|
||||
self.readChunk()
|
||||
#If the queue doesn't grow we have reached EOF
|
||||
if i == len(self.queue) or self.queue[i] is EOF:
|
||||
while True:
|
||||
# Find the longest matching prefix
|
||||
m = chars.match(self.chunk, self.chunkOffset)
|
||||
if m is None:
|
||||
# If nothing matched, and it wasn't because we ran out of chunk,
|
||||
# then stop
|
||||
if self.chunkOffset != self.chunkSize:
|
||||
break
|
||||
#XXX- wallpaper over bug in calculation below
|
||||
#Otherwise change the stream position
|
||||
if self.queue[i] == '\n':
|
||||
self.lineLengths.append(self.col)
|
||||
self.line += 1
|
||||
self.col = 0
|
||||
else:
|
||||
self.col += 1
|
||||
end = m.end()
|
||||
# If not the whole chunk matched, return everything
|
||||
# up to the part that didn't match
|
||||
if end != self.chunkSize:
|
||||
rv.append(self.chunk[self.chunkOffset:end])
|
||||
self.chunkOffset = end
|
||||
break
|
||||
# If the whole remainder of the chunk matched,
|
||||
# use it all and read the next chunk
|
||||
rv.append(self.chunk[self.chunkOffset:])
|
||||
if not self.readChunk():
|
||||
# Reached EOF
|
||||
break
|
||||
|
||||
rv = u"".join(self.queue[:i])
|
||||
self.queue = self.queue[i:]
|
||||
r = u"".join(rv)
|
||||
return r
|
||||
|
||||
#Calculate where we now are in the stream
|
||||
#One possible optimisation would be to store all read characters and
|
||||
#Calculate this on an as-needed basis (perhaps flushing the read data
|
||||
#every time we read a new chunk) rather than once per call here and
|
||||
#in .char()
|
||||
def unget(self, char):
|
||||
# Only one character is allowed to be ungotten at once - it must
|
||||
# be consumed again before any further call to unget
|
||||
|
||||
#XXX Temporarily disable this because there is a bug
|
||||
|
||||
#lines = rv.split("\n")
|
||||
#
|
||||
#if lines:
|
||||
# #Add number of lines passed onto positon
|
||||
# oldCol = self.col
|
||||
# self.line += len(lines)-1
|
||||
# if len(lines) > 1:
|
||||
# self.col = len(lines[-1])
|
||||
# else:
|
||||
# self.col += len(lines[0])
|
||||
#
|
||||
# if self.lineLengths and oldCol > 0:
|
||||
# self.lineLengths[-1] += len(lines[0])
|
||||
# lines = lines[1:-1]
|
||||
# else:
|
||||
# lines = lines[:-1]
|
||||
#
|
||||
# for line in lines:
|
||||
# self.lineLengths.append(len(line))
|
||||
#
|
||||
|
||||
return rv
|
||||
|
||||
def unget(self, chars):
|
||||
if chars:
|
||||
self.queue = list(chars) + self.queue
|
||||
#Alter the current line, col position
|
||||
for c in chars[::-1]:
|
||||
if c == '\n':
|
||||
self.line -= 1
|
||||
self.col = self.lineLengths[self.line]
|
||||
if char is not None:
|
||||
if self.chunkOffset == 0:
|
||||
# unget is called quite rarely, so it's a good idea to do
|
||||
# more work here if it saves a bit of work in the frequently
|
||||
# called char and charsUntil.
|
||||
# So, just prepend the ungotten character onto the current
|
||||
# chunk:
|
||||
self.chunk = char + self.chunk
|
||||
self.chunkSize += 1
|
||||
else:
|
||||
self.col -= 1
|
||||
self.chunkOffset -= 1
|
||||
assert self.chunk[self.chunkOffset] == char
|
||||
|
||||
class EncodingBytes(str):
|
||||
"""String-like object with an assosiated position and various extra methods
|
||||
"""String-like object with an associated position and various extra methods
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __new__(self, value):
|
||||
return str.__new__(self, value)
|
||||
|
||||
def __init__(self, value):
|
||||
str.__init__(self, value)
|
||||
self._position=-1
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
self._position += 1
|
||||
rv = self[self.position]
|
||||
return rv
|
||||
p = self._position = self._position + 1
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
return self[p]
|
||||
|
||||
def previous(self):
|
||||
p = self._position
|
||||
if p >= len(self):
|
||||
raise StopIteration
|
||||
elif p < 0:
|
||||
raise TypeError
|
||||
self._position = p = p - 1
|
||||
return self[p]
|
||||
|
||||
def setPosition(self, position):
|
||||
if self._position >= len(self):
|
||||
@ -362,20 +516,39 @@ class EncodingBytes(str):
|
||||
|
||||
currentByte = property(getCurrentByte)
|
||||
|
||||
def skip(self, chars=spaceCharacters):
|
||||
def skip(self, chars=spaceCharactersBytes):
|
||||
"""Skip past a list of characters"""
|
||||
while self.currentByte in chars:
|
||||
self.position += 1
|
||||
p = self.position # use property for the error-checking
|
||||
while p < len(self):
|
||||
c = self[p]
|
||||
if c not in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def skipUntil(self, chars):
|
||||
p = self.position
|
||||
while p < len(self):
|
||||
c = self[p]
|
||||
if c in chars:
|
||||
self._position = p
|
||||
return c
|
||||
p += 1
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def matchBytes(self, bytes, lower=False):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
data = self[self.position:self.position+len(bytes)]
|
||||
p = self.position
|
||||
data = self[p:p+len(bytes)]
|
||||
if lower:
|
||||
data = data.lower()
|
||||
rv = data.startswith(bytes)
|
||||
if rv == True:
|
||||
if rv:
|
||||
self.position += len(bytes)
|
||||
return rv
|
||||
|
||||
@ -389,12 +562,6 @@ class EncodingBytes(str):
|
||||
else:
|
||||
raise StopIteration
|
||||
|
||||
def findNext(self, byteList):
|
||||
"""Move the pointer so it points to the next byte in a set of possible
|
||||
bytes"""
|
||||
while (self.currentByte not in byteList):
|
||||
self.position += 1
|
||||
|
||||
class EncodingParser(object):
|
||||
"""Mini parser for detecting character encoding from meta elements"""
|
||||
|
||||
@ -423,8 +590,7 @@ class EncodingParser(object):
|
||||
break
|
||||
if not keepParsing:
|
||||
break
|
||||
if self.encoding is not None:
|
||||
self.encoding = self.encoding.strip()
|
||||
|
||||
return self.encoding
|
||||
|
||||
def handleComment(self):
|
||||
@ -432,7 +598,7 @@ class EncodingParser(object):
|
||||
return self.data.jumpTo("-->")
|
||||
|
||||
def handleMeta(self):
|
||||
if self.data.currentByte not in spaceCharacters:
|
||||
if self.data.currentByte not in spaceCharactersBytes:
|
||||
#if we have <meta not followed by a space so just keep going
|
||||
return True
|
||||
#We have a valid meta element we want to search for attributes
|
||||
@ -444,38 +610,41 @@ class EncodingParser(object):
|
||||
else:
|
||||
if attr[0] == "charset":
|
||||
tentativeEncoding = attr[1]
|
||||
if isValidEncoding(tentativeEncoding):
|
||||
self.encoding = tentativeEncoding
|
||||
codec = codecName(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
elif attr[0] == "content":
|
||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||
tentativeEncoding = contentParser.parse()
|
||||
if isValidEncoding(tentativeEncoding):
|
||||
self.encoding = tentativeEncoding
|
||||
codec = codecName(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
|
||||
def handlePossibleStartTag(self):
|
||||
return self.handlePossibleTag(False)
|
||||
|
||||
def handlePossibleEndTag(self):
|
||||
self.data.position+=1
|
||||
self.data.next()
|
||||
return self.handlePossibleTag(True)
|
||||
|
||||
def handlePossibleTag(self, endTag):
|
||||
if self.data.currentByte not in asciiLetters:
|
||||
data = self.data
|
||||
if data.currentByte not in asciiLettersBytes:
|
||||
#If the next byte is not an ascii letter either ignore this
|
||||
#fragment (possible start tag case) or treat it according to
|
||||
#handleOther
|
||||
if endTag:
|
||||
self.data.position -= 1
|
||||
data.previous()
|
||||
self.handleOther()
|
||||
return True
|
||||
|
||||
self.data.findNext(list(spaceCharacters) + ["<", ">"])
|
||||
if self.data.currentByte == "<":
|
||||
c = data.skipUntil(spacesAngleBrackets)
|
||||
if c == "<":
|
||||
#return to the first step in the overall "two step" algorithm
|
||||
#reprocessing the < byte
|
||||
self.data.position -= 1
|
||||
data.previous()
|
||||
else:
|
||||
#Read all attributes
|
||||
attr = self.getAttribute()
|
||||
@ -489,73 +658,75 @@ class EncodingParser(object):
|
||||
def getAttribute(self):
|
||||
"""Return a name,value pair for the next attribute in the stream,
|
||||
if one is found, or None"""
|
||||
self.data.skip(list(spaceCharacters)+["/"])
|
||||
if self.data.currentByte == "<":
|
||||
self.data.position -= 1
|
||||
data = self.data
|
||||
c = data.skip(spaceCharactersBytes | frozenset("/"))
|
||||
if c == "<":
|
||||
data.previous()
|
||||
return None
|
||||
elif self.data.currentByte == ">":
|
||||
elif c == ">" or c is None:
|
||||
return None
|
||||
attrName = []
|
||||
attrValue = []
|
||||
spaceFound = False
|
||||
#Step 5 attribute name
|
||||
while True:
|
||||
if self.data.currentByte == "=" and attrName:
|
||||
if c == "=" and attrName:
|
||||
break
|
||||
elif self.data.currentByte in spaceCharacters:
|
||||
elif c in spaceCharactersBytes:
|
||||
spaceFound=True
|
||||
break
|
||||
elif self.data.currentByte in ("/", "<", ">"):
|
||||
elif c in ("/", "<", ">"):
|
||||
return "".join(attrName), ""
|
||||
elif self.data.currentByte in asciiUppercase:
|
||||
attrName.extend(self.data.currentByte.lower())
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrName.append(c.lower())
|
||||
else:
|
||||
attrName.extend(self.data.currentByte)
|
||||
attrName.append(c)
|
||||
#Step 6
|
||||
self.data.position += 1
|
||||
c = data.next()
|
||||
#Step 7
|
||||
if spaceFound:
|
||||
self.data.skip()
|
||||
c = data.skip()
|
||||
#Step 8
|
||||
if self.data.currentByte != "=":
|
||||
self.data.position -= 1
|
||||
if c != "=":
|
||||
data.previous()
|
||||
return "".join(attrName), ""
|
||||
#XXX need to advance position in both spaces and value case
|
||||
#Step 9
|
||||
self.data.position += 1
|
||||
data.next()
|
||||
#Step 10
|
||||
self.data.skip()
|
||||
c = data.skip()
|
||||
#Step 11
|
||||
if self.data.currentByte in ("'", '"'):
|
||||
if c in ("'", '"'):
|
||||
#11.1
|
||||
quoteChar = self.data.currentByte
|
||||
quoteChar = c
|
||||
while True:
|
||||
self.data.position+=1
|
||||
#11.3
|
||||
if self.data.currentByte == quoteChar:
|
||||
self.data.position += 1
|
||||
c = data.next()
|
||||
if c == quoteChar:
|
||||
data.next()
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
#11.4
|
||||
elif self.data.currentByte in asciiUppercase:
|
||||
attrValue.extend(self.data.currentByte.lower())
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
#11.5
|
||||
else:
|
||||
attrValue.extend(self.data.currentByte)
|
||||
elif self.data.currentByte in (">", '<'):
|
||||
attrValue.append(c)
|
||||
elif c in (">", "<"):
|
||||
return "".join(attrName), ""
|
||||
elif self.data.currentByte in asciiUppercase:
|
||||
attrValue.extend(self.data.currentByte.lower())
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.extend(self.data.currentByte)
|
||||
attrValue.append(c)
|
||||
while True:
|
||||
self.data.position +=1
|
||||
if self.data.currentByte in (
|
||||
list(spaceCharacters) + [">", '<']):
|
||||
c = data.next()
|
||||
if c in spacesAngleBrackets:
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
elif self.data.currentByte in asciiUppercase:
|
||||
attrValue.extend(self.data.currentByte.lower())
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
else:
|
||||
attrValue.extend(self.data.currentByte)
|
||||
attrValue.append(c)
|
||||
|
||||
|
||||
class ContentAttrParser(object):
|
||||
@ -588,7 +759,7 @@ class ContentAttrParser(object):
|
||||
#Unquoted value
|
||||
oldPosition = self.data.position
|
||||
try:
|
||||
self.data.findNext(spaceCharacters)
|
||||
self.data.skipUntil(spaceCharactersBytes)
|
||||
return self.data[oldPosition:self.data.position]
|
||||
except StopIteration:
|
||||
#Return the whole remaining value
|
||||
@ -596,7 +767,12 @@ class ContentAttrParser(object):
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
def isValidEncoding(encoding):
|
||||
"""Determine if a string is a supported encoding"""
|
||||
return (encoding is not None and type(encoding) == types.StringType and
|
||||
encoding.lower().strip() in encodings)
|
||||
|
||||
def codecName(encoding):
|
||||
"""Return the python codec name corresponding to an encoding or None if the
|
||||
string doesn't correspond to a valid encoding."""
|
||||
if (encoding is not None and type(encoding) in types.StringTypes):
|
||||
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
||||
return encodings.get(canonicalName, None)
|
||||
else:
|
||||
return None
|
||||
|
147
planet/vendor/html5lib/liberalxmlparser.py
vendored
147
planet/vendor/html5lib/liberalxmlparser.py
vendored
@ -1,147 +0,0 @@
|
||||
"""
|
||||
Warning: this module is experimental and subject to change and even removal
|
||||
at any time.
|
||||
|
||||
For background/rationale, see:
|
||||
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||
* http://tinyurl.com/ylfj8k (and follow-ups)
|
||||
|
||||
References:
|
||||
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
|
||||
@@TODO:
|
||||
* Selectively lowercase only XHTML, but not foreign markup
|
||||
"""
|
||||
|
||||
import html5parser
|
||||
from constants import voidElements, contentModelFlags
|
||||
|
||||
from xml.dom import XHTML_NAMESPACE
|
||||
from xml.sax.saxutils import unescape
|
||||
|
||||
class XMLParser(html5parser.HTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
|
||||
if token["type"] in ("StartTag", "EmptyTag"):
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
save = self.tokenizer.contentModelFlag
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
self.tokenizer.contentModelFlag = save
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
elif token["type"] == "Characters":
|
||||
# un-escape rcdataElements (e.g. style, script)
|
||||
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
|
||||
token["data"] = unescape(token["data"])
|
||||
|
||||
elif token["type"] == "Comment":
|
||||
# Rescue CDATA from the comments
|
||||
if (token["data"].startswith("[CDATA[") and
|
||||
token["data"].endswith("]]")):
|
||||
token["type"] = "Characters"
|
||||
token["data"] = token["data"][7:-2]
|
||||
|
||||
return token
|
||||
|
||||
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
|
||||
**kwargs):
|
||||
|
||||
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
|
||||
encoding, lowercaseElementName=False,
|
||||
lowercaseAttrName=False)
|
||||
|
||||
class XHTMLParser(XMLParser):
|
||||
""" liberal XMTHML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["initial"] = XmlInitialPhase(self, self.tree)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
token = XMLParser.normalizeToken(self, token)
|
||||
|
||||
# ensure that non-void XHTML elements have content so that separate
|
||||
# open and close tags are emitted
|
||||
if token["type"] == "EndTag":
|
||||
if token["name"] in voidElements:
|
||||
if not self.tree.openElements or \
|
||||
self.tree.openElements[-1].name != token["name"]:
|
||||
token["type"] = "EmptyTag"
|
||||
if not token.has_key("data"): token["data"] = {}
|
||||
else:
|
||||
if token["name"] == self.tree.openElements[-1].name and \
|
||||
not self.tree.openElements[-1].hasContent():
|
||||
for e in self.tree.openElements:
|
||||
if 'xmlns' in e.attributes.keys():
|
||||
if e.attributes['xmlns'] != XHTML_NAMESPACE:
|
||||
break
|
||||
else:
|
||||
self.tree.insertText('')
|
||||
|
||||
return token
|
||||
|
||||
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
def insertHtmlElement(self):
|
||||
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
||||
self.tree.openElements.append(element)
|
||||
self.tree.document.appendChild(element)
|
||||
self.parser.phase = self.parser.phases["beforeHead"]
|
||||
|
||||
class XmlInitialPhase(html5parser.InitialPhase):
|
||||
""" Consume XML Prologs """
|
||||
def processComment(self, data):
|
||||
if not data.startswith('?xml') or not data.endswith('?'):
|
||||
html5parser.InitialPhase.processComment(self, data)
|
||||
|
||||
class XmlRootPhase(html5parser.Phase):
|
||||
""" Consume XML Prologs """
|
||||
def processComment(self, data):
|
||||
print repr(data)
|
||||
if not data.startswith('?xml') or not data.endswith('?'):
|
||||
html5parser.InitialPhase.processComment(self, data)
|
||||
|
||||
""" Prime the Xml parser """
|
||||
def __getattr__(self, name):
|
||||
self.tree.openElements.append(self.tree.document)
|
||||
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
||||
return getattr(self.parser.phase, name)
|
||||
|
||||
class XmlElementPhase(html5parser.Phase):
|
||||
""" Generic handling for all XML elements """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.Phase.__init__(self, *args, **kwargs)
|
||||
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
|
||||
def endTagOther(self, name):
|
||||
for node in self.tree.openElements[::-1]:
|
||||
if node.name == name:
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
self.parser.parseError()
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.tree.insertText(data)
|
58
planet/vendor/html5lib/sanitizer.py
vendored
58
planet/vendor/html5lib/sanitizer.py
vendored
@ -1,6 +1,8 @@
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
from tokenizer import HTMLTokenizer
|
||||
from constants import tokenTypes
|
||||
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||
'font-family', 'font-size', 'font-stretch', 'font-style',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
|
||||
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
|
||||
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
||||
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
||||
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
||||
@ -83,6 +85,13 @@ class HTMLSanitizerMixin(object):
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||
'xlink:href', 'xml:base']
|
||||
|
||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
|
||||
|
||||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
|
||||
'radialGradient', 'textpath', 'tref', 'set', 'use']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
|
||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||
tokenTypes["EmptyTag"]):
|
||||
if token["name"] in self.allowed_elements:
|
||||
if token.has_key("data"):
|
||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||
attrs = dict([(name,val) for name,val in
|
||||
token["data"][::-1]
|
||||
if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if not attrs.has_key(attr): continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||
if not attrs.has_key(attr):
|
||||
continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
|
||||
(val_unescaped.split(':')[0] not in
|
||||
self.allowed_protocols)):
|
||||
del attrs[attr]
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token["name"] in self.svg_allow_local_href and
|
||||
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
||||
attrs['xlink:href'])):
|
||||
del attrs['xlink:href']
|
||||
if attrs.has_key('style'):
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||
return token
|
||||
else:
|
||||
if token["type"] == "EndTag":
|
||||
if token["type"] == tokenTypes["EndTag"]:
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token["type"] == "EmptyTag":
|
||||
if token["type"] == tokenTypes["EmptyTag"]:
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
token["type"] = "Characters"
|
||||
token["type"] = tokenTypes["Characters"]
|
||||
del token["name"]
|
||||
return token
|
||||
elif token["type"] == "Comment":
|
||||
elif token["type"] == tokenTypes["Comment"]:
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
|
||||
|
||||
# gauntlet
|
||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
||||
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
|
||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
|
||||
|
||||
clean = []
|
||||
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
||||
if not value: continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
|
||||
elif prop.split('-')[0].lower() in ['background','border','margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if not keyword in self.acceptable_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
||||
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
|
||||
return ' '.join(clean)
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True,
|
||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||
lowercaseElementName=False, lowercaseAttrName=False):
|
||||
#Change case matching defaults as we only output lowercase html anyway
|
||||
#This solution doesn't seem ideal...
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||
lowercaseElementName, lowercaseAttrName)
|
||||
|
||||
def __iter__(self):
|
||||
|
14
planet/vendor/html5lib/serializer/__init__.py
vendored
14
planet/vendor/html5lib/serializer/__init__.py
vendored
@ -1,3 +1,17 @@
|
||||
|
||||
from html5lib import treewalkers
|
||||
|
||||
from htmlserializer import HTMLSerializer
|
||||
from xhtmlserializer import XHTMLSerializer
|
||||
|
||||
def serialize(input, tree="simpletree", format="html", encoding=None,
              **serializer_opts):
    """Serialize a tree to markup with the serializer selected by format.

    input -- the tree to serialize; it is handed to the tree walker.
    tree -- tree type name, passed to treewalkers.getTreeWalker().
    format -- "html" selects HTMLSerializer, "xhtml" selects
              XHTMLSerializer.
    encoding -- passed through unchanged to the serializer's render().
    serializer_opts -- keyword options forwarded to the serializer
                       constructor.

    Returns whatever the chosen serializer's render() returns.
    Raises ValueError when format is neither "html" nor "xhtml".
    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
        # Name the parameter the way callers spell it ("format").
        raise ValueError("format must be either html or xhtml")
    return s.render(walker(input), encoding)
|
||||
|
@ -147,7 +147,7 @@ class HTMLSerializer(object):
|
||||
quote_attr = True
|
||||
else:
|
||||
quote_attr = reduce(lambda x,y: x or (y in v),
|
||||
spaceCharacters + "<>\"'", False)
|
||||
spaceCharacters + ">\"'=", False)
|
||||
v = v.replace("&", "&")
|
||||
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
||||
if encoding:
|
||||
|
1058
planet/vendor/html5lib/tokenizer.py
vendored
1058
planet/vendor/html5lib/tokenizer.py
vendored
File diff suppressed because it is too large
Load Diff
28
planet/vendor/html5lib/treebuilders/__init__.py
vendored
28
planet/vendor/html5lib/treebuilders/__init__.py
vendored
@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"dom" - A generic builder for DOM implementations, defaulting to
|
||||
a xml.dom.minidom based implementation for the sake of
|
||||
backwards compatibility (as releases up until 0.10 had a
|
||||
builder called "dom" that was a minidom implemenation).
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
lxml.etree."""
|
||||
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
||||
module implementing the tree type e.g.
|
||||
xml.etree.ElementTree or lxml.etree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeBuilderCache:
|
||||
if treeType in ("dom", "simpletree"):
|
||||
mod = __import__(treeType, globals())
|
||||
treeBuilderCache[treeType] = mod.TreeBuilder
|
||||
if treeType == "dom":
|
||||
import dom
|
||||
# XXX: Keep backwards compatibility by using minidom if no implementation is given
|
||||
if implementation == None:
|
||||
from xml.dom import minidom
|
||||
implementation = minidom
|
||||
# XXX: NEVER cache here, caching is done in the dom submodule
|
||||
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
||||
elif treeType == "simpletree":
|
||||
import simpletree
|
||||
treeBuilderCache[treeType] = simpletree.TreeBuilder
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||
elif treeType == "lxml":
|
||||
import etree_lxml
|
||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
|
71
planet/vendor/html5lib/treebuilders/_base.py
vendored
71
planet/vendor/html5lib/treebuilders/_base.py
vendored
@ -1,3 +1,4 @@
|
||||
import warnings
|
||||
from html5lib.constants import scopingElements, tableInsertModeElements
|
||||
try:
|
||||
frozenset
|
||||
@ -11,9 +12,6 @@ except NameError:
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = None
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like
|
||||
# rather than DOM-like
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, name):
|
||||
"""Node representing an item in the tree.
|
||||
@ -43,7 +41,7 @@ class Node(object):
|
||||
return "<%s>"%(self.name)
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s %s>" % (self.__class__, self.name)
|
||||
return "<%s>" % (self.name)
|
||||
|
||||
def appendChild(self, node):
|
||||
"""Insert node as a child of the current node
|
||||
@ -112,7 +110,12 @@ class TreeBuilder(object):
|
||||
#Fragment class
|
||||
fragmentClass = None
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, namespaceHTMLElements):
|
||||
if namespaceHTMLElements:
|
||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||
else:
|
||||
self.defaultNamespace = None
|
||||
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
@ -140,7 +143,8 @@ class TreeBuilder(object):
|
||||
return True
|
||||
elif node.name == "table":
|
||||
return False
|
||||
elif not tableVariant and node.name in scopingElements:
|
||||
elif (not tableVariant and (node.nameTuple in
|
||||
scopingElements)):
|
||||
return False
|
||||
elif node.name == "html":
|
||||
return False
|
||||
@ -179,7 +183,10 @@ class TreeBuilder(object):
|
||||
clone = self.activeFormattingElements[i].cloneNode()
|
||||
|
||||
# Step 9
|
||||
element = self.insertElement(clone.name, clone.attributes)
|
||||
element = self.insertElement({"type":"StartTag",
|
||||
"name":clone.name,
|
||||
"namespace":clone.namespace,
|
||||
"data":clone.attributes})
|
||||
|
||||
# Step 10
|
||||
self.activeFormattingElements[i] = element
|
||||
@ -207,21 +214,30 @@ class TreeBuilder(object):
|
||||
return item
|
||||
return False
|
||||
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
doctype = self.doctypeClass(name)
|
||||
doctype.publicId = publicId
|
||||
doctype.systemId = systemId
|
||||
def insertRoot(self, token):
|
||||
element = self.createElement(token)
|
||||
self.openElements.append(element)
|
||||
self.document.appendChild(element)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.document.appendChild(doctype)
|
||||
|
||||
def insertComment(self, data, parent=None):
|
||||
def insertComment(self, token, parent=None):
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
parent.appendChild(self.commentClass(data))
|
||||
parent.appendChild(self.commentClass(token["data"]))
|
||||
|
||||
def createElement(self, name, attributes):
|
||||
def createElement(self, token):
|
||||
"""Create an element but don't insert it anywhere"""
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
return element
|
||||
|
||||
def _getInsertFromTable(self):
|
||||
@ -238,19 +254,20 @@ class TreeBuilder(object):
|
||||
|
||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||
|
||||
def insertElementNormal(self, name, attributes):
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
def insertElementNormal(self, token):
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
element = self.elementClass(name, namespace)
|
||||
element.attributes = token["data"]
|
||||
self.openElements[-1].appendChild(element)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertElementTable(self, name, attributes):
|
||||
def insertElementTable(self, token):
|
||||
"""Create an element and insert it into the tree"""
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
element = self.createElement(token)
|
||||
if self.openElements[-1].name not in tableInsertModeElements:
|
||||
return self.insertElementNormal(name, attributes)
|
||||
return self.insertElementNormal(token)
|
||||
else:
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
@ -267,9 +284,9 @@ class TreeBuilder(object):
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
|
||||
if (not(self.insertFromTable) or (self.insertFromTable and
|
||||
self.openElements[-1].name not in
|
||||
tableInsertModeElements)):
|
||||
if (not self.insertFromTable or (self.insertFromTable and
|
||||
self.openElements[-1].name
|
||||
not in tableInsertModeElements)):
|
||||
parent.insertText(data)
|
||||
else:
|
||||
# We should be in the InTable mode. This means we want to do
|
||||
@ -287,7 +304,7 @@ class TreeBuilder(object):
|
||||
fosterParent = None
|
||||
insertBefore = None
|
||||
for elm in self.openElements[::-1]:
|
||||
if elm.name == u"table":
|
||||
if elm.name == "table":
|
||||
lastTable = elm
|
||||
break
|
||||
if lastTable:
|
||||
|
129
planet/vendor/html5lib/treebuilders/dom.py
vendored
129
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -1,35 +1,66 @@
|
||||
import _base
|
||||
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
|
||||
import new
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
|
||||
import _base
|
||||
from html5lib import constants, ihatexml
|
||||
from html5lib.constants import namespaces
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getDomModule(DomImplementation):
    """Return the builder module for DomImplementation, building it once.

    The module is named "_<DomImplementation.__name__>builder", filled with
    the objects returned by getDomBuilder() and remembered in moduleCache so
    later calls for the same implementation get the same module object.
    """
    cacheKey = "_" + DomImplementation.__name__ + "builder"
    if cacheKey not in moduleCache:
        builderModule = new.module(cacheKey)
        builderModule.__dict__.update(getDomBuilder(DomImplementation))
        # Cache only after the module is fully populated.
        moduleCache[cacheKey] = builderModule
    return moduleCache[cacheKey]
|
||||
|
||||
def getDomBuilder(DomImplementation):
|
||||
Dom = DomImplementation
|
||||
infoset_filter = ihatexml.InfosetFilter()
|
||||
class AttrList:
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
def __iter__(self):
|
||||
return self.element.attributes.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||
self.element.setAttribute(name, value)
|
||||
self.element.setAttribute(infoset_filter.coerceAttribute(name),
|
||||
infoset_filter.coerceCharacters(value))
|
||||
def items(self):
|
||||
return self.element.attributes.items()
|
||||
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
|
||||
self.element.attributes.items()]
|
||||
def keys(self):
|
||||
return self.element.attributes.keys()
|
||||
return [infoset_filter.fromXmlName(item) for item in
|
||||
self.element.attributes.keys()]
|
||||
def __getitem__(self, name):
|
||||
name = infoset_filter.toXmlName(name)
|
||||
return self.element.getAttribute(name)
|
||||
|
||||
def __contains__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.hasAttribute(infoset_filter.toXmlName(name))
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __init__(self, element):
|
||||
_base.Node.__init__(self, element.nodeName)
|
||||
_base.Node.__init__(self, element.localName)
|
||||
self.element = element
|
||||
|
||||
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
|
||||
and self.element.namespaceURI or None)
|
||||
|
||||
def appendChild(self, node):
|
||||
node.parent = self
|
||||
self.element.appendChild(node.element)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||
data=infoset_filter.coerceCharacters(data)
|
||||
text = self.element.ownerDocument.createTextNode(data)
|
||||
if insertBefore:
|
||||
self.element.insertBefore(text, insertBefore.element)
|
||||
@ -58,9 +89,19 @@ class NodeBuilder(_base.Node):
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in attributes.items():
|
||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||
self.element.setAttribute(name, value)
|
||||
|
||||
if isinstance(name, tuple):
|
||||
if name[0] is not None:
|
||||
qualifiedName = (name[0] + ":" +
|
||||
infoset_filter.coerceAttribute(
|
||||
name[1]))
|
||||
else:
|
||||
qualifiedName = infoset_filter.coerceAttribute(
|
||||
name[1])
|
||||
self.element.setAttributeNS(name[2], qualifiedName,
|
||||
value)
|
||||
else:
|
||||
self.element.setAttribute(
|
||||
infoset_filter.coerceAttribute(name), value)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def cloneNode(self):
|
||||
@ -69,19 +110,37 @@ class NodeBuilder(_base.Node):
|
||||
def hasContent(self):
|
||||
return self.element.hasChildNodes()
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
|
||||
return self
|
||||
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
domimpl = minidom.getDOMImplementation()
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
domimpl = Dom.getDOMImplementation()
|
||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||
self.document.appendChild(NodeBuilder(doctype))
|
||||
if Dom == minidom:
|
||||
doctype.ownerDocument = self.dom
|
||||
|
||||
def elementClass(self, name):
|
||||
return NodeBuilder(self.dom.createElement(name))
|
||||
def elementClass(self, name, namespace=None):
|
||||
if namespace is None and self.defaultNamespace is None:
|
||||
node = self.dom.createElement(name)
|
||||
else:
|
||||
node = self.dom.createElementNS(namespace, name)
|
||||
|
||||
return NodeBuilder(node)
|
||||
|
||||
def commentClass(self, data):
|
||||
return NodeBuilder(self.dom.createComment(data))
|
||||
@ -102,7 +161,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||
data=infoset_filter.coerceCharacters(data)
|
||||
if parent <> self:
|
||||
_base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
@ -121,6 +180,12 @@ def testSerializer(element):
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
if element.name:
|
||||
if element.publicId or element.systemId:
|
||||
publicId = element.publicId or ""
|
||||
systemId = element.systemId or ""
|
||||
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
|
||||
' '*indent, element.name, publicId, systemId))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
|
||||
@ -133,9 +198,26 @@ def testSerializer(element):
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
||||
if (hasattr(element, "namespaceURI") and
|
||||
element.namespaceURI not in (None,
|
||||
constants.namespaces["html"])):
|
||||
name = "%s %s"%(constants.prefixes[element.namespaceURI],
|
||||
element.nodeName)
|
||||
else:
|
||||
name = element.nodeName
|
||||
rv.append("|%s<%s>"%(' '*indent, name))
|
||||
if element.hasAttributes():
|
||||
for name, value in element.attributes.items():
|
||||
i = 0
|
||||
attr = element.attributes.item(i)
|
||||
while attr:
|
||||
name = infoset_filter.fromXmlName(attr.localName)
|
||||
value = attr.value
|
||||
ns = attr.namespaceURI
|
||||
if ns:
|
||||
name = "%s %s"%(constants.prefixes[ns], name)
|
||||
i += 1
|
||||
attr = element.attributes.item(i)
|
||||
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
for child in element.childNodes:
|
||||
@ -201,3 +283,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
# DOCUMENT_TYPE_NODE
|
||||
# NOTATION_NODE
|
||||
pass
|
||||
|
||||
return locals()
|
||||
|
||||
# Keep backwards compatibility with things that directly load
|
||||
# classes/functions from this module
|
||||
for key, value in getDomModule(minidom).__dict__.items():
|
||||
globals()[key] = value
|
||||
|
94
planet/vendor/html5lib/treebuilders/etree.py
vendored
94
planet/vendor/html5lib/treebuilders/etree.py
vendored
@ -1,5 +1,12 @@
|
||||
import _base
|
||||
import new
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib import ihatexml
|
||||
from html5lib import constants
|
||||
from html5lib.constants import namespaces
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
@ -17,21 +24,44 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
|
||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
ElementTree = ElementTreeImplementation
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name):
|
||||
self._element = ElementTree.Element(name)
|
||||
self.name = name
|
||||
def __init__(self, name, namespace=None):
|
||||
self._name = name
|
||||
self._namespace = namespace
|
||||
self._element = ElementTree.Element(self._getETreeTag(name,
|
||||
namespace))
|
||||
if namespace is None:
|
||||
self.nameTuple = namespaces["html"], self._name
|
||||
else:
|
||||
self.nameTuple = self._namespace, self._name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getETreeTag(self, name, namespace):
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s"%(namespace, name)
|
||||
return etree_tag
|
||||
|
||||
def _setName(self, name):
|
||||
self._element.tag = name
|
||||
self._name = name
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return self._element.tag
|
||||
return self._name
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _setNamespace(self, namespace):
|
||||
self._namespace = namespace
|
||||
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||
|
||||
def _getNamespace(self):
|
||||
return self._namespace
|
||||
|
||||
namespace = property(_getNamespace, _setNamespace)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
self._element.set(key, value)
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], key[1])
|
||||
else:
|
||||
name = key
|
||||
self._element.set(name, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
Element.__init__(self, "<!DOCTYPE>")
|
||||
self._element.text = name
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
def _getPublicId(self):
|
||||
return self._element.get(u"publicId", None)
|
||||
return self._element.get(u"publicId", "")
|
||||
|
||||
def _setPublicId(self, value):
|
||||
if value is not None:
|
||||
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
publicId = property(_getPublicId, _setPublicId)
|
||||
|
||||
def _getSystemId(self):
|
||||
return self._element.get(u"systemId", None)
|
||||
return self._element.get(u"systemId", "")
|
||||
|
||||
def _setSystemId(self, value):
|
||||
if value is not None:
|
||||
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
if not(hasattr(element, "tag")):
|
||||
element = element.getroot()
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
elif type(element.tag) == type(ElementTree.Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||
nsmatch = tag_regexp.match(element.tag)
|
||||
|
||||
if nsmatch is None:
|
||||
name = element.tag
|
||||
else:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
if prefix != "html":
|
||||
name = "%s %s"%(prefix, name)
|
||||
rv.append("|%s<%s>"%(' '*indent, name))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
nsmatch = tag_regexp.match(name)
|
||||
if nsmatch is not None:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
name = "%s %s"%(prefix, name)
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
@ -201,11 +257,18 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
filter = ihatexml.InfosetFilter()
|
||||
def serializeElement(element):
|
||||
if type(element) == type(ElementTree.ElementTree):
|
||||
element = element.getroot()
|
||||
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
if element.get("publicId") or element.get("systemId"):
|
||||
publicId = element.get("publicId") or ""
|
||||
systemId = element.get("systemId") or ""
|
||||
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
|
||||
element.text, publicId, systemId))
|
||||
else:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
if element.text:
|
||||
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
attr = " ".join(["%s=\"%s\""%(
|
||||
filter.fromXmlName(name), value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
|
331
planet/vendor/html5lib/treebuilders/etree_lxml.py
vendored
Normal file
331
planet/vendor/html5lib/treebuilders/etree_lxml.py
vendored
Normal file
@ -0,0 +1,331 @@
|
||||
import new
|
||||
import warnings
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import DataLossWarning
|
||||
import html5lib.constants as constants
|
||||
import etree as etree_builders
|
||||
from html5lib import ihatexml
|
||||
|
||||
try:
|
||||
import lxml.etree as etree
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
fullTree = True
|
||||
|
||||
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
||||
of the native library as possible, without using fragile hacks like custom element
|
||||
names that break between releases. The downside of this is that we cannot represent
|
||||
all possible trees; specifically the following are known to cause problems:
|
||||
|
||||
Text or comments as siblings of the root element
|
||||
Doctypes with no name
|
||||
|
||||
When any of these things occur, we emit a DataLossWarning
|
||||
"""
|
||||
|
||||
class DocumentType(object):
    """Plain record holding the three parts of a doctype declaration."""

    def __init__(self, name, publicId, systemId):
        # Values are stored as given; nothing is validated or coerced here.
        self.name, self.publicId, self.systemId = name, publicId, systemId
|
||||
|
||||
class Document(object):
    """Document/fragment stand-in used by the lxml tree builder.

    _elementTree starts out as None; appendChild() dereferences it, so it
    has to be assigned before any child is appended.
    """

    def __init__(self):
        self._childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Add element's wrapped node as the next sibling of the tree root."""
        root = self._elementTree.getroot()
        root.addnext(element._element)

    def _getChildNodes(self):
        """Return the list created in __init__."""
        return self._childNodes

    # Read-only: no setter is supplied.
    childNodes = property(_getChildNodes)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
filter = ihatexml.InfosetFilter()
|
||||
def serializeElement(element, indent=0):
|
||||
if not hasattr(element, "tag"):
|
||||
if hasattr(element, "getroot"):
|
||||
#Full tree case
|
||||
rv.append("#document")
|
||||
if element.docinfo.internalDTD:
|
||||
if not (element.docinfo.public_id or
|
||||
element.docinfo.system_url):
|
||||
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
|
||||
else:
|
||||
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
element.docinfo.root_name,
|
||||
element.docinfo.public_id,
|
||||
element.docinfo.system_url)
|
||||
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
|
||||
next_element = element.getroot()
|
||||
while next_element.getprevious() is not None:
|
||||
next_element = next_element.getprevious()
|
||||
while next_element is not None:
|
||||
serializeElement(next_element, indent+2)
|
||||
next_element = next_element.getnext()
|
||||
elif isinstance(element, basestring):
|
||||
#Text in a fragment
|
||||
rv.append("|%s\"%s\""%(' '*indent, element))
|
||||
else:
|
||||
#Fragment case
|
||||
rv.append("#document-fragment")
|
||||
for next_element in element:
|
||||
serializeElement(next_element, indent+2)
|
||||
elif type(element.tag) == type(etree.Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
||||
if nsmatch is not None:
|
||||
ns = nsmatch.group(1)
|
||||
tag = nsmatch.group(2)
|
||||
prefix = constants.prefixes[ns]
|
||||
if prefix != "html":
|
||||
rv.append("|%s<%s %s>"%(' '*indent, prefix,
|
||||
filter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent,
|
||||
filter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent,
|
||||
filter.fromXmlName(element.tag)))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
nsmatch = etree_builders.tag_regexp.match(name)
|
||||
if nsmatch:
|
||||
ns = nsmatch.group(1)
|
||||
name = nsmatch.group(2)
|
||||
prefix = constants.prefixes[ns]
|
||||
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
|
||||
prefix,
|
||||
filter.fromXmlName(name),
|
||||
value))
|
||||
else:
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2),
|
||||
filter.fromXmlName(name),
|
||||
value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if hasattr(element, "tail") and element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
    """Serialize an element and its child nodes to a string.

    element -- either a tree object (no "tag" attribute; its docinfo and
               getroot() are used) or a single element/comment node.

    Returns the concatenated markup. Text, tails and attribute values are
    emitted verbatim; no escaping is performed here.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Tree object: emit the doctype when there is one, then the root.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element.getchildren():
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
|
||||
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = None
|
||||
commentClass = None
|
||||
fragmentClass = Document
|
||||
|
||||
def __init__(self, namespaceHTMLElements, fullTree = False):
|
||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||
filter = self.filter = ihatexml.InfosetFilter()
|
||||
self.namespaceHTMLElements = namespaceHTMLElements
|
||||
|
||||
class Attributes(dict):
|
||||
def __init__(self, element, value={}):
|
||||
self._element = element
|
||||
dict.__init__(self, value)
|
||||
for key, value in self.iteritems():
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
||||
else:
|
||||
name = filter.coerceAttribute(key)
|
||||
self._element._element.attrib[name] = value
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
dict.__setitem__(self, key, value)
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
|
||||
else:
|
||||
name = filter.coerceAttribute(key)
|
||||
self._element._element.attrib[name] = value
|
||||
|
||||
class Element(builder.Element):
|
||||
def __init__(self, name, namespace):
|
||||
name = filter.coerceElement(name)
|
||||
builder.Element.__init__(self, name, namespace=namespace)
|
||||
self._attributes = Attributes(self)
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = filter.coerceElement(name)
|
||||
self._element.tag = self._getETreeTag(
|
||||
self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return self._name
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._attributes
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
self._attributes = Attributes(self, attributes)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data = filter.coerceCharacters(data)
|
||||
builder.Element.insertText(self, data, insertBefore)
|
||||
|
||||
def appendChild(self, child):
|
||||
builder.Element.appendChild(self, child)
|
||||
|
||||
|
||||
class Comment(builder.Comment):
|
||||
def __init__(self, data):
|
||||
data = filter.coerceComment(data)
|
||||
builder.Comment.__init__(self, data)
|
||||
|
||||
def _setData(self, data):
|
||||
data = filter.coerceComment(data)
|
||||
self._element.text = data
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
self.elementClass = Element
|
||||
self.commentClass = builder.Comment
|
||||
#self.fragmentClass = builder.DocumentFragment
|
||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def reset(self):
|
||||
_base.TreeBuilder.reset(self)
|
||||
self.insertComment = self.insertCommentInitial
|
||||
self.initial_comments = []
|
||||
self.doctype = None
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._elementTree
|
||||
else:
|
||||
return self.document._elementTree.getroot()
|
||||
|
||||
def getFragment(self):
|
||||
fragment = []
|
||||
element = self.openElements[0]._element
|
||||
if element.text:
|
||||
fragment.append(element.text)
|
||||
fragment.extend(element.getchildren())
|
||||
if element.tail:
|
||||
fragment.append(element.tail)
|
||||
return fragment
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if not name or ihatexml.nonXmlBMPRegexp.search(name):
|
||||
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.doctype = doctype
|
||||
|
||||
def insertCommentInitial(self, data, parent=None):
|
||||
self.initial_comments.append(data)
|
||||
|
||||
def insertRoot(self, token):
|
||||
"""Create the document root"""
|
||||
#Because of the way libxml2 works, it doesn't seem to be possible to
|
||||
#alter information like the doctype after the tree has been parsed.
|
||||
#Therefore we need to use the built-in parser to create our iniial
|
||||
#tree, after which we can add elements like normal
|
||||
docStr = ""
|
||||
if self.doctype and self.doctype.name:
|
||||
docStr += "<!DOCTYPE %s"%self.doctype.name
|
||||
if (self.doctype.publicId is not None or
|
||||
self.doctype.systemId is not None):
|
||||
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
|
||||
self.doctype.systemId or "")
|
||||
docStr += ">"
|
||||
#TODO - this needs to work when elements are not put into the default ns
|
||||
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
|
||||
|
||||
try:
|
||||
root = etree.fromstring(docStr)
|
||||
except etree.XMLSyntaxError:
|
||||
print docStr
|
||||
raise
|
||||
|
||||
#Append the initial comments:
|
||||
for comment_token in self.initial_comments:
|
||||
root.addprevious(etree.Comment(comment_token["data"]))
|
||||
|
||||
#Create the root document and add the ElementTree to it
|
||||
self.document = self.documentClass()
|
||||
self.document._elementTree = root.getroottree()
|
||||
|
||||
#Add the root element to the internal child/open data structures
|
||||
namespace = token.get("namespace", None)
|
||||
root_element = self.elementClass(token["name"], namespace)
|
||||
root_element._element = root
|
||||
self.document._childNodes.append(root_element)
|
||||
self.openElements.append(root_element)
|
||||
|
||||
#Reset to the default insert comment function
|
||||
self.insertComment = super(TreeBuilder, self).insertComment
|
@ -1,5 +1,5 @@
|
||||
import _base
|
||||
from html5lib.constants import voidElements
|
||||
from html5lib.constants import voidElements, namespaces, prefixes
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
@ -63,6 +63,8 @@ class Node(_base.Node):
|
||||
|
||||
def cloneNode(self):
|
||||
newNode = type(self)(self.name)
|
||||
if hasattr(self, 'namespace'):
|
||||
newNode.namespace = self.namespace
|
||||
if hasattr(self, 'attributes'):
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
@ -73,6 +75,14 @@ class Node(_base.Node):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self.childNodes)
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class Document(Node):
|
||||
type = 1
|
||||
def __init__(self):
|
||||
@ -81,6 +91,9 @@ class Document(Node):
|
||||
def __unicode__(self):
|
||||
return "#document"
|
||||
|
||||
def appendChild(self, child):
|
||||
Node.appendChild(self, child)
|
||||
|
||||
def toxml(self, encoding="utf=8"):
|
||||
result = ""
|
||||
for child in self.childNodes:
|
||||
@ -106,14 +119,22 @@ class DocumentFragment(Document):
|
||||
|
||||
class DocumentType(Node):
|
||||
type = 3
|
||||
def __init__(self, name):
|
||||
def __init__(self, name, publicId, systemId):
|
||||
Node.__init__(self, name)
|
||||
self.publicId = u""
|
||||
self.systemId = u""
|
||||
self.publicId = publicId
|
||||
self.systemId = systemId
|
||||
|
||||
def __unicode__(self):
|
||||
if self.publicId or self.systemId:
|
||||
publicId = self.publicId or ""
|
||||
systemId = self.systemId or ""
|
||||
return """<!DOCTYPE %s "%s" "%s">"""%(
|
||||
self.name, publicId, systemId)
|
||||
|
||||
else:
|
||||
return u"<!DOCTYPE %s>" % self.name
|
||||
|
||||
|
||||
toxml = __unicode__
|
||||
|
||||
def hilite(self):
|
||||
@ -135,12 +156,16 @@ class TextNode(Node):
|
||||
|
||||
class Element(Node):
|
||||
type = 5
|
||||
def __init__(self, name):
|
||||
def __init__(self, name, namespace=None):
|
||||
Node.__init__(self, name)
|
||||
self.namespace = namespace
|
||||
self.attributes = {}
|
||||
|
||||
def __unicode__(self):
|
||||
if self.namespace in (None, namespaces["html"]):
|
||||
return u"<%s>" % self.name
|
||||
else:
|
||||
return u"<%s %s>"%(prefixes[self.namespace], self.name)
|
||||
|
||||
def toxml(self):
|
||||
result = '<' + self.name
|
||||
@ -174,6 +199,8 @@ class Element(Node):
|
||||
indent += 2
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
if isinstance(name, tuple):
|
||||
name = "%s %s"%(name[0], name[1])
|
||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent)
|
||||
|
99
planet/vendor/html5lib/treebuilders/soup.py
vendored
99
planet/vendor/html5lib/treebuilders/soup.py
vendored
@ -1,6 +1,9 @@
|
||||
import warnings
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||
|
||||
import _base
|
||||
from html5lib.constants import namespaces, DataLossWarning
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
@ -22,18 +25,35 @@ class AttrList(object):
|
||||
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, element, soup):
|
||||
def __init__(self, element, soup, namespace):
|
||||
_base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def _nodeIndex(self, node, refNode):
|
||||
# Finds a node by identity rather than equality
|
||||
for index in range(len(self.element.contents)):
|
||||
if id(self.element.contents[index]) == id(refNode.element):
|
||||
return index
|
||||
return None
|
||||
|
||||
def appendChild(self, node):
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
newNode = TextNode(NavigableString(
|
||||
self.element.contents[-1]+node.element), self.soup)
|
||||
self.element.contents[-1].extract()
|
||||
self.appendChild(newNode)
|
||||
# Concatenate new text onto old text node
|
||||
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
|
||||
newStr = NavigableString(self.element.contents[-1]+node.element)
|
||||
|
||||
# Remove the old text node
|
||||
# (Can't simply use .extract() by itself, because it fails if
|
||||
# an equal text node exists within the parent node)
|
||||
oldElement = self.element.contents[-1]
|
||||
del self.element.contents[-1]
|
||||
oldElement.parent = None
|
||||
oldElement.extract()
|
||||
|
||||
self.element.insert(len(self.element.contents), newStr)
|
||||
else:
|
||||
self.element.insert(len(self.element.contents), node.element)
|
||||
node.parent = self
|
||||
@ -56,18 +76,25 @@ class Element(_base.Node):
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.contents.index(refNode.element)
|
||||
index = self._nodeIndex(node, refNode)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
newNode = TextNode(NavigableString(
|
||||
self.element.contents[index-1]+node.element), self.soup)
|
||||
self.element.contents[index-1].extract()
|
||||
self.insertBefore(newNode, refNode)
|
||||
# (See comments in appendChild)
|
||||
newStr = NavigableString(self.element.contents[index-1]+node.element)
|
||||
oldNode = self.element.contents[index-1]
|
||||
del self.element.contents[index-1]
|
||||
oldNode.parent = None
|
||||
oldNode.extract()
|
||||
|
||||
self.element.insert(index-1, newStr)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
index = self._nodeIndex(node.parent, node)
|
||||
del node.parent.element.contents[index]
|
||||
node.element.parent = None
|
||||
node.element.extract()
|
||||
node.parent = None
|
||||
|
||||
@ -76,12 +103,12 @@ class Element(_base.Node):
|
||||
child = self.element.contents[0]
|
||||
child.extract()
|
||||
if isinstance(child, Tag):
|
||||
newParent.appendChild(Element(child, self.soup))
|
||||
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
|
||||
else:
|
||||
newParent.appendChild(TextNode(child, self.soup))
|
||||
|
||||
def cloneNode(self):
|
||||
node = Element(Tag(self.soup, self.element.name), self.soup)
|
||||
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
@ -89,6 +116,14 @@ class Element(_base.Node):
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
_base.Node.__init__(self, None)
|
||||
@ -101,13 +136,25 @@ class TextNode(Element):
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
return Element(self.soup, self.soup)
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if publicId:
|
||||
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
||||
elif systemId:
|
||||
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
|
||||
(name, systemId)))
|
||||
else:
|
||||
self.soup.insert(0, Declaration(name))
|
||||
|
||||
def elementClass(self, name):
|
||||
return Element(Tag(self.soup, name), self.soup)
|
||||
def elementClass(self, name, namespace):
|
||||
if namespace not in (None, namespaces["html"]):
|
||||
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
|
||||
return Element(Tag(self.soup, name), self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
def fragmentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup)
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def appendChild(self, node):
|
||||
self.soup.insert(len(self.soup.contents), node.element)
|
||||
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def testSerializer(element):
|
||||
import re
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, Declaration):
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
|
||||
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
||||
m = re.compile(doctype_regexp).match(element.string)
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
name = m.group('name')
|
||||
publicId = m.group('publicId')
|
||||
if publicId is not None:
|
||||
systemId = m.group('systemId1') or ""
|
||||
else:
|
||||
systemId = m.group('systemId2')
|
||||
|
||||
if publicId is not None or systemId is not None:
|
||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
|
||||
(' '*indent, name, publicId or "", systemId or ""))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
|
||||
|
||||
elif isinstance(element, BeautifulSoup):
|
||||
if element.name == "[document_fragment]":
|
||||
rv.append("#document-fragment")
|
||||
|
34
planet/vendor/html5lib/treewalkers/_base.py
vendored
34
planet/vendor/html5lib/treewalkers/_base.py
vendored
@ -21,18 +21,24 @@ class TreeWalker(object):
|
||||
attrs = attrs.items()
|
||||
return [(unicode(name),unicode(value)) for name,value in attrs]
|
||||
|
||||
def emptyTag(self, name, attrs, hasChildren=False):
|
||||
yield {"type": "EmptyTag", "name": unicode(name), \
|
||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||
yield {"type": "EmptyTag", "name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
if hasChildren:
|
||||
yield self.error(_("Void element has children"))
|
||||
|
||||
def startTag(self, name, attrs):
|
||||
return {"type": "StartTag", "name": unicode(name), \
|
||||
def startTag(self, namespace, name, attrs):
|
||||
return {"type": "StartTag",
|
||||
"name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
|
||||
def endTag(self, name):
|
||||
return {"type": "EndTag", "name": unicode(name), "data": []}
|
||||
def endTag(self, namespace, name):
|
||||
return {"type": "EndTag",
|
||||
"name": unicode(name),
|
||||
"namespace":unicode(namespace),
|
||||
"data": []}
|
||||
|
||||
def text(self, data):
|
||||
data = unicode(data)
|
||||
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
|
||||
def walkChildren(self, node):
|
||||
raise NodeImplementedError
|
||||
|
||||
def element(self, node, name, attrs, hasChildren):
|
||||
def element(self, node, namespace, name, attrs, hasChildren):
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(name, attrs, hasChildren):
|
||||
for token in self.emptyTag(namespace, name, attrs, hasChildren):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(name, attrs)
|
||||
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
hasChildren = False
|
||||
endTag = None
|
||||
|
||||
if type == DOCTYPE:
|
||||
yield self.doctype(*details)
|
||||
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
yield token
|
||||
|
||||
elif type == ELEMENT:
|
||||
name, attributes, hasChildren = details
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(name, attributes, hasChildren):
|
||||
for token in self.emptyTag(namespace, name, attributes, hasChildren):
|
||||
yield token
|
||||
hasChildren = False
|
||||
else:
|
||||
yield self.startTag(name, attributes)
|
||||
endTag = name
|
||||
yield self.startTag(namespace, name, attributes)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(details[0])
|
||||
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
if type == ELEMENT:
|
||||
name, attributes, hasChildren = details
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name not in voidElements:
|
||||
yield self.endTag(name)
|
||||
yield self.endTag(namespace, name)
|
||||
nextSibling = self.getNextSibling(currentNode)
|
||||
if nextSibling is not None:
|
||||
currentNode = nextSibling
|
||||
|
3
planet/vendor/html5lib/treewalkers/dom.py
vendored
3
planet/vendor/html5lib/treewalkers/dom.py
vendored
@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
return _base.TEXT, node.nodeValue
|
||||
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
|
||||
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||
node.attributes.items(), node.hasChildNodes)
|
||||
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
return _base.COMMENT, node.nodeValue
|
||||
|
108
planet/vendor/html5lib/treewalkers/etree.py
vendored
108
planet/vendor/html5lib/treewalkers/etree.py
vendored
@ -3,10 +3,13 @@ _ = gettext.gettext
|
||||
|
||||
import new
|
||||
import copy
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getETreeModule(ElementTreeImplementation):
|
||||
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
|
||||
to avoid using recursion, returns "nodes" as tuples with the following
|
||||
content:
|
||||
|
||||
1. An Element node serving as *context* (it cannot be called the parent
|
||||
node due to the particular ``tail`` text nodes.
|
||||
1. The current element
|
||||
|
||||
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
|
||||
2. The index of the element relative to its parent
|
||||
|
||||
3. A list used as a stack of all ancestor *context nodes*. It is a
|
||||
pair tuple whose first item is an Element and second item is a child
|
||||
index.
|
||||
3. A stack of ancestor elements
|
||||
|
||||
4. A flag "text", "tail" or None to indicate if the current node is a
|
||||
text node; either the text or tail of the current element (1)
|
||||
"""
|
||||
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents = node
|
||||
if key in ("text", "tail"):
|
||||
return _base.TEXT, getattr(elt, key)
|
||||
elt, key, parents, flag = node
|
||||
if flag in ("text", "tail"):
|
||||
return _base.TEXT, getattr(elt, flag)
|
||||
else:
|
||||
node = elt[int(key)]
|
||||
node = elt
|
||||
|
||||
if not(hasattr(node, "tag")):
|
||||
node = node.getroot()
|
||||
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif node.tag == "<!DOCTYPE>":
|
||||
return _base.DOCTYPE, node.text
|
||||
return (_base.DOCTYPE, node.text,
|
||||
node.get("publicId"), node.get("systemId"))
|
||||
|
||||
elif type(node.tag) == type(ElementTree.Comment):
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
|
||||
match = tag_regexp.match(node.tag)
|
||||
if match:
|
||||
namespace, tag = match.groups()
|
||||
else:
|
||||
namespace = None
|
||||
tag = node.tag
|
||||
return (_base.ELEMENT, namespace, tag,
|
||||
node.attrib.items(), len(node) or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents = node
|
||||
assert key not in ("text", "tail"), "Text nodes have no children"
|
||||
parents.append((elt, int(key)))
|
||||
node = elt[int(key)]
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
parents = []
|
||||
element, key, parents, flag = node, None, [], None
|
||||
|
||||
assert len(node) or node.text, "Node has no children"
|
||||
if node.text:
|
||||
return (node, "text", parents)
|
||||
if flag in ("text", "tail"):
|
||||
return None
|
||||
else:
|
||||
return (node, 0, parents)
|
||||
if element.text:
|
||||
return element, key, parents, "text"
|
||||
elif len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getNextSibling(self, node):
|
||||
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
||||
|
||||
elt, key, parents = node
|
||||
if key == "text":
|
||||
key = -1
|
||||
elif key == "tail":
|
||||
elt, key = parents.pop()
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
# Look for "tail" of the "revisited" node
|
||||
child = elt[key]
|
||||
if child.tail:
|
||||
parents.append((elt, key))
|
||||
return (child, "tail", parents)
|
||||
return None
|
||||
|
||||
# case where key were "text" or "tail" or elt[key] had a tail
|
||||
key += 1
|
||||
if len(elt) > key:
|
||||
return (elt, key, parents)
|
||||
if flag == "text":
|
||||
if len(element):
|
||||
parents.append(element)
|
||||
return element[0], 0, parents, None
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
if element.tail and flag != "tail":
|
||||
return element, key, parents, "tail"
|
||||
elif key < len(parents[-1]) - 1:
|
||||
return parents[-1][key+1], key+1, parents, None
|
||||
else:
|
||||
return None
|
||||
|
||||
def getParentNode(self, node):
|
||||
assert isinstance(node, tuple)
|
||||
elt, key, parents = node
|
||||
if parents:
|
||||
elt, key = parents.pop()
|
||||
return elt, key, parents
|
||||
if isinstance(node, tuple):
|
||||
element, key, parents, flag = node
|
||||
else:
|
||||
# HACK: We could return ``elt`` but None will stop the algorithm the same way
|
||||
return None
|
||||
|
||||
if flag == "text":
|
||||
if not parents:
|
||||
return element
|
||||
else:
|
||||
return element, key, parents, None
|
||||
else:
|
||||
parent = parents.pop()
|
||||
if not parents:
|
||||
return parent
|
||||
else:
|
||||
return parent, list(parents[-1]).index(parent), parents, None
|
||||
|
||||
return locals()
|
||||
|
@ -1,4 +1,4 @@
|
||||
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
|
||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
|
||||
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
from genshi.output import NamespaceFlattener
|
||||
|
||||
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
|
||||
depth = 0
|
||||
ignore_until = None
|
||||
previous = None
|
||||
for event in NamespaceFlattener(prefixes={
|
||||
'http://www.w3.org/1999/xhtml': ''
|
||||
})(self.tree):
|
||||
for event in self.tree:
|
||||
if previous is not None:
|
||||
if previous[0] == START:
|
||||
depth += 1
|
||||
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
|
||||
kind, data, pos = event
|
||||
if kind == START:
|
||||
tag, attrib = data
|
||||
name = tag.localname
|
||||
namespace = tag.namespace
|
||||
if tag in voidElements:
|
||||
for token in self.emptyTag(tag, list(attrib), \
|
||||
not next or next[0] != END or next[1] != tag):
|
||||
for token in self.emptyTag(namespace, name, list(attrib),
|
||||
not next or next[0] != END
|
||||
or next[1] != tag):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(tag, list(attrib))
|
||||
yield self.startTag(namespace, name, list(attrib))
|
||||
|
||||
elif kind == END:
|
||||
if data not in voidElements:
|
||||
yield self.endTag(data)
|
||||
name = data.localname
|
||||
namespace = data.namespace
|
||||
if (namespace, name) not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif kind == COMMENT:
|
||||
yield self.comment(data)
|
||||
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
|
||||
elif kind == DOCTYPE:
|
||||
yield self.doctype(*data)
|
||||
|
||||
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
|
||||
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
|
||||
START_CDATA, END_CDATA, PI):
|
||||
pass
|
||||
|
||||
|
175
planet/vendor/html5lib/treewalkers/lxmletree.py
vendored
Normal file
175
planet/vendor/html5lib/treewalkers/lxmletree.py
vendored
Normal file
@ -0,0 +1,175 @@
|
||||
from lxml import etree
|
||||
from html5lib.treebuilders.etree import tag_regexp
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
from html5lib import ihatexml
|
||||
|
||||
class Root(object):
    """Stand-in document node wrapping an lxml-style ElementTree.

    ``children`` holds an optional Doctype followed by every top-level
    sibling of the root element, from the first one to the last.
    """

    def __init__(self, et):
        self.elementtree = et
        self.children = []
        docinfo = et.docinfo
        if docinfo.internalDTD:
            self.children.append(Doctype(self, docinfo.root_name,
                                         docinfo.public_id,
                                         docinfo.system_url))

        # Rewind from the root element to its first preceding sibling ...
        first = et.getroot()
        earlier = first.getprevious()
        while earlier is not None:
            first = earlier
            earlier = first.getprevious()

        # ... then collect every sibling going forwards.
        sibling = first
        while sibling is not None:
            self.children.append(sibling)
            sibling = sibling.getnext()

        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        # The document node has no sibling.
        return None

    def __len__(self):
        # Always 1, regardless of how many children were collected.
        return 1
|
||||
|
||||
class Doctype(object):
    """Plain record for a doctype, kept as a child of a root node."""

    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name = name
        self.public_id = public_id
        self.system_id = system_id

        # A doctype carries no text of its own.
        self.text = self.tail = None

    def getnext(self):
        # Returns the root node's child at index 1.
        siblings = self.root_node.children
        return siblings[1]
|
||||
|
||||
class FragmentRoot(Root):
    """Root node for a fragment given as a list of items.

    Root.__init__ is not called; each item is wrapped in a FragmentWrapper
    that points back at this root.
    """

    def __init__(self, children):
        wrapped = []
        for child in children:
            wrapped.append(FragmentWrapper(self, child))
        self.children = wrapped
        self.text = self.tail = None

    def getnext(self):
        return None
|
||||
|
||||
class FragmentWrapper(object):
    """Proxy for one fragment item that can find its next sibling.

    Attribute access not satisfied by the wrapper itself is forwarded to the
    wrapped object through __getattr__.
    """

    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj

        # Copy text/tail when the wrapped object has them; otherwise None.
        self.text = None
        if hasattr(self.obj, 'text'):
            self.text = self.obj.text
        self.tail = None
        if hasattr(self.obj, 'tail'):
            self.tail = self.obj.tail

        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        # Only reached for names not found on the wrapper itself.
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        following = siblings.index(self) + 1
        if following >= len(siblings):
            return None
        return siblings[following]

    def getparent(self):
        return None

    def __getitem__(self, key):
        return self.obj[key]

    def __len__(self):
        return len(self.obj)

    def __nonzero__(self):
        return bool(self.obj)

    def __str__(self):
        return str(self.obj)
|
||||
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive walker over lxml trees and fragment lists.

    Text is represented as a ``(node, "text")`` or ``(node, "tail")`` tuple;
    every other node is the object itself (Root, Doctype, FragmentWrapper or
    an lxml node).
    """

    def __init__(self, tree):
        # Objects with getroot() are wrapped in Root, lists in FragmentRoot;
        # anything else is passed through unchanged.
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        """Return a tuple whose first item is a _base node-type constant."""
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)

        if isinstance(node, Root):
            return (_base.DOCUMENT,)

        if isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

        if isinstance(node, FragmentWrapper) and node.isstring:
            return _base.TEXT, node

        if node.tag == etree.Comment:
            return _base.COMMENT, node.text

        #This is assumed to be an ordinary element
        match = tag_regexp.match(node.tag)
        if match:
            namespace, tag = match.groups()
        else:
            namespace = None
            tag = node.tag
        attributes = [(self.filter.fromXmlName(name), value) for
                      name,value in node.attrib.iteritems()]
        return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                attributes,
                len(node) > 0 or node.text)

    def getFirstChild(self, node):
        """Return the node's text tuple if it has text, else its first child."""
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        return node[0]

    def getNextSibling(self, node):
        """Return whatever follows node in document order at the same level."""
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                return None
            # tail
            return node.getnext()

        # A non-text node is followed by its tail text when it has any.
        if node.tail:
            return (node, "tail")
        return node.getnext()

    def getParentNode(self, node):
        """Return the node owning a text tuple, or the node's getparent()."""
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                return node
            # else: fallback to "normal" processing

        return node.getparent()
|
12
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
12
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
|
||||
type, node = event
|
||||
if type == START_ELEMENT:
|
||||
name = node.nodeName
|
||||
namespace = node.namespaceURI
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(name, \
|
||||
node.attributes.items(), not next or next[1] is not node):
|
||||
for token in self.emptyTag(namespace,
|
||||
name,
|
||||
node.attributes.items(),
|
||||
not next or next[1] is not node):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(name, node.attributes.items())
|
||||
yield self.startTag(namespace, name, node.attributes.items())
|
||||
|
||||
elif type == END_ELEMENT:
|
||||
name = node.nodeName
|
||||
namespace = node.namespaceURI
|
||||
if name not in voidElements:
|
||||
yield self.endTag(name)
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(node.nodeValue)
|
||||
|
@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
return _base.TEXT, node.value
|
||||
|
||||
elif node.type == 5: # Element
|
||||
return _base.ELEMENT, node.name, \
|
||||
node.attributes.items(), node.hasContent()
|
||||
return (_base.ELEMENT, node.namespace, node.name,
|
||||
node.attributes.items(), node.hasContent())
|
||||
|
||||
elif node.type == 6: # CommentNode
|
||||
return _base.COMMENT, node.data
|
||||
|
29
planet/vendor/html5lib/treewalkers/soup.py
vendored
29
planet/vendor/html5lib/treewalkers/soup.py
vendored
@ -1,3 +1,4 @@
|
||||
import re
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
doctype_regexp = re.compile(
|
||||
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif isinstance(node, Declaration): # DocumentType
|
||||
#Slice needed to remove markup added during unicode conversion
|
||||
return _base.DOCTYPE, unicode(node.string)[2:-1]
|
||||
string = unicode(node.string)
|
||||
#Slice needed to remove markup added during unicode conversion,
|
||||
#but only in some versions of BeautifulSoup/Python
|
||||
if string.startswith('<!') and string.endswith('>'):
|
||||
string = string[2:-1]
|
||||
m = self.doctype_regexp.match(string)
|
||||
#This regexp approach seems wrong and fragile
|
||||
#but beautiful soup stores the doctype as a single thing and we want the separate bits
|
||||
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
|
||||
#been modified at all
|
||||
#We could just feed to it a html5lib tokenizer, I guess...
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
name = m.group('name')
|
||||
publicId = m.group('publicId')
|
||||
if publicId is not None:
|
||||
systemId = m.group('systemId1')
|
||||
else:
|
||||
systemId = m.group('systemId2')
|
||||
return _base.DOCTYPE, name, publicId or "", systemId or ""
|
||||
|
||||
elif isinstance(node, Comment):
|
||||
return _base.COMMENT, unicode(node.string)[4:-3]
|
||||
string = unicode(node.string)
|
||||
if string.startswith('<!--') and string.endswith('-->'):
|
||||
string = string[4:-3]
|
||||
return _base.COMMENT, string
|
||||
|
||||
elif isinstance(node, unicode): # TextNode
|
||||
return _base.TEXT, node
|
||||
|
120
planet/vendor/html5lib/utils.py
vendored
120
planet/vendor/html5lib/utils.py
vendored
@ -34,3 +34,123 @@ class MethodDispatcher(dict):
|
||||
|
||||
def __getitem__(self, key):
|
||||
return dict.get(self, key, self.default)
|
||||
|
||||
#Pure python implementation of deque taken from the ASPN Python Cookbook
|
||||
#Original code by Raymond Hettinger
|
||||
|
||||
class deque(object):
    """Pure-Python double-ended queue.

    Items are stored in ``self.data``, a dict keyed by consecutive integers
    in the half-open range ``[self.left, self.right)``.  Appending on the
    right writes at ``right`` and increments it; appending on the left
    decrements ``left`` and writes there.

    ``maxsize`` bounds the length: when it is not ``-1`` and an append makes
    the deque longer than ``maxsize``, an item is dropped from the opposite
    end.
    """

    def __init__(self, iterable=(), maxsize=-1):
        """Create the deque and append every item of ``iterable``.

        Storage is only initialised when it does not exist yet, so calling
        ``__init__`` again on a live instance (as ``__setstate__`` and
        ``__deepcopy__`` do) appends to the existing contents.
        """
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)

    def append(self, x):
        """Add ``x`` on the right, dropping the leftmost item if over ``maxsize``."""
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()

    def appendleft(self, x):
        """Add ``x`` on the left, dropping the rightmost item if over ``maxsize``."""
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()

    def pop(self):
        """Remove and return the rightmost item; raise IndexError when empty."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem

    def popleft(self):
        """Remove and return the leftmost item; raise IndexError when empty."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem

    def clear(self):
        """Remove every item and reset both end markers to zero."""
        self.data.clear()
        self.left = self.right = 0

    def extend(self, iterable):
        """Append each item of ``iterable`` on the right, in order."""
        for elem in iterable:
            self.append(elem)

    def extendleft(self, iterable):
        """Append each item of ``iterable`` on the left (so the order is reversed)."""
        for elem in iterable:
            self.appendleft(elem)

    def rotate(self, n=1):
        """Rotate the deque ``n`` steps to the right; a no-op when empty."""
        if self:
            # Reduce to 0 <= n < len; negative counts wrap around.
            n %= len(self)
            for i in range(n):
                self.appendleft(self.pop())

    def __getitem__(self, i):
        """Return the item at position ``i`` (negative counts from the right)."""
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            raise IndexError

    def __setitem__(self, i, value):
        """Replace the item at position ``i``; raise IndexError when out of range."""
        size = len(self)
        if i < 0:
            i += size
        # Assigning into the dict never raises KeyError, so the bounds must
        # be checked explicitly; otherwise an out-of-range index would store
        # a stray key outside [left, right) instead of failing.
        if not (0 <= i < size):
            raise IndexError
        self.data[i + self.left] = value

    def __delitem__(self, i):
        """Delete the item at position ``i``; raise IndexError when out of range."""
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        # Shift every later item one slot to the left, then discard the
        # now-duplicated rightmost slot.
        for j in range(self.left + i, self.right - 1):
            data[j] = data[j + 1]
        self.pop()

    def __len__(self):
        """Return the number of stored items."""
        return self.right - self.left

    def __cmp__(self, other):
        """Order by type first, then by contents compared as lists."""
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))

    def __repr__(self, _track=[]):
        """Return ``deque([...])``, or ``'...'`` for a self-containing deque.

        ``_track`` is a deliberately shared list of the ids currently being
        formatted; it is what detects recursion.
        """
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        try:
            r = 'deque(%r)' % (list(self),)
        finally:
            # Always unregister, even if an item's repr raised, so a failure
            # does not make later reprs of this object print '...'.
            _track.remove(id(self))
        return r

    def __getstate__(self):
        """Return the pickled state: a 1-tuple holding the items as a tuple."""
        return (tuple(self),)

    def __setstate__(self, s):
        """Restore from the state produced by ``__getstate__``."""
        self.__init__(s[0])

    def __hash__(self):
        """Deques are mutable and therefore unhashable."""
        raise TypeError

    def __copy__(self):
        """Return a shallow copy built from this deque's items."""
        return self.__class__(self)

    def __deepcopy__(self, memo=None):
        """Return a deep copy, registering it in ``memo`` before copying items."""
        from copy import deepcopy
        # A fresh dict per call: a shared default would accumulate entries
        # keyed by ids of objects that may since have been freed.
        if memo is None:
            memo = {}
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result
|
@ -1,6 +1,6 @@
|
||||
<!--
|
||||
Description: illegal control character
|
||||
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
||||
Expect: content[0].value == u'Page 1 Page 2'
|
||||
-->
|
||||
|
||||
<feed xmns="http://www.w3.org/2005/Atom">
|
||||
|
Loading…
x
Reference in New Issue
Block a user