Update to the latest html5lib; replace feedparser's sanitizer with
html5lib's
This commit is contained in:
parent
63fa05e556
commit
6f0f23dd36
@ -16,7 +16,7 @@ Todo:
|
|||||||
import re, time, sgmllib
|
import re, time, sgmllib
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from xml.dom import minidom, Node
|
from xml.dom import minidom, Node
|
||||||
from html5lib import liberalxmlparser
|
from html5lib import html5parser
|
||||||
from html5lib.treebuilders import dom
|
from html5lib.treebuilders import dom
|
||||||
import planet, config
|
import planet, config
|
||||||
|
|
||||||
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
|
|||||||
bozo=1
|
bozo=1
|
||||||
|
|
||||||
if detail.type.find('xhtml')<0 or bozo:
|
if detail.type.find('xhtml')<0 or bozo:
|
||||||
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
|
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
|
||||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||||
for body in html.documentElement.childNodes:
|
for body in html.documentElement.childNodes:
|
||||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
@ -128,5 +128,11 @@ def scrub(feed_uri, data):
|
|||||||
node['value'] = feedparser._resolveRelativeURIs(
|
node['value'] = feedparser._resolveRelativeURIs(
|
||||||
node.value, node.base, 'utf-8', node.type)
|
node.value, node.base, 'utf-8', node.type)
|
||||||
|
|
||||||
node['value'] = feedparser._sanitizeHTML(
|
# Run this through HTML5's serializer
|
||||||
node.value, 'utf-8', node.type)
|
from html5lib import html5parser, sanitizer, treewalkers, serializer
|
||||||
|
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
|
||||||
|
doc = p.parseFragment(node.value, encoding='utf-8')
|
||||||
|
walker = treewalkers.getTreeWalker('simpletree')
|
||||||
|
xhtml = serializer.XHTMLSerializer()
|
||||||
|
tree = xhtml.serialize(walker(doc), encoding='utf-8')
|
||||||
|
node['value'] = ''.join([n for n in tree])
|
||||||
|
5
planet/vendor/html5lib/__init__.py
vendored
5
planet/vendor/html5lib/__init__.py
vendored
@ -11,5 +11,6 @@ f = open("my_document.html")
|
|||||||
p = html5lib.HTMLParser()
|
p = html5lib.HTMLParser()
|
||||||
tree = p.parse(f)
|
tree = p.parse(f)
|
||||||
"""
|
"""
|
||||||
from html5parser import HTMLParser
|
from html5parser import HTMLParser, parse
|
||||||
from liberalxmlparser import XMLParser, XHTMLParser
|
from treebuilders import getTreeBuilder
|
||||||
|
from serializer import serialize
|
||||||
|
928
planet/vendor/html5lib/constants.py
vendored
928
planet/vendor/html5lib/constants.py
vendored
@ -1,4 +1,5 @@
|
|||||||
import string
|
import string, gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
try:
|
try:
|
||||||
frozenset
|
frozenset
|
||||||
@ -9,6 +10,260 @@ except NameError:
|
|||||||
|
|
||||||
EOF = None
|
EOF = None
|
||||||
|
|
||||||
|
E = {
|
||||||
|
"null-character":
|
||||||
|
_(u"Null character in input stream, replaced with U+FFFD."),
|
||||||
|
"invalid-character":
|
||||||
|
_(u"Invalid codepoint in stream."),
|
||||||
|
"incorrectly-placed-solidus":
|
||||||
|
_(u"Solidus (/) incorrectly placed in tag."),
|
||||||
|
"incorrect-cr-newline-entity":
|
||||||
|
_(u"Incorrect CR newline entity, replaced with LF."),
|
||||||
|
"illegal-windows-1252-entity":
|
||||||
|
_(u"Entity used with illegal number (windows-1252 reference)."),
|
||||||
|
"cant-convert-numeric-entity":
|
||||||
|
_(u"Numeric entity couldn't be converted to character "
|
||||||
|
u"(codepoint U+%(charAsInt)08x)."),
|
||||||
|
"illegal-codepoint-for-numeric-entity":
|
||||||
|
_(u"Numeric entity represents an illegal codepoint: "
|
||||||
|
u"U+%(charAsInt)08x."),
|
||||||
|
"numeric-entity-without-semicolon":
|
||||||
|
_(u"Numeric entity didn't end with ';'."),
|
||||||
|
"expected-numeric-entity-but-got-eof":
|
||||||
|
_(u"Numeric entity expected. Got end of file instead."),
|
||||||
|
"expected-numeric-entity":
|
||||||
|
_(u"Numeric entity expected but none found."),
|
||||||
|
"named-entity-without-semicolon":
|
||||||
|
_(u"Named entity didn't end with ';'."),
|
||||||
|
"expected-named-entity":
|
||||||
|
_(u"Named entity expected. Got none."),
|
||||||
|
"attributes-in-end-tag":
|
||||||
|
_(u"End tag contains unexpected attributes."),
|
||||||
|
"expected-tag-name-but-got-right-bracket":
|
||||||
|
_(u"Expected tag name. Got '>' instead."),
|
||||||
|
"expected-tag-name-but-got-question-mark":
|
||||||
|
_(u"Expected tag name. Got '?' instead. (HTML doesn't "
|
||||||
|
u"support processing instructions.)"),
|
||||||
|
"expected-tag-name":
|
||||||
|
_(u"Expected tag name. Got something else instead"),
|
||||||
|
"expected-closing-tag-but-got-right-bracket":
|
||||||
|
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
|
||||||
|
"expected-closing-tag-but-got-eof":
|
||||||
|
_(u"Expected closing tag. Unexpected end of file."),
|
||||||
|
"expected-closing-tag-but-got-char":
|
||||||
|
_(u"Expected closing tag. Unexpected character '%(data)s' found."),
|
||||||
|
"eof-in-tag-name":
|
||||||
|
_(u"Unexpected end of file in the tag name."),
|
||||||
|
"expected-attribute-name-but-got-eof":
|
||||||
|
_(u"Unexpected end of file. Expected attribute name instead."),
|
||||||
|
"eof-in-attribute-name":
|
||||||
|
_(u"Unexpected end of file in attribute name."),
|
||||||
|
"invalid-character-in-attribute-name":
|
||||||
|
_(u"Invalid chracter in attribute name"),
|
||||||
|
"duplicate-attribute":
|
||||||
|
_(u"Dropped duplicate attribute on tag."),
|
||||||
|
"expected-end-of-tag-name-but-got-eof":
|
||||||
|
_(u"Unexpected end of file. Expected = or end of tag."),
|
||||||
|
"expected-attribute-value-but-got-eof":
|
||||||
|
_(u"Unexpected end of file. Expected attribute value."),
|
||||||
|
"expected-attribute-value-but-got-right-bracket":
|
||||||
|
_(u"Expected attribute value. Got '>' instead."),
|
||||||
|
"eof-in-attribute-value-double-quote":
|
||||||
|
_(u"Unexpected end of file in attribute value (\")."),
|
||||||
|
"eof-in-attribute-value-single-quote":
|
||||||
|
_(u"Unexpected end of file in attribute value (')."),
|
||||||
|
"eof-in-attribute-value-no-quotes":
|
||||||
|
_(u"Unexpected end of file in attribute value."),
|
||||||
|
"unexpected-EOF-after-solidus-in-tag":
|
||||||
|
_(u"Unexpected end of file in tag. Expected >"),
|
||||||
|
"unexpected-character-after-soldius-in-tag":
|
||||||
|
_(u"Unexpected character after / in tag. Expected >"),
|
||||||
|
"expected-dashes-or-doctype":
|
||||||
|
_(u"Expected '--' or 'DOCTYPE'. Not found."),
|
||||||
|
"incorrect-comment":
|
||||||
|
_(u"Incorrect comment."),
|
||||||
|
"eof-in-comment":
|
||||||
|
_(u"Unexpected end of file in comment."),
|
||||||
|
"eof-in-comment-end-dash":
|
||||||
|
_(u"Unexpected end of file in comment (-)"),
|
||||||
|
"unexpected-dash-after-double-dash-in-comment":
|
||||||
|
_(u"Unexpected '-' after '--' found in comment."),
|
||||||
|
"eof-in-comment-double-dash":
|
||||||
|
_(u"Unexpected end of file in comment (--)."),
|
||||||
|
"unexpected-char-in-comment":
|
||||||
|
_(u"Unexpected character in comment found."),
|
||||||
|
"need-space-after-doctype":
|
||||||
|
_(u"No space after literal string 'DOCTYPE'."),
|
||||||
|
"expected-doctype-name-but-got-right-bracket":
|
||||||
|
_(u"Unexpected > character. Expected DOCTYPE name."),
|
||||||
|
"expected-doctype-name-but-got-eof":
|
||||||
|
_(u"Unexpected end of file. Expected DOCTYPE name."),
|
||||||
|
"eof-in-doctype-name":
|
||||||
|
_(u"Unexpected end of file in DOCTYPE name."),
|
||||||
|
"eof-in-doctype":
|
||||||
|
_(u"Unexpected end of file in DOCTYPE."),
|
||||||
|
"expected-space-or-right-bracket-in-doctype":
|
||||||
|
_(u"Expected space or '>'. Got '%(data)s'"),
|
||||||
|
"unexpected-end-of-doctype":
|
||||||
|
_(u"Unexpected end of DOCTYPE."),
|
||||||
|
"unexpected-char-in-doctype":
|
||||||
|
_(u"Unexpected character in DOCTYPE."),
|
||||||
|
"eof-in-innerhtml":
|
||||||
|
_(u"XXX innerHTML EOF"),
|
||||||
|
"unexpected-doctype":
|
||||||
|
_(u"Unexpected DOCTYPE. Ignored."),
|
||||||
|
"non-html-root":
|
||||||
|
_(u"html needs to be the first start tag."),
|
||||||
|
"expected-doctype-but-got-eof":
|
||||||
|
_(u"Unexpected End of file. Expected DOCTYPE."),
|
||||||
|
"unknown-doctype":
|
||||||
|
_(u"Erroneous DOCTYPE."),
|
||||||
|
"expected-doctype-but-got-chars":
|
||||||
|
_(u"Unexpected non-space characters. Expected DOCTYPE."),
|
||||||
|
"expected-doctype-but-got-start-tag":
|
||||||
|
_(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
|
||||||
|
"expected-doctype-but-got-end-tag":
|
||||||
|
_(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
|
||||||
|
"end-tag-after-implied-root":
|
||||||
|
_(u"Unexpected end tag (%(name)s) after the (implied) root element."),
|
||||||
|
"expected-named-closing-tag-but-got-eof":
|
||||||
|
_(u"Unexpected end of file. Expected end tag (%(name)s)."),
|
||||||
|
"two-heads-are-not-better-than-one":
|
||||||
|
_(u"Unexpected start tag head in existing head. Ignored."),
|
||||||
|
"unexpected-end-tag":
|
||||||
|
_(u"Unexpected end tag (%(name)s). Ignored."),
|
||||||
|
"unexpected-start-tag-out-of-my-head":
|
||||||
|
_(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
|
||||||
|
"unexpected-start-tag":
|
||||||
|
_(u"Unexpected start tag (%(name)s)."),
|
||||||
|
"missing-end-tag":
|
||||||
|
_(u"Missing end tag (%(name)s)."),
|
||||||
|
"missing-end-tags":
|
||||||
|
_(u"Missing end tags (%(name)s)."),
|
||||||
|
"unexpected-start-tag-implies-end-tag":
|
||||||
|
_(u"Unexpected start tag (%(startName)s) "
|
||||||
|
u"implies end tag (%(endName)s)."),
|
||||||
|
"unexpected-start-tag-treated-as":
|
||||||
|
_(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
|
||||||
|
"deprecated-tag":
|
||||||
|
_(u"Unexpected start tag %(name)s. Don't use it!"),
|
||||||
|
"unexpected-start-tag-ignored":
|
||||||
|
_(u"Unexpected start tag %(name)s. Ignored."),
|
||||||
|
"expected-one-end-tag-but-got-another":
|
||||||
|
_(u"Unexpected end tag (%(gotName)s). "
|
||||||
|
u"Missing end tag (%(expectedName)s)."),
|
||||||
|
"end-tag-too-early":
|
||||||
|
_(u"End tag (%(name)s) seen too early. Expected other end tag."),
|
||||||
|
"end-tag-too-early-named":
|
||||||
|
_(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
|
||||||
|
"end-tag-too-early-ignored":
|
||||||
|
_(u"End tag (%(name)s) seen too early. Ignored."),
|
||||||
|
"adoption-agency-1.1":
|
||||||
|
_(u"End tag (%(name)s) violates step 1, "
|
||||||
|
u"paragraph 1 of the adoption agency algorithm."),
|
||||||
|
"adoption-agency-1.2":
|
||||||
|
_(u"End tag (%(name)s) violates step 1, "
|
||||||
|
u"paragraph 2 of the adoption agency algorithm."),
|
||||||
|
"adoption-agency-1.3":
|
||||||
|
_(u"End tag (%(name)s) violates step 1, "
|
||||||
|
u"paragraph 3 of the adoption agency algorithm."),
|
||||||
|
"unexpected-end-tag-treated-as":
|
||||||
|
_(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
|
||||||
|
"no-end-tag":
|
||||||
|
_(u"This element (%(name)s) has no end tag."),
|
||||||
|
"unexpected-implied-end-tag-in-table":
|
||||||
|
_(u"Unexpected implied end tag (%(name)s) in the table phase."),
|
||||||
|
"unexpected-implied-end-tag-in-table-body":
|
||||||
|
_(u"Unexpected implied end tag (%(name)s) in the table body phase."),
|
||||||
|
"unexpected-char-implies-table-voodoo":
|
||||||
|
_(u"Unexpected non-space characters in "
|
||||||
|
u"table context caused voodoo mode."),
|
||||||
|
"unexpected-hidden-input-in-table":
|
||||||
|
_(u"Unexpected input with type hidden in table context."),
|
||||||
|
"unexpected-start-tag-implies-table-voodoo":
|
||||||
|
_(u"Unexpected start tag (%(name)s) in "
|
||||||
|
u"table context caused voodoo mode."),
|
||||||
|
"unexpected-end-tag-implies-table-voodoo":
|
||||||
|
_(u"Unexpected end tag (%(name)s) in "
|
||||||
|
u"table context caused voodoo mode."),
|
||||||
|
"unexpected-cell-in-table-body":
|
||||||
|
_(u"Unexpected table cell start tag (%(name)s) "
|
||||||
|
u"in the table body phase."),
|
||||||
|
"unexpected-cell-end-tag":
|
||||||
|
_(u"Got table cell end tag (%(name)s) "
|
||||||
|
u"while required end tags are missing."),
|
||||||
|
"unexpected-end-tag-in-table-body":
|
||||||
|
_(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
|
||||||
|
"unexpected-implied-end-tag-in-table-row":
|
||||||
|
_(u"Unexpected implied end tag (%(name)s) in the table row phase."),
|
||||||
|
"unexpected-end-tag-in-table-row":
|
||||||
|
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
|
||||||
|
"unexpected-select-in-select":
|
||||||
|
_(u"Unexpected select start tag in the select phase "
|
||||||
|
u"treated as select end tag."),
|
||||||
|
"unexpected-input-in-select":
|
||||||
|
_(u"Unexpected input start tag in the select phase."),
|
||||||
|
"unexpected-start-tag-in-select":
|
||||||
|
_(u"Unexpected start tag token (%(name)s in the select phase. "
|
||||||
|
u"Ignored."),
|
||||||
|
"unexpected-end-tag-in-select":
|
||||||
|
_(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
|
||||||
|
"unexpected-table-element-start-tag-in-select-in-table":
|
||||||
|
_(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
|
||||||
|
"unexpected-table-element-end-tag-in-select-in-table":
|
||||||
|
_(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
|
||||||
|
"unexpected-char-after-body":
|
||||||
|
_(u"Unexpected non-space characters in the after body phase."),
|
||||||
|
"unexpected-start-tag-after-body":
|
||||||
|
_(u"Unexpected start tag token (%(name)s)"
|
||||||
|
u" in the after body phase."),
|
||||||
|
"unexpected-end-tag-after-body":
|
||||||
|
_(u"Unexpected end tag token (%(name)s)"
|
||||||
|
u" in the after body phase."),
|
||||||
|
"unexpected-char-in-frameset":
|
||||||
|
_(u"Unepxected characters in the frameset phase. Characters ignored."),
|
||||||
|
"unexpected-start-tag-in-frameset":
|
||||||
|
_(u"Unexpected start tag token (%(name)s)"
|
||||||
|
u" in the frameset phase. Ignored."),
|
||||||
|
"unexpected-frameset-in-frameset-innerhtml":
|
||||||
|
_(u"Unexpected end tag token (frameset) "
|
||||||
|
u"in the frameset phase (innerHTML)."),
|
||||||
|
"unexpected-end-tag-in-frameset":
|
||||||
|
_(u"Unexpected end tag token (%(name)s)"
|
||||||
|
u" in the frameset phase. Ignored."),
|
||||||
|
"unexpected-char-after-frameset":
|
||||||
|
_(u"Unexpected non-space characters in the "
|
||||||
|
u"after frameset phase. Ignored."),
|
||||||
|
"unexpected-start-tag-after-frameset":
|
||||||
|
_(u"Unexpected start tag (%(name)s)"
|
||||||
|
u" in the after frameset phase. Ignored."),
|
||||||
|
"unexpected-end-tag-after-frameset":
|
||||||
|
_(u"Unexpected end tag (%(name)s)"
|
||||||
|
u" in the after frameset phase. Ignored."),
|
||||||
|
"unexpected-end-tag-after-body-innerhtml":
|
||||||
|
_(u"Unexpected end tag after body(innerHtml)"),
|
||||||
|
"expected-eof-but-got-char":
|
||||||
|
_(u"Unexpected non-space characters. Expected end of file."),
|
||||||
|
"expected-eof-but-got-start-tag":
|
||||||
|
_(u"Unexpected start tag (%(name)s)"
|
||||||
|
u". Expected end of file."),
|
||||||
|
"expected-eof-but-got-end-tag":
|
||||||
|
_(u"Unexpected end tag (%(name)s)"
|
||||||
|
u". Expected end of file."),
|
||||||
|
"eof-in-table":
|
||||||
|
_(u"Unexpected end of file. Expected table content."),
|
||||||
|
"eof-in-select":
|
||||||
|
_(u"Unexpected end of file. Expected select content."),
|
||||||
|
"eof-in-frameset":
|
||||||
|
_(u"Unexpected end of file. Expected frameset content."),
|
||||||
|
"non-void-element-with-trailing-solidus":
|
||||||
|
_(u"Trailing solidus not allowed on element %(name)s"),
|
||||||
|
"unexpected-html-element-in-foreign-content":
|
||||||
|
_(u"Element %(name)s not allowed in a non-html context"),
|
||||||
|
"XXX-undefined-error":
|
||||||
|
(u"Undefined error (this sucks and should be fixed)"),
|
||||||
|
}
|
||||||
|
|
||||||
contentModelFlags = {
|
contentModelFlags = {
|
||||||
"PCDATA":0,
|
"PCDATA":0,
|
||||||
"RCDATA":1,
|
"RCDATA":1,
|
||||||
@ -16,101 +271,126 @@ contentModelFlags = {
|
|||||||
"PLAINTEXT":3
|
"PLAINTEXT":3
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespaces = {
|
||||||
|
"html":"http://www.w3.org/1999/xhtml",
|
||||||
|
"mathml":"http://www.w3.org/1998/Math/MathML",
|
||||||
|
"svg":"http://www.w3.org/2000/svg",
|
||||||
|
"xlink":"http://www.w3.org/1999/xlink",
|
||||||
|
"xml":"http://www.w3.org/XML/1998/namespace",
|
||||||
|
"xmlns":"http://www.w3.org/2000/xmlns/"
|
||||||
|
}
|
||||||
|
|
||||||
scopingElements = frozenset((
|
scopingElements = frozenset((
|
||||||
"button",
|
(namespaces["html"], "applet"),
|
||||||
"caption",
|
(namespaces["html"], "button"),
|
||||||
"html",
|
(namespaces["html"], "caption"),
|
||||||
"marquee",
|
(namespaces["html"], "html"),
|
||||||
"object",
|
(namespaces["html"], "marquee"),
|
||||||
"table",
|
(namespaces["html"], "object"),
|
||||||
"td",
|
(namespaces["html"], "table"),
|
||||||
"th"
|
(namespaces["html"], "td"),
|
||||||
|
(namespaces["html"], "th"),
|
||||||
|
(namespaces["svg"], "foreignObject")
|
||||||
))
|
))
|
||||||
|
|
||||||
formattingElements = frozenset((
|
formattingElements = frozenset((
|
||||||
"a",
|
(namespaces["html"], "a"),
|
||||||
"b",
|
(namespaces["html"], "b"),
|
||||||
"big",
|
(namespaces["html"], "big"),
|
||||||
"em",
|
(namespaces["html"], "code"),
|
||||||
"font",
|
(namespaces["html"], "em"),
|
||||||
"i",
|
(namespaces["html"], "font"),
|
||||||
"nobr",
|
(namespaces["html"], "i"),
|
||||||
"s",
|
(namespaces["html"], "nobr"),
|
||||||
"small",
|
(namespaces["html"], "s"),
|
||||||
"strike",
|
(namespaces["html"], "small"),
|
||||||
"strong",
|
(namespaces["html"], "strike"),
|
||||||
"tt",
|
(namespaces["html"], "strong"),
|
||||||
"u"
|
(namespaces["html"], "tt"),
|
||||||
|
(namespaces["html"], "u")
|
||||||
))
|
))
|
||||||
|
|
||||||
specialElements = frozenset((
|
specialElements = frozenset((
|
||||||
"address",
|
(namespaces["html"], "address"),
|
||||||
"area",
|
(namespaces["html"], "area"),
|
||||||
"base",
|
(namespaces["html"], "article"),
|
||||||
"basefont",
|
(namespaces["html"], "aside"),
|
||||||
"bgsound",
|
(namespaces["html"], "base"),
|
||||||
"blockquote",
|
(namespaces["html"], "basefont"),
|
||||||
"body",
|
(namespaces["html"], "bgsound"),
|
||||||
"br",
|
(namespaces["html"], "blockquote"),
|
||||||
"center",
|
(namespaces["html"], "body"),
|
||||||
"col",
|
(namespaces["html"], "br"),
|
||||||
"colgroup",
|
(namespaces["html"], "center"),
|
||||||
"dd",
|
(namespaces["html"], "col"),
|
||||||
"dir",
|
(namespaces["html"], "colgroup"),
|
||||||
"div",
|
(namespaces["html"], "command"),
|
||||||
"dl",
|
(namespaces["html"], "datagrid"),
|
||||||
"dt",
|
(namespaces["html"], "dd"),
|
||||||
"embed",
|
(namespaces["html"], "details"),
|
||||||
"fieldset",
|
(namespaces["html"], "dialog"),
|
||||||
"form",
|
(namespaces["html"], "dir"),
|
||||||
"frame",
|
(namespaces["html"], "div"),
|
||||||
"frameset",
|
(namespaces["html"], "dl"),
|
||||||
"h1",
|
(namespaces["html"], "dt"),
|
||||||
"h2",
|
(namespaces["html"], "embed"),
|
||||||
"h3",
|
(namespaces["html"], "event-source"),
|
||||||
"h4",
|
(namespaces["html"], "fieldset"),
|
||||||
"h5",
|
(namespaces["html"], "figure"),
|
||||||
"h6",
|
(namespaces["html"], "footer"),
|
||||||
"head",
|
(namespaces["html"], "form"),
|
||||||
"hr",
|
(namespaces["html"], "frame"),
|
||||||
"iframe",
|
(namespaces["html"], "frameset"),
|
||||||
"image",
|
(namespaces["html"], "h1"),
|
||||||
"img",
|
(namespaces["html"], "h2"),
|
||||||
"input",
|
(namespaces["html"], "h3"),
|
||||||
"isindex",
|
(namespaces["html"], "h4"),
|
||||||
"li",
|
(namespaces["html"], "h5"),
|
||||||
"link",
|
(namespaces["html"], "h6"),
|
||||||
"listing",
|
(namespaces["html"], "head"),
|
||||||
"menu",
|
(namespaces["html"], "header"),
|
||||||
"meta",
|
(namespaces["html"], "hr"),
|
||||||
"noembed",
|
(namespaces["html"], "iframe"),
|
||||||
"noframes",
|
# Note that image is commented out in the spec as "this isn't an
|
||||||
"noscript",
|
# element that can end up on the stack, so it doesn't matter,"
|
||||||
"ol",
|
(namespaces["html"], "image"),
|
||||||
"optgroup",
|
(namespaces["html"], "img"),
|
||||||
"option",
|
(namespaces["html"], "input"),
|
||||||
"p",
|
(namespaces["html"], "isindex"),
|
||||||
"param",
|
(namespaces["html"], "li"),
|
||||||
"plaintext",
|
(namespaces["html"], "link"),
|
||||||
"pre",
|
(namespaces["html"], "listing"),
|
||||||
"script",
|
(namespaces["html"], "menu"),
|
||||||
"select",
|
(namespaces["html"], "meta"),
|
||||||
"spacer",
|
(namespaces["html"], "nav"),
|
||||||
"style",
|
(namespaces["html"], "noembed"),
|
||||||
"tbody",
|
(namespaces["html"], "noframes"),
|
||||||
"textarea",
|
(namespaces["html"], "noscript"),
|
||||||
"tfoot",
|
(namespaces["html"], "ol"),
|
||||||
"thead",
|
(namespaces["html"], "optgroup"),
|
||||||
"title",
|
(namespaces["html"], "option"),
|
||||||
"tr",
|
(namespaces["html"], "p"),
|
||||||
"ul",
|
(namespaces["html"], "param"),
|
||||||
"wbr"
|
(namespaces["html"], "plaintext"),
|
||||||
|
(namespaces["html"], "pre"),
|
||||||
|
(namespaces["html"], "script"),
|
||||||
|
(namespaces["html"], "section"),
|
||||||
|
(namespaces["html"], "select"),
|
||||||
|
(namespaces["html"], "spacer"),
|
||||||
|
(namespaces["html"], "style"),
|
||||||
|
(namespaces["html"], "tbody"),
|
||||||
|
(namespaces["html"], "textarea"),
|
||||||
|
(namespaces["html"], "tfoot"),
|
||||||
|
(namespaces["html"], "thead"),
|
||||||
|
(namespaces["html"], "title"),
|
||||||
|
(namespaces["html"], "tr"),
|
||||||
|
(namespaces["html"], "ul"),
|
||||||
|
(namespaces["html"], "wbr")
|
||||||
))
|
))
|
||||||
|
|
||||||
spaceCharacters = frozenset((
|
spaceCharacters = frozenset((
|
||||||
u"\t",
|
u"\t",
|
||||||
u"\n",
|
u"\n",
|
||||||
u"\u000B",
|
|
||||||
u"\u000C",
|
u"\u000C",
|
||||||
u" ",
|
u" ",
|
||||||
u"\r"
|
u"\r"
|
||||||
@ -143,9 +423,10 @@ headingElements = (
|
|||||||
"h6"
|
"h6"
|
||||||
)
|
)
|
||||||
|
|
||||||
# XXX What about event-source and command?
|
|
||||||
voidElements = frozenset((
|
voidElements = frozenset((
|
||||||
"base",
|
"base",
|
||||||
|
"command",
|
||||||
|
"event-source",
|
||||||
"link",
|
"link",
|
||||||
"meta",
|
"meta",
|
||||||
"hr",
|
"hr",
|
||||||
@ -155,7 +436,8 @@ voidElements = frozenset((
|
|||||||
"param",
|
"param",
|
||||||
"area",
|
"area",
|
||||||
"col",
|
"col",
|
||||||
"input"
|
"input",
|
||||||
|
"source"
|
||||||
))
|
))
|
||||||
|
|
||||||
cdataElements = frozenset(('title', 'textarea'))
|
cdataElements = frozenset(('title', 'textarea'))
|
||||||
@ -440,7 +722,7 @@ entities = {
|
|||||||
"kappa;": u"\u03BA",
|
"kappa;": u"\u03BA",
|
||||||
"lArr;": u"\u21D0",
|
"lArr;": u"\u21D0",
|
||||||
"lambda;": u"\u03BB",
|
"lambda;": u"\u03BB",
|
||||||
"lang;": u"\u3008",
|
"lang;": u"\u27E8",
|
||||||
"laquo;": u"\u00AB",
|
"laquo;": u"\u00AB",
|
||||||
"laquo": u"\u00AB",
|
"laquo": u"\u00AB",
|
||||||
"larr;": u"\u2190",
|
"larr;": u"\u2190",
|
||||||
@ -520,7 +802,7 @@ entities = {
|
|||||||
"quot": u"\u0022",
|
"quot": u"\u0022",
|
||||||
"rArr;": u"\u21D2",
|
"rArr;": u"\u21D2",
|
||||||
"radic;": u"\u221A",
|
"radic;": u"\u221A",
|
||||||
"rang;": u"\u3009",
|
"rang;": u"\u27E9",
|
||||||
"raquo;": u"\u00BB",
|
"raquo;": u"\u00BB",
|
||||||
"raquo": u"\u00BB",
|
"raquo": u"\u00BB",
|
||||||
"rarr;": u"\u2192",
|
"rarr;": u"\u2192",
|
||||||
@ -596,221 +878,255 @@ entities = {
|
|||||||
"zwnj;": u"\u200C"
|
"zwnj;": u"\u200C"
|
||||||
}
|
}
|
||||||
|
|
||||||
encodings = frozenset((
|
encodings = {
|
||||||
"ansi_x3.4-1968",
|
'437': 'cp437',
|
||||||
"iso-ir-6",
|
'850': 'cp850',
|
||||||
"ansi_x3.4-1986",
|
'852': 'cp852',
|
||||||
"iso_646.irv:1991",
|
'855': 'cp855',
|
||||||
"ascii",
|
'857': 'cp857',
|
||||||
"iso646-us",
|
'860': 'cp860',
|
||||||
"us-ascii",
|
'861': 'cp861',
|
||||||
"us",
|
'862': 'cp862',
|
||||||
"ibm367",
|
'863': 'cp863',
|
||||||
"cp367",
|
'865': 'cp865',
|
||||||
"csascii",
|
'866': 'cp866',
|
||||||
"ks_c_5601-1987",
|
'869': 'cp869',
|
||||||
"korean",
|
'ansix341968': 'ascii',
|
||||||
"iso-2022-kr",
|
'ansix341986': 'ascii',
|
||||||
"csiso2022kr",
|
'arabic': 'iso8859-6',
|
||||||
"euc-kr",
|
'ascii': 'ascii',
|
||||||
"iso-2022-jp",
|
'asmo708': 'iso8859-6',
|
||||||
"csiso2022jp",
|
'big5': 'big5',
|
||||||
"iso-2022-jp-2",
|
'big5hkscs': 'big5hkscs',
|
||||||
"iso-ir-58",
|
'chinese': 'gbk',
|
||||||
"chinese",
|
'cp037': 'cp037',
|
||||||
"csiso58gb231280",
|
'cp1026': 'cp1026',
|
||||||
"iso_8859-1:1987",
|
'cp154': 'ptcp154',
|
||||||
"iso-ir-100",
|
'cp367': 'ascii',
|
||||||
"iso_8859-1",
|
'cp424': 'cp424',
|
||||||
"iso-8859-1",
|
'cp437': 'cp437',
|
||||||
"latin1",
|
'cp500': 'cp500',
|
||||||
"l1",
|
'cp775': 'cp775',
|
||||||
"ibm819",
|
'cp819': 'windows-1252',
|
||||||
"cp819",
|
'cp850': 'cp850',
|
||||||
"csisolatin1",
|
'cp852': 'cp852',
|
||||||
"iso_8859-2:1987",
|
'cp855': 'cp855',
|
||||||
"iso-ir-101",
|
'cp857': 'cp857',
|
||||||
"iso_8859-2",
|
'cp860': 'cp860',
|
||||||
"iso-8859-2",
|
'cp861': 'cp861',
|
||||||
"latin2",
|
'cp862': 'cp862',
|
||||||
"l2",
|
'cp863': 'cp863',
|
||||||
"csisolatin2",
|
'cp864': 'cp864',
|
||||||
"iso_8859-3:1988",
|
'cp865': 'cp865',
|
||||||
"iso-ir-109",
|
'cp866': 'cp866',
|
||||||
"iso_8859-3",
|
'cp869': 'cp869',
|
||||||
"iso-8859-3",
|
'cp936': 'gbk',
|
||||||
"latin3",
|
'cpgr': 'cp869',
|
||||||
"l3",
|
'cpis': 'cp861',
|
||||||
"csisolatin3",
|
'csascii': 'ascii',
|
||||||
"iso_8859-4:1988",
|
'csbig5': 'big5',
|
||||||
"iso-ir-110",
|
'cseuckr': 'cp949',
|
||||||
"iso_8859-4",
|
'cseucpkdfmtjapanese': 'euc_jp',
|
||||||
"iso-8859-4",
|
'csgb2312': 'gbk',
|
||||||
"latin4",
|
'cshproman8': 'hp-roman8',
|
||||||
"l4",
|
'csibm037': 'cp037',
|
||||||
"csisolatin4",
|
'csibm1026': 'cp1026',
|
||||||
"iso_8859-6:1987",
|
'csibm424': 'cp424',
|
||||||
"iso-ir-127",
|
'csibm500': 'cp500',
|
||||||
"iso_8859-6",
|
'csibm855': 'cp855',
|
||||||
"iso-8859-6",
|
'csibm857': 'cp857',
|
||||||
"ecma-114",
|
'csibm860': 'cp860',
|
||||||
"asmo-708",
|
'csibm861': 'cp861',
|
||||||
"arabic",
|
'csibm863': 'cp863',
|
||||||
"csisolatinarabic",
|
'csibm864': 'cp864',
|
||||||
"iso_8859-7:1987",
|
'csibm865': 'cp865',
|
||||||
"iso-ir-126",
|
'csibm866': 'cp866',
|
||||||
"iso_8859-7",
|
'csibm869': 'cp869',
|
||||||
"iso-8859-7",
|
'csiso2022jp': 'iso2022_jp',
|
||||||
"elot_928",
|
'csiso2022jp2': 'iso2022_jp_2',
|
||||||
"ecma-118",
|
'csiso2022kr': 'iso2022_kr',
|
||||||
"greek",
|
'csiso58gb231280': 'gbk',
|
||||||
"greek8",
|
'csisolatin1': 'windows-1252',
|
||||||
"csisolatingreek",
|
'csisolatin2': 'iso8859-2',
|
||||||
"iso_8859-8:1988",
|
'csisolatin3': 'iso8859-3',
|
||||||
"iso-ir-138",
|
'csisolatin4': 'iso8859-4',
|
||||||
"iso_8859-8",
|
'csisolatin5': 'windows-1254',
|
||||||
"iso-8859-8",
|
'csisolatin6': 'iso8859-10',
|
||||||
"hebrew",
|
'csisolatinarabic': 'iso8859-6',
|
||||||
"csisolatinhebrew",
|
'csisolatincyrillic': 'iso8859-5',
|
||||||
"iso_8859-5:1988",
|
'csisolatingreek': 'iso8859-7',
|
||||||
"iso-ir-144",
|
'csisolatinhebrew': 'iso8859-8',
|
||||||
"iso_8859-5",
|
'cskoi8r': 'koi8-r',
|
||||||
"iso-8859-5",
|
'csksc56011987': 'cp949',
|
||||||
"cyrillic",
|
'cspc775baltic': 'cp775',
|
||||||
"csisolatincyrillic",
|
'cspc850multilingual': 'cp850',
|
||||||
"iso_8859-9:1989",
|
'cspc862latinhebrew': 'cp862',
|
||||||
"iso-ir-148",
|
'cspc8codepage437': 'cp437',
|
||||||
"iso_8859-9",
|
'cspcp852': 'cp852',
|
||||||
"iso-8859-9",
|
'csptcp154': 'ptcp154',
|
||||||
"latin5",
|
'csshiftjis': 'shift_jis',
|
||||||
"l5",
|
'csunicode11utf7': 'utf-7',
|
||||||
"csisolatin5",
|
'cyrillic': 'iso8859-5',
|
||||||
"iso-8859-10",
|
'cyrillicasian': 'ptcp154',
|
||||||
"iso-ir-157",
|
'ebcdiccpbe': 'cp500',
|
||||||
"l6",
|
'ebcdiccpca': 'cp037',
|
||||||
"iso_8859-10:1992",
|
'ebcdiccpch': 'cp500',
|
||||||
"csisolatin6",
|
'ebcdiccphe': 'cp424',
|
||||||
"latin6",
|
'ebcdiccpnl': 'cp037',
|
||||||
"hp-roman8",
|
'ebcdiccpus': 'cp037',
|
||||||
"roman8",
|
'ebcdiccpwt': 'cp037',
|
||||||
"r8",
|
'ecma114': 'iso8859-6',
|
||||||
"ibm037",
|
'ecma118': 'iso8859-7',
|
||||||
"cp037",
|
'elot928': 'iso8859-7',
|
||||||
"csibm037",
|
'eucjp': 'euc_jp',
|
||||||
"ibm424",
|
'euckr': 'cp949',
|
||||||
"cp424",
|
'extendedunixcodepackedformatforjapanese': 'euc_jp',
|
||||||
"csibm424",
|
'gb18030': 'gb18030',
|
||||||
"ibm437",
|
'gb2312': 'gbk',
|
||||||
"cp437",
|
'gb231280': 'gbk',
|
||||||
"437",
|
'gbk': 'gbk',
|
||||||
"cspc8codepage437",
|
'greek': 'iso8859-7',
|
||||||
"ibm500",
|
'greek8': 'iso8859-7',
|
||||||
"cp500",
|
'hebrew': 'iso8859-8',
|
||||||
"csibm500",
|
'hproman8': 'hp-roman8',
|
||||||
"ibm775",
|
'hzgb2312': 'hz',
|
||||||
"cp775",
|
'ibm037': 'cp037',
|
||||||
"cspc775baltic",
|
'ibm1026': 'cp1026',
|
||||||
"ibm850",
|
'ibm367': 'ascii',
|
||||||
"cp850",
|
'ibm424': 'cp424',
|
||||||
"850",
|
'ibm437': 'cp437',
|
||||||
"cspc850multilingual",
|
'ibm500': 'cp500',
|
||||||
"ibm852",
|
'ibm775': 'cp775',
|
||||||
"cp852",
|
'ibm819': 'windows-1252',
|
||||||
"852",
|
'ibm850': 'cp850',
|
||||||
"cspcp852",
|
'ibm852': 'cp852',
|
||||||
"ibm855",
|
'ibm855': 'cp855',
|
||||||
"cp855",
|
'ibm857': 'cp857',
|
||||||
"855",
|
'ibm860': 'cp860',
|
||||||
"csibm855",
|
'ibm861': 'cp861',
|
||||||
"ibm857",
|
'ibm862': 'cp862',
|
||||||
"cp857",
|
'ibm863': 'cp863',
|
||||||
"857",
|
'ibm864': 'cp864',
|
||||||
"csibm857",
|
'ibm865': 'cp865',
|
||||||
"ibm860",
|
'ibm866': 'cp866',
|
||||||
"cp860",
|
'ibm869': 'cp869',
|
||||||
"860",
|
'iso2022jp': 'iso2022_jp',
|
||||||
"csibm860",
|
'iso2022jp2': 'iso2022_jp_2',
|
||||||
"ibm861",
|
'iso2022kr': 'iso2022_kr',
|
||||||
"cp861",
|
'iso646irv1991': 'ascii',
|
||||||
"861",
|
'iso646us': 'ascii',
|
||||||
"cp-is",
|
'iso88591': 'windows-1252',
|
||||||
"csibm861",
|
'iso885910': 'iso8859-10',
|
||||||
"ibm862",
|
'iso8859101992': 'iso8859-10',
|
||||||
"cp862",
|
'iso885911987': 'windows-1252',
|
||||||
"862",
|
'iso885913': 'iso8859-13',
|
||||||
"cspc862latinhebrew",
|
'iso885914': 'iso8859-14',
|
||||||
"ibm863",
|
'iso8859141998': 'iso8859-14',
|
||||||
"cp863",
|
'iso885915': 'iso8859-15',
|
||||||
"863",
|
'iso885916': 'iso8859-16',
|
||||||
"csibm863",
|
'iso8859162001': 'iso8859-16',
|
||||||
"ibm864",
|
'iso88592': 'iso8859-2',
|
||||||
"cp864",
|
'iso885921987': 'iso8859-2',
|
||||||
"csibm864",
|
'iso88593': 'iso8859-3',
|
||||||
"ibm865",
|
'iso885931988': 'iso8859-3',
|
||||||
"cp865",
|
'iso88594': 'iso8859-4',
|
||||||
"865",
|
'iso885941988': 'iso8859-4',
|
||||||
"csibm865",
|
'iso88595': 'iso8859-5',
|
||||||
"ibm866",
|
'iso885951988': 'iso8859-5',
|
||||||
"cp866",
|
'iso88596': 'iso8859-6',
|
||||||
"866",
|
'iso885961987': 'iso8859-6',
|
||||||
"csibm866",
|
'iso88597': 'iso8859-7',
|
||||||
"ibm869",
|
'iso885971987': 'iso8859-7',
|
||||||
"cp869",
|
'iso88598': 'iso8859-8',
|
||||||
"869",
|
'iso885981988': 'iso8859-8',
|
||||||
"cp-gr",
|
'iso88599': 'windows-1254',
|
||||||
"csibm869",
|
'iso885991989': 'windows-1254',
|
||||||
"ibm1026",
|
'isoceltic': 'iso8859-14',
|
||||||
"cp1026",
|
'isoir100': 'windows-1252',
|
||||||
"csibm1026",
|
'isoir101': 'iso8859-2',
|
||||||
"koi8-r",
|
'isoir109': 'iso8859-3',
|
||||||
"cskoi8r",
|
'isoir110': 'iso8859-4',
|
||||||
"koi8-u",
|
'isoir126': 'iso8859-7',
|
||||||
"big5-hkscs",
|
'isoir127': 'iso8859-6',
|
||||||
"ptcp154",
|
'isoir138': 'iso8859-8',
|
||||||
"csptcp154",
|
'isoir144': 'iso8859-5',
|
||||||
"pt154",
|
'isoir148': 'windows-1254',
|
||||||
"cp154",
|
'isoir149': 'cp949',
|
||||||
"utf-7",
|
'isoir157': 'iso8859-10',
|
||||||
"utf-16be",
|
'isoir199': 'iso8859-14',
|
||||||
"utf-16le",
|
'isoir226': 'iso8859-16',
|
||||||
"utf-16",
|
'isoir58': 'gbk',
|
||||||
"utf-8",
|
'isoir6': 'ascii',
|
||||||
"iso-8859-13",
|
'koi8r': 'koi8-r',
|
||||||
"iso-8859-14",
|
'koi8u': 'koi8-u',
|
||||||
"iso-ir-199",
|
'korean': 'cp949',
|
||||||
"iso_8859-14:1998",
|
'ksc5601': 'cp949',
|
||||||
"iso_8859-14",
|
'ksc56011987': 'cp949',
|
||||||
"latin8",
|
'ksc56011989': 'cp949',
|
||||||
"iso-celtic",
|
'l1': 'windows-1252',
|
||||||
"l8",
|
'l10': 'iso8859-16',
|
||||||
"iso-8859-15",
|
'l2': 'iso8859-2',
|
||||||
"iso_8859-15",
|
'l3': 'iso8859-3',
|
||||||
"iso-8859-16",
|
'l4': 'iso8859-4',
|
||||||
"iso-ir-226",
|
'l5': 'windows-1254',
|
||||||
"iso_8859-16:2001",
|
'l6': 'iso8859-10',
|
||||||
"iso_8859-16",
|
'l8': 'iso8859-14',
|
||||||
"latin10",
|
'latin1': 'windows-1252',
|
||||||
"l10",
|
'latin10': 'iso8859-16',
|
||||||
"gbk",
|
'latin2': 'iso8859-2',
|
||||||
"cp936",
|
'latin3': 'iso8859-3',
|
||||||
"ms936",
|
'latin4': 'iso8859-4',
|
||||||
"gb18030",
|
'latin5': 'windows-1254',
|
||||||
"shift_jis",
|
'latin6': 'iso8859-10',
|
||||||
"ms_kanji",
|
'latin8': 'iso8859-14',
|
||||||
"csshiftjis",
|
'latin9': 'iso8859-15',
|
||||||
"euc-jp",
|
'ms936': 'gbk',
|
||||||
"gb2312",
|
'mskanji': 'shift_jis',
|
||||||
"big5",
|
'pt154': 'ptcp154',
|
||||||
"csbig5",
|
'ptcp154': 'ptcp154',
|
||||||
"windows-1250",
|
'r8': 'hp-roman8',
|
||||||
"windows-1251",
|
'roman8': 'hp-roman8',
|
||||||
"windows-1252",
|
'shiftjis': 'shift_jis',
|
||||||
"windows-1253",
|
'tis620': 'cp874',
|
||||||
"windows-1254",
|
'unicode11utf7': 'utf-7',
|
||||||
"windows-1255",
|
'us': 'ascii',
|
||||||
"windows-1256",
|
'usascii': 'ascii',
|
||||||
"windows-1257",
|
'utf16': 'utf-16',
|
||||||
"windows-1258",
|
'utf16be': 'utf-16-be',
|
||||||
"tis-620",
|
'utf16le': 'utf-16-le',
|
||||||
"hz-gb-2312",
|
'utf8': 'utf-8',
|
||||||
))
|
'windows1250': 'cp1250',
|
||||||
|
'windows1251': 'cp1251',
|
||||||
|
'windows1252': 'cp1252',
|
||||||
|
'windows1253': 'cp1253',
|
||||||
|
'windows1254': 'cp1254',
|
||||||
|
'windows1255': 'cp1255',
|
||||||
|
'windows1256': 'cp1256',
|
||||||
|
'windows1257': 'cp1257',
|
||||||
|
'windows1258': 'cp1258',
|
||||||
|
'windows936': 'gbk',
|
||||||
|
'x-x-big5': 'big5'}
|
||||||
|
|
||||||
|
tokenTypes = {
|
||||||
|
"Doctype":0,
|
||||||
|
"Characters":1,
|
||||||
|
"SpaceCharacters":2,
|
||||||
|
"StartTag":3,
|
||||||
|
"EndTag":4,
|
||||||
|
"EmptyTag":5,
|
||||||
|
"Comment":6,
|
||||||
|
"ParseError":7
|
||||||
|
}
|
||||||
|
|
||||||
|
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||||
|
tokenTypes["EmptyTag"]))
|
||||||
|
|
||||||
|
|
||||||
|
prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
|
||||||
|
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
|
||||||
|
|
||||||
|
class DataLossWarning(UserWarning):
    """Warning category; presumably for lossy conversions (confirm at use sites)."""
|
||||||
|
|
||||||
|
class ReparseException(Exception):
    """Exception type; presumably signals that input must be parsed again."""
|
||||||
|
127
planet/vendor/html5lib/filters/formfiller.py
vendored
Normal file
127
planet/vendor/html5lib/filters/formfiller.py
vendored
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
#
|
||||||
|
# The goal is to finally have a form filler where you pass data for
|
||||||
|
# each form, using the algorithm for "Seeding a form with initial values"
|
||||||
|
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
|
||||||
|
#
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
from html5lib.constants import spaceCharacters
|
||||||
|
spaceCharacters = u"".join(spaceCharacters)
|
||||||
|
|
||||||
|
class SimpleFilter(_base.Filter):
    """Token filter that seeds form controls with values from a field store.

    ``input``, ``textarea`` and ``select``/``option`` tokens are rewritten so
    that they carry the values returned by ``fieldStorage.getlist(name)``;
    every other token is passed through unchanged.
    """

    def __init__(self, source, fieldStorage):
        """Wrap the token stream *source*.

        fieldStorage -- object providing ``getlist(field_name)``, which must
        return the list of values for that field (possibly empty).
        """
        _base.Filter.__init__(self, source)
        self.fieldStorage = fieldStorage

    def __iter__(self):
        """Yield the tokens of the wrapped stream with form values filled in."""
        # How many values have already been consumed for each field name, so
        # that repeated controls with the same name get successive values.
        field_indices = {}
        # State of the control currently being processed.  All of it must be
        # initialised up front: the stream normally starts with tokens that
        # are not form controls, and these names are read for every token.
        field_type = None
        field_name = None
        is_select_multiple = False
        is_selected_option_found = False
        for token in _base.Filter.__iter__(self):
            token_type = token["type"]
            if token_type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    input_value_index = -1
                    input_checked_index = -1
                    for i, (n, v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i

                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""

                    if field_type in (u"checkbox", u"radio"):
                        # Only controls with an explicit value= attribute can
                        # be matched; indexing with -1 would silently compare
                        # against whatever attribute happens to come last.
                        if value_list and input_value_index >= 0:
                            if token["data"][input_value_index][1] == value:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                del token["data"][input_checked_index]

                    elif field_type not in (u"button", u"submit", u"reset"):
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1

                    # An input has no content, so nothing stays "open".
                    field_type = None
                    field_name = None

                elif name == "textarea":
                    field_name = dict(token["data"][::-1]).get("name")
                    # A textarea without a name cannot be seeded; leave the
                    # state clear so its content is passed through untouched.
                    if field_name:
                        field_type = "textarea"
                    else:
                        field_type = None

                elif name == "select":
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False

                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i, (n, v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]

            elif field_type is not None and field_name and token_type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            # The seeded value replaces the original content,
                            # which was dropped token by token below.
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1

                    # The control is closed: clear the whole state.  Leaving
                    # field_type set to "textarea" would make the branch
                    # below discard every later token of the document.
                    field_name = None
                    field_type = None

                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing

            elif field_type == "textarea":
                continue # ignore token

            yield token
|
53
planet/vendor/html5lib/filters/optionaltags.py
vendored
53
planet/vendor/html5lib/filters/optionaltags.py
vendored
@ -14,7 +14,8 @@ class Filter(_base.Filter):
|
|||||||
for previous, token, next in self.slider():
|
for previous, token, next in self.slider():
|
||||||
type = token["type"]
|
type = token["type"]
|
||||||
if type == "StartTag":
|
if type == "StartTag":
|
||||||
if token["data"] or not self.is_optional_start(token["name"], previous, next):
|
if (token["data"] or
|
||||||
|
not self.is_optional_start(token["name"], previous, next)):
|
||||||
yield token
|
yield token
|
||||||
elif type == "EndTag":
|
elif type == "EndTag":
|
||||||
if not self.is_optional_end(token["name"], next):
|
if not self.is_optional_end(token["name"], next):
|
||||||
@ -31,7 +32,11 @@ class Filter(_base.Filter):
|
|||||||
elif tagname == 'head':
|
elif tagname == 'head':
|
||||||
# A head element's start tag may be omitted if the first thing
|
# A head element's start tag may be omitted if the first thing
|
||||||
# inside the head element is an element.
|
# inside the head element is an element.
|
||||||
return type == "StartTag"
|
# XXX: we also omit the start tag if the head element is empty
|
||||||
|
if type in ("StartTag", "EmptyTag"):
|
||||||
|
return True
|
||||||
|
elif type == "EndTag":
|
||||||
|
return next["name"] == "head"
|
||||||
elif tagname == 'body':
|
elif tagname == 'body':
|
||||||
# A body element's start tag may be omitted if the first thing
|
# A body element's start tag may be omitted if the first thing
|
||||||
# inside the body element is not a space character or a comment,
|
# inside the body element is not a space character or a comment,
|
||||||
@ -52,7 +57,7 @@ class Filter(_base.Filter):
|
|||||||
# inside the colgroup element is a col element, and if the element
|
# inside the colgroup element is a col element, and if the element
|
||||||
# is not immediately preceeded by another colgroup element whose
|
# is not immediately preceeded by another colgroup element whose
|
||||||
# end tag has been omitted.
|
# end tag has been omitted.
|
||||||
if type == "StartTag":
|
if type in ("StartTag", "EmptyTag"):
|
||||||
# XXX: we do not look at the preceding event, so instead we never
|
# XXX: we do not look at the preceding event, so instead we never
|
||||||
# omit the colgroup element's end tag when it is immediately
|
# omit the colgroup element's end tag when it is immediately
|
||||||
# followed by another colgroup element. See is_optional_end.
|
# followed by another colgroup element. See is_optional_end.
|
||||||
@ -81,16 +86,13 @@ class Filter(_base.Filter):
|
|||||||
# An html element's end tag may be omitted if the html element
|
# An html element's end tag may be omitted if the html element
|
||||||
# is not immediately followed by a space character or a comment.
|
# is not immediately followed by a space character or a comment.
|
||||||
return type not in ("Comment", "SpaceCharacters")
|
return type not in ("Comment", "SpaceCharacters")
|
||||||
elif tagname in ('li', 'optgroup', 'option', 'tr'):
|
elif tagname in ('li', 'optgroup', 'tr'):
|
||||||
# A li element's end tag may be omitted if the li element is
|
# A li element's end tag may be omitted if the li element is
|
||||||
# immediately followed by another li element or if there is
|
# immediately followed by another li element or if there is
|
||||||
# no more content in the parent element.
|
# no more content in the parent element.
|
||||||
# An optgroup element's end tag may be omitted if the optgroup
|
# An optgroup element's end tag may be omitted if the optgroup
|
||||||
# element is immediately followed by another optgroup element,
|
# element is immediately followed by another optgroup element,
|
||||||
# or if there is no more content in the parent element.
|
# or if there is no more content in the parent element.
|
||||||
# An option element's end tag may be omitted if the option
|
|
||||||
# element is immediately followed by another option element,
|
|
||||||
# or if there is no more content in the parent element.
|
|
||||||
# A tr element's end tag may be omitted if the tr element is
|
# A tr element's end tag may be omitted if the tr element is
|
||||||
# immediately followed by another tr element, or if there is
|
# immediately followed by another tr element, or if there is
|
||||||
# no more content in the parent element.
|
# no more content in the parent element.
|
||||||
@ -112,14 +114,39 @@ class Filter(_base.Filter):
|
|||||||
return False
|
return False
|
||||||
elif tagname == 'p':
|
elif tagname == 'p':
|
||||||
# A p element's end tag may be omitted if the p element is
|
# A p element's end tag may be omitted if the p element is
|
||||||
# immediately followed by an address, blockquote, dl, fieldset,
|
# immediately followed by an address, article, aside,
|
||||||
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
|
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
|
||||||
# or ul element, or if there is no more content in the parent
|
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
|
||||||
|
# nav, ol, p, pre, section, table, or ul, element, or if
|
||||||
|
# there is no more content in the parent element.
|
||||||
|
if type in ("StartTag", "EmptyTag"):
|
||||||
|
return next["name"] in ('address', 'article', 'aside',
|
||||||
|
'blockquote', 'datagrid', 'dialog',
|
||||||
|
'dir', 'div', 'dl', 'fieldset', 'footer',
|
||||||
|
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||||
|
'header', 'hr', 'menu', 'nav', 'ol',
|
||||||
|
'p', 'pre', 'section', 'table', 'ul')
|
||||||
|
else:
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
elif tagname == 'option':
|
||||||
|
# An option element's end tag may be omitted if the option
|
||||||
|
# element is immediately followed by another option element,
|
||||||
|
# or if it is immediately followed by an <code>optgroup</code>
|
||||||
|
# element, or if there is no more content in the parent
|
||||||
# element.
|
# element.
|
||||||
if type == "StartTag":
|
if type == "StartTag":
|
||||||
return next["name"] in ('address', 'blockquote', \
|
return next["name"] in ('option', 'optgroup')
|
||||||
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
|
else:
|
||||||
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
|
return type == "EndTag" or type is None
|
||||||
|
elif tagname in ('rt', 'rp'):
|
||||||
|
# An rt element's end tag may be omitted if the rt element is
|
||||||
|
# immediately followed by an rt or rp element, or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
# An rp element's end tag may be omitted if the rp element is
|
||||||
|
# immediately followed by an rt or rp element, or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] in ('rt', 'rp')
|
||||||
else:
|
else:
|
||||||
return type == "EndTag" or type is None
|
return type == "EndTag" or type is None
|
||||||
elif tagname == 'colgroup':
|
elif tagname == 'colgroup':
|
||||||
|
8
planet/vendor/html5lib/filters/sanitizer.py
vendored
Normal file
8
planet/vendor/html5lib/filters/sanitizer.py
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import _base
|
||||||
|
from html5lib.sanitizer import HTMLSanitizerMixin
|
||||||
|
|
||||||
|
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Token filter that passes every token through ``sanitize_token``."""

    def __iter__(self):
        """Yield the sanitised tokens, skipping any that come back falsy."""
        for raw_token in _base.Filter.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
|
2331
planet/vendor/html5lib/html5parser.py
vendored
2331
planet/vendor/html5lib/html5parser.py
vendored
File diff suppressed because it is too large
Load Diff
170
planet/vendor/html5lib/ihatexml.py
vendored
Normal file
170
planet/vendor/html5lib/ihatexml.py
vendored
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
|
||||||
|
|
||||||
|
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
|
||||||
|
|
||||||
|
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
|
||||||
|
|
||||||
|
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
|
||||||
|
|
||||||
|
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
|
||||||
|
|
||||||
|
letter = " | ".join([baseChar, ideographic])
|
||||||
|
|
||||||
|
#Without the
|
||||||
|
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
|
||||||
|
extender])
|
||||||
|
nameFirst = " | ".join([letter, "_"])
|
||||||
|
|
||||||
|
# A single code point written as "#xHHHH".  Inside a character class "|" is
# a literal pipe, not alternation, so it must not appear among the digits.
reChar = re.compile(r"#x([\dA-F]{4,4})")
# An inclusive range written as "[#xHHHH-#xHHHH]".
reCharRange = re.compile(r"\[#x([\dA-F]{4,4})-#x([\dA-F]{4,4})\]")
|
||||||
|
|
||||||
|
def charStringToList(chars):
    """Parse a " | "-separated character specification into code point ranges.

    Each item is either "#xHHHH", "[#xHHHH-#xHHHH]" or one literal
    character.  Returns the ranges as ``[first, last]`` pairs after passing
    them through ``normaliseCharList``.
    """
    ranges = []
    for spec in chars.split(" | "):
        spec = spec.strip()
        # Try the single code point form first, then the range form.
        match = reChar.match(spec)
        if match is None:
            match = reCharRange.match(spec)
        if match is None:
            # Anything else has to be one literal character.
            assert len(spec) == 1
            ranges.append([ord(spec)] * 2)
            continue
        bounds = [hexToInt(group) for group in match.groups()]
        if len(bounds) == 1:
            # A single code point is the range from itself to itself.
            bounds = bounds * 2
        ranges.append(bounds)
    return normaliseCharList(ranges)
|
||||||
|
|
||||||
|
def normaliseCharList(charList):
    """Return *charList* sorted, with overlapping or adjacent ranges merged.

    charList -- iterable of ``[first, last]`` inclusive ranges.

    The result is a new list of new ``[first, last]`` lists; the input and
    its inner lists are left untouched.
    """
    charList = sorted(charList)
    for item in charList:
        assert item[1] >= item[0]
    rv = []
    for first, last in charList:
        if rv and first <= rv[-1][1] + 1:
            # Overlaps or touches the previous range.  Take the larger end:
            # a range contained in the previous one must not shrink it.
            rv[-1][1] = max(rv[-1][1], last)
        else:
            rv.append([first, last])
    return rv
|
||||||
|
|
||||||
|
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)

def missingRanges(charList):
    """Return the ranges of 0..max_unicode that *charList* does not cover.

    charList -- sorted, non-overlapping, non-adjacent ``[first, last]``
    ranges (the form produced by ``normaliseCharList``).

    Returns a list of ``[first, last]`` inclusive ranges.  An empty
    *charList* yields the single range ``[0, max_unicode]``.
    """
    if not charList:
        return [[0, max_unicode]]
    rv = []
    # Gap before the first range; compare its start, not the range itself.
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    # Gaps between consecutive ranges.
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1]+1, charList[i+1][0] - 1])
    # Gap after the last range.
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
|
||||||
|
|
||||||
|
def listToRegexpStr(charList):
    """Return a regexp character class matching the ranges in *charList*.

    charList -- iterable of ``[first, last]`` inclusive code point ranges.

    Each end point is rendered with ``intToUnicodeStr``.  The items are
    concatenated directly: inside ``[...]`` a "|" separator would not mean
    alternation, it would add a literal pipe to the class.
    """
    rv = []
    for item in charList:
        if item[0] == item[1]:
            rv.append(intToUnicodeStr(item[0]))
        else:
            rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
    return "[%s]" % "".join(rv)
|
||||||
|
|
||||||
|
def hexToInt(hex_str):
    """Return the integer value of the hexadecimal string *hex_str*."""
    value = int(hex_str, 16)
    return value
|
||||||
|
|
||||||
|
def intToUnicodeStr(intValue):
    """Return the character with code point *intValue*, escaped for a regexp."""
    # Build the character directly rather than eval()-ing a string literal.
    try:
        toChar = unichr
    except NameError:
        # No unichr builtin: chr already returns a unicode character.
        toChar = chr
    return escapeRegexp(toChar(intValue))

def escapeRegexp(string):
    """Return *string* with regexp metacharacters backslash-escaped.

    Every special character is prefixed with exactly one backslash.  The
    backslash itself is included so that it cannot combine with the
    character that follows it.
    """
    specialCharacters = ("\\", ".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    escaped = []
    for char in string:
        if char in specialCharacters:
            escaped.append("\\" + char)
        else:
            escaped.append(char)
    # Join on a slice of the input so the result keeps the input's string
    # type even when it is empty.
    return string[:0].join(escaped)
|
||||||
|
|
||||||
|
#output from the above
|
||||||
|
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
|
||||||
|
|
||||||
|
class InfosetFilter(object):
    """Coerce names, comments and character data into XML-safe forms.

    Characters matched by ``replaceCharsRegexp`` are replaced in names by
    "U" followed by five upper-case hex digits; ``fromXmlName`` reverses
    that replacement.
    """

    # Matches the escape sequences produced by escapeChar.
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")

    def __init__(self, replaceChars = None,
                 replaceRanges = None,
                 dropXmlnsLocalName = False,
                 dropXmlnsAttrNs = False,
                 preventDoubleDashComments = False,
                 preventDashAtCommentEnd = False,
                 replaceFormFeedCharacters = True):
        """Store the coercion options.

        replaceChars, replaceRanges -- must be None; custom replacement sets
            are not implemented and raise NotImplementedError.
        dropXmlnsLocalName -- drop attributes whose name starts with "xmlns:".
        dropXmlnsAttrNs -- drop attributes in the xmlns namespace.
        preventDoubleDashComments -- break up "--" inside comments.
        preventDashAtCommentEnd -- pad comments that end with "-".
        replaceFormFeedCharacters -- replace form feeds in character data
            with spaces.
        """
        if replaceRanges is not None or replaceChars is not None:
            raise NotImplementedError
        else:
            self.replaceCharsRegexp = nonXmlBMPRegexp

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs

        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd

        self.replaceFormFeedCharacters = replaceFormFeedCharacters

        # Maps a replaced character to its escape sequence.
        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        """Return the XML-safe attribute name, or None to drop the attribute."""
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            #Need a datalosswarning here
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name, namespace=None):
        """Return the XML-safe form of the element name."""
        return self.toXmlName(name)

    def coerceComment(self, data):
        """Return comment text adjusted according to the comment options."""
        if self.preventDoubleDashComments:
            # One pass can leave a new "--" behind ("---" -> "- --"), so
            # repeat until none remain.
            while "--" in data:
                data = data.replace("--", "- -")
        if self.preventDashAtCommentEnd and data.endswith("-"):
            # A trailing dash would merge with the closing "-->".
            data += " "
        return data

    def coerceCharacters(self, data):
        """Return character data with form feeds replaced, if enabled."""
        if self.replaceFormFeedCharacters:
            data = data.replace("\x0C", " ")
        #Other non-xml characters
        return data

    def toXmlName(self, name):
        """Return *name* with every disallowed character escaped."""
        replaceChars = set(self.replaceCharsRegexp.findall(name))
        for char in replaceChars:
            if char in self.replaceCache:
                replacement = self.replaceCache[char]
            else:
                replacement = self.escapeChar(char)
            name = name.replace(char, replacement)
        return name

    def fromXmlName(self, name):
        """Return *name* with every escape sequence turned back into a character."""
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        """Return (and cache) the "UXXXXX" escape sequence for *char*."""
        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        """Return the character encoded by the escape sequence *charcode*."""
        try:
            toChar = unichr
        except NameError:
            # No unichr builtin: chr already returns a unicode character.
            toChar = chr
        return toChar(int(charcode[1:], 16))
|
638
planet/vendor/html5lib/inputstream.py
vendored
638
planet/vendor/html5lib/inputstream.py
vendored
@ -1,15 +1,109 @@
|
|||||||
import codecs
|
import codecs
|
||||||
import re
|
import re
|
||||||
import types
|
import types
|
||||||
|
import sys
|
||||||
from gettext import gettext
|
|
||||||
_ = gettext
|
|
||||||
|
|
||||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||||
from constants import encodings
|
from constants import encodings, ReparseException
|
||||||
from utils import MethodDispatcher
|
|
||||||
|
|
||||||
class HTMLInputStream(object):
|
#Non-unicode versions of constants for use in the pre-parser
|
||||||
|
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
|
||||||
|
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
|
||||||
|
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
|
||||||
|
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
|
||||||
|
|
||||||
|
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
|
||||||
|
|
||||||
|
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||||
|
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
|
||||||
|
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
|
||||||
|
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
|
||||||
|
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||||
|
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
|
||||||
|
0x10FFFE, 0x10FFFF])
|
||||||
|
|
||||||
|
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
|
||||||
|
|
||||||
|
# Cache for charsUntil()
|
||||||
|
charsUntilRegEx = {}
|
||||||
|
|
||||||
|
class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        """Wrap *stream*, an object with a ``read(size)`` method."""
        self.stream = stream
        self.buffer = []
        self.position = [-1,0] #chunk number, offset

    def tell(self):
        """Return the current position, counted from the start of the buffer."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Move to *pos*, which must lie inside the data buffered so far."""
        assert pos < self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            # Step over one whole chunk: subtract that chunk's length, not
            # the target position.
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to *bytes* items, from the buffer first, then the stream."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total length of all buffered chunks."""
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read from the underlying stream and append the data as a new chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Read from the buffered chunks, falling back to the stream if short."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request ends inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and carry on with the next.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Every chunk after the first is read from its beginning.
            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLInputStream:
|
||||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
This class takes care of character encoding and removing or replacing
|
This class takes care of character encoding and removing or replacing
|
||||||
@ -17,11 +111,13 @@ class HTMLInputStream(object):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
_defaultChunkSize = 10240
|
||||||
|
|
||||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||||
"""Initialises the HTMLInputStream.
|
"""Initialises the HTMLInputStream.
|
||||||
|
|
||||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
for use by the HTML5Lib.
|
for use by html5lib.
|
||||||
|
|
||||||
source can be either a file-object, local filename or a string.
|
source can be either a file-object, local filename or a string.
|
||||||
|
|
||||||
@ -33,10 +129,17 @@ class HTMLInputStream(object):
|
|||||||
parseMeta - Look for a <meta> element containing encoding information
|
parseMeta - Look for a <meta> element containing encoding information
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
#Craziness
|
||||||
|
if len(u"\U0010FFFF") == 1:
|
||||||
|
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||||
|
else:
|
||||||
|
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||||
|
|
||||||
# List of where new lines occur
|
# List of where new lines occur
|
||||||
self.newLines = [0]
|
self.newLines = [0]
|
||||||
|
|
||||||
self.charEncoding = encoding
|
self.charEncoding = (codecName(encoding), "certain")
|
||||||
|
|
||||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||||
# self.charEncoding as appropriate
|
# self.charEncoding as appropriate
|
||||||
@ -52,17 +155,25 @@ class HTMLInputStream(object):
|
|||||||
self.defaultEncoding = "windows-1252"
|
self.defaultEncoding = "windows-1252"
|
||||||
|
|
||||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||||
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
|
if (self.charEncoding[0] is None):
|
||||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||||
|
|
||||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
|
|
||||||
'replace')
|
|
||||||
|
|
||||||
self.queue = []
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
||||||
|
'replace')
|
||||||
|
|
||||||
|
self.chunk = u""
|
||||||
|
self.chunkSize = 0
|
||||||
|
self.chunkOffset = 0
|
||||||
self.errors = []
|
self.errors = []
|
||||||
|
|
||||||
self.line = self.col = 0
|
# number of (complete) lines in previous chunks
|
||||||
self.lineLengths = []
|
self.prevNumLines = 0
|
||||||
|
# number of columns in the last line of the previous chunk
|
||||||
|
self.prevNumCols = 0
|
||||||
|
|
||||||
#Flag to indicate we may have a CR LF broken across a data chunk
|
#Flag to indicate we may have a CR LF broken across a data chunk
|
||||||
self._lastChunkEndsWithCR = False
|
self._lastChunkEndsWithCR = False
|
||||||
@ -80,22 +191,29 @@ class HTMLInputStream(object):
|
|||||||
# Otherwise treat source as a string and convert to a file object
|
# Otherwise treat source as a string and convert to a file object
|
||||||
if isinstance(source, unicode):
|
if isinstance(source, unicode):
|
||||||
source = source.encode('utf-8')
|
source = source.encode('utf-8')
|
||||||
self.charEncoding = "utf-8"
|
self.charEncoding = ("utf-8", "certain")
|
||||||
import cStringIO
|
import cStringIO
|
||||||
stream = cStringIO.StringIO(str(source))
|
stream = cStringIO.StringIO(str(source))
|
||||||
|
|
||||||
|
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
|
||||||
|
stream is sys.stdin):
|
||||||
|
stream = BufferedStream(stream)
|
||||||
|
|
||||||
return stream
|
return stream
|
||||||
|
|
||||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||||
|
|
||||||
#First look for a BOM
|
#First look for a BOM
|
||||||
#This will also read past the BOM if present
|
#This will also read past the BOM if present
|
||||||
encoding = self.detectBOM()
|
encoding = self.detectBOM()
|
||||||
|
confidence = "certain"
|
||||||
#If there is no BOM need to look for meta elements with encoding
|
#If there is no BOM need to look for meta elements with encoding
|
||||||
#information
|
#information
|
||||||
if encoding is None and parseMeta:
|
if encoding is None and parseMeta:
|
||||||
encoding = self.detectEncodingMeta()
|
encoding = self.detectEncodingMeta()
|
||||||
|
confidence = "tentative"
|
||||||
#Guess with chardet, if avaliable
|
#Guess with chardet, if avaliable
|
||||||
if encoding is None and chardet:
|
if encoding is None and chardet:
|
||||||
|
confidence = "tentative"
|
||||||
try:
|
try:
|
||||||
from chardet.universaldetector import UniversalDetector
|
from chardet.universaldetector import UniversalDetector
|
||||||
buffers = []
|
buffers = []
|
||||||
@ -108,11 +226,12 @@ class HTMLInputStream(object):
|
|||||||
detector.feed(buffer)
|
detector.feed(buffer)
|
||||||
detector.close()
|
detector.close()
|
||||||
encoding = detector.result['encoding']
|
encoding = detector.result['encoding']
|
||||||
self.seek("".join(buffers), 0)
|
self.rawStream.seek(0)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
# If all else fails use the default encoding
|
# If all else fails use the default encoding
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
|
confidence="tentative"
|
||||||
encoding = self.defaultEncoding
|
encoding = self.defaultEncoding
|
||||||
|
|
||||||
#Substitute for equivalent encodings:
|
#Substitute for equivalent encodings:
|
||||||
@ -121,8 +240,22 @@ class HTMLInputStream(object):
|
|||||||
if encoding.lower() in encodingSub:
|
if encoding.lower() in encodingSub:
|
||||||
encoding = encodingSub[encoding.lower()]
|
encoding = encodingSub[encoding.lower()]
|
||||||
|
|
||||||
return encoding
|
return encoding, confidence
|
||||||
|
|
||||||
|
def changeEncoding(self, newEncoding):
    """Switch the stream to ``newEncoding`` if it differs from the current one.

    ``newEncoding`` is normalised with ``codecName``; the utf-16 variants
    are replaced by utf-8.  Nothing happens when the name is not
    recognised.  When it equals the current encoding, the confidence is
    simply upgraded to "certain".  Otherwise the raw stream is rewound,
    the decoder is rebuilt for the new encoding and ``ReparseException``
    is raised so the caller can start over.
    """
    newEncoding = codecName(newEncoding)
    if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
        newEncoding = "utf-8"
    if newEncoding is None:
        return
    elif newEncoding == self.charEncoding[0]:
        self.charEncoding = (self.charEncoding[0], "certain")
    else:
        # Remember the old name before it is overwritten so the message
        # reports the real transition.
        oldEncoding = self.charEncoding[0]
        self.rawStream.seek(0)
        # reset() builds the decoding reader from self.charEncoding, so the
        # new encoding has to be stored before it is called.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s" % (oldEncoding, newEncoding))
|
||||||
|
|
||||||
def detectBOM(self):
|
def detectBOM(self):
|
||||||
"""Attempts to detect at BOM at the start of the stream. If
|
"""Attempts to detect at BOM at the start of the stream. If
|
||||||
an encoding can be determined from the BOM return the name of the
|
an encoding can be determined from the BOM return the name of the
|
||||||
@ -149,198 +282,219 @@ class HTMLInputStream(object):
|
|||||||
|
|
||||||
# Set the read position past the BOM if one was found, otherwise
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
# set it to the start of the stream
|
# set it to the start of the stream
|
||||||
self.seek(string, encoding and seek or 0)
|
self.rawStream.seek(encoding and seek or 0)
|
||||||
|
|
||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
def seek(self, buffer, n):
|
|
||||||
"""Unget buffer[n:]"""
|
|
||||||
if hasattr(self.rawStream, 'unget'):
|
|
||||||
self.rawStream.unget(buffer[n:])
|
|
||||||
return
|
|
||||||
|
|
||||||
if hasattr(self.rawStream, 'seek'):
|
|
||||||
try:
|
|
||||||
self.rawStream.seek(n)
|
|
||||||
return
|
|
||||||
except IOError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
class BufferedStream:
|
|
||||||
def __init__(self, data, stream):
|
|
||||||
self.data = data
|
|
||||||
self.stream = stream
|
|
||||||
def read(self, chars=-1):
|
|
||||||
if chars == -1 or chars > len(self.data):
|
|
||||||
result = self.data
|
|
||||||
self.data = ''
|
|
||||||
if chars == -1:
|
|
||||||
return result + self.stream.read()
|
|
||||||
else:
|
|
||||||
return result + self.stream.read(chars-len(result))
|
|
||||||
elif not self.data:
|
|
||||||
return self.stream.read(chars)
|
|
||||||
else:
|
|
||||||
result = self.data[:chars]
|
|
||||||
self.data = self.data[chars:]
|
|
||||||
return result
|
|
||||||
def unget(self, data):
|
|
||||||
if self.data:
|
|
||||||
self.data += data
|
|
||||||
else:
|
|
||||||
self.data = data
|
|
||||||
|
|
||||||
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
|
|
||||||
|
|
||||||
def detectEncodingMeta(self):
|
def detectEncodingMeta(self):
|
||||||
"""Report the encoding declared by the meta element
|
"""Report the encoding declared by the meta element
|
||||||
"""
|
"""
|
||||||
buffer = self.rawStream.read(self.numBytesMeta)
|
buffer = self.rawStream.read(self.numBytesMeta)
|
||||||
parser = EncodingParser(buffer)
|
parser = EncodingParser(buffer)
|
||||||
self.seek(buffer, 0)
|
self.rawStream.seek(0)
|
||||||
return parser.getEncoding()
|
encoding = parser.getEncoding()
|
||||||
|
|
||||||
|
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||||
|
encoding = "utf-8"
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def _position(self, offset):
    """Return the zero-based ``(line, column)`` for ``offset`` in the current chunk.

    Lines and columns carried over from earlier chunks
    (``prevNumLines`` / ``prevNumCols``) are added in; the carried-over
    column only applies while no newline precedes ``offset`` in this chunk.
    """
    text = self.chunk
    newlineAt = text.rfind(u'\n', 0, offset)
    line = self.prevNumLines + text.count(u'\n', 0, offset)
    if newlineAt == -1:
        column = self.prevNumCols + offset
    else:
        column = offset - (newlineAt + 1)
    return (line, column)
|
||||||
|
|
||||||
def position(self):
|
def position(self):
|
||||||
"""Returns (line, col) of the current position in the stream."""
|
"""Returns (line, col) of the current position in the stream."""
|
||||||
line, col = self.line, self.col
|
line, col = self._position(self.chunkOffset)
|
||||||
return (line + 1, col)
|
return (line+1, col)
|
||||||
|
|
||||||
def char(self):
|
def char(self):
|
||||||
""" Read one character from the stream or queue if available. Return
|
""" Read one character from the stream or queue if available. Return
|
||||||
EOF when EOF is reached.
|
EOF when EOF is reached.
|
||||||
"""
|
"""
|
||||||
if not self.queue:
|
# Read a new chunk from the input stream if necessary
|
||||||
self.readChunk()
|
if self.chunkOffset >= self.chunkSize:
|
||||||
#If we still don't have a character we have reached EOF
|
if not self.readChunk():
|
||||||
if not self.queue:
|
return EOF
|
||||||
return EOF
|
|
||||||
|
chunkOffset = self.chunkOffset
|
||||||
char = self.queue.pop(0)
|
char = self.chunk[chunkOffset]
|
||||||
|
self.chunkOffset = chunkOffset + 1
|
||||||
# update position in stream
|
|
||||||
if char == '\n':
|
|
||||||
self.lineLengths.append(self.col)
|
|
||||||
self.line += 1
|
|
||||||
self.col = 0
|
|
||||||
else:
|
|
||||||
self.col += 1
|
|
||||||
return char
|
return char
|
||||||
|
|
||||||
def readChunk(self, chunkSize=10240):
|
def readChunk(self, chunkSize=None):
|
||||||
|
if chunkSize is None:
|
||||||
|
chunkSize = self._defaultChunkSize
|
||||||
|
|
||||||
|
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
|
||||||
|
|
||||||
|
self.chunk = u""
|
||||||
|
self.chunkSize = 0
|
||||||
|
self.chunkOffset = 0
|
||||||
|
|
||||||
data = self.dataStream.read(chunkSize)
|
data = self.dataStream.read(chunkSize)
|
||||||
|
|
||||||
if not data:
|
if not data:
|
||||||
return
|
return False
|
||||||
#Replace null characters
|
|
||||||
for i in xrange(data.count(u"\u0000")):
|
self.reportCharacterErrors(data)
|
||||||
self.errors.append(_('null character found in input stream, '
|
|
||||||
'replaced with U+FFFD'))
|
|
||||||
data = data.replace(u"\u0000", u"\ufffd")
|
data = data.replace(u"\u0000", u"\ufffd")
|
||||||
#Check for CR LF broken across chunks
|
#Check for CR LF broken across chunks
|
||||||
if (self._lastChunkEndsWithCR and data[0] == "\n"):
|
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
|
||||||
data = data[1:]
|
data = data[1:]
|
||||||
self._lastChunkEndsWithCR = data[-1] == "\r"
|
# Stop if the chunk is now empty
|
||||||
data = data.replace("\r\n", "\n")
|
if not data:
|
||||||
data = data.replace("\r", "\n")
|
return False
|
||||||
|
self._lastChunkEndsWithCR = data[-1] == u"\r"
|
||||||
data = unicode(data)
|
data = data.replace(u"\r\n", u"\n")
|
||||||
self.queue.extend([char for char in data])
|
data = data.replace(u"\r", u"\n")
|
||||||
|
|
||||||
|
self.chunk = data
|
||||||
|
self.chunkSize = len(data)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def characterErrorsUCS4(self, data):
    """Record one error per NUL and per invalid code point found in ``data``.

    Appends "null-character" for each U+0000 and "invalid-codepoint" for
    each match of ``invalid_unicode_re`` to ``self.errors``.
    """
    nullCount = data.count(u"\u0000")
    self.errors.extend(["null-character"] * nullCount)
    invalidCount = len(invalid_unicode_re.findall(data))
    self.errors.extend(["invalid-codepoint"] * invalidCount)
|
||||||
|
|
||||||
|
def characterErrorsUCS2(self, data):
    """Record character errors for ``data`` when strings hold UTF-16 code units.

    Appends "null-character" to ``self.errors`` for each U+0000.  For each
    match of ``invalid_unicode_re``: a high surrogate followed by a low
    surrogate is combined into one code point and reported only if that
    code point is in ``non_bmp_invalid_codepoints``; any other match is
    reported as "invalid-codepoint".
    """
    #Someone picked the wrong compile option
    #You lose
    self.errors.extend(["null-character"] * data.count(u"\u0000"))
    # Index of a low surrogate that has already been consumed as the second
    # half of a pair.  Only a match at exactly that index is skipped, so
    # later invalid code points are still reported.
    pairedLowPos = -1
    for match in invalid_unicode_re.finditer(data):
        pos = match.start()
        if pos == pairedLowPos:
            continue
        codepoint = ord(match.group())
        #Pretty sure there should be endianness issues here
        if (0xD800 <= codepoint <= 0xDBFF and
            pos < len(data) - 1 and
            0xDC00 <= ord(data[pos + 1]) <= 0xDFFF):
            #We have a surrogate pair!
            #From a perl manpage
            char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
                        (ord(data[pos + 1]) - 0xDC00))
            if char_val in non_bmp_invalid_codepoints:
                self.errors.append("invalid-codepoint")
            pairedLowPos = pos + 1
        else:
            # Lone surrogates (including one at the very end of the data)
            # and every other matched character are errors.
            self.errors.append("invalid-codepoint")
    #This is still wrong if it is possible for a surrogate pair to break a
    #chunk boundary
|
||||||
|
|
||||||
def charsUntil(self, characters, opposite = False):
|
def charsUntil(self, characters, opposite = False):
|
||||||
""" Returns a string of characters from the stream up to but not
|
""" Returns a string of characters from the stream up to but not
|
||||||
including any character in characters or EOF. characters can be
|
including any character in 'characters' or EOF. 'characters' must be
|
||||||
any container that supports the in method being called on it.
|
a container that supports the 'in' method and iteration over its
|
||||||
|
characters.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
#This method is currently 40-50% of our total runtime and badly needs
|
# Use a cache of regexps to find the required characters
|
||||||
#optimizing
|
try:
|
||||||
#Possible improvements:
|
chars = charsUntilRegEx[(characters, opposite)]
|
||||||
# - use regexp to find characters that match the required character set
|
except KeyError:
|
||||||
# (with regexp cache since we do the same searches many many times)
|
if __debug__:
|
||||||
# - improve EOF handling for fewer if statements
|
for c in characters:
|
||||||
|
assert(ord(c) < 128)
|
||||||
|
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
|
||||||
|
if not opposite:
|
||||||
|
regex = u"^%s" % regex
|
||||||
|
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
|
||||||
|
|
||||||
if not self.queue:
|
rv = []
|
||||||
self.readChunk()
|
|
||||||
#Break if we have reached EOF
|
while True:
|
||||||
if not self.queue or self.queue[0] == None:
|
# Find the longest matching prefix
|
||||||
return u""
|
m = chars.match(self.chunk, self.chunkOffset)
|
||||||
|
if m is None:
|
||||||
i = 0
|
# If nothing matched, and it wasn't because we ran out of chunk,
|
||||||
while (self.queue[i] in characters) == opposite:
|
# then stop
|
||||||
i += 1
|
if self.chunkOffset != self.chunkSize:
|
||||||
if i == len(self.queue):
|
break
|
||||||
self.readChunk()
|
|
||||||
#If the queue doesn't grow we have reached EOF
|
|
||||||
if i == len(self.queue) or self.queue[i] is EOF:
|
|
||||||
break
|
|
||||||
#XXX- wallpaper over bug in calculation below
|
|
||||||
#Otherwise change the stream position
|
|
||||||
if self.queue[i] == '\n':
|
|
||||||
self.lineLengths.append(self.col)
|
|
||||||
self.line += 1
|
|
||||||
self.col = 0
|
|
||||||
else:
|
else:
|
||||||
self.col += 1
|
end = m.end()
|
||||||
|
# If not the whole chunk matched, return everything
|
||||||
|
# up to the part that didn't match
|
||||||
|
if end != self.chunkSize:
|
||||||
|
rv.append(self.chunk[self.chunkOffset:end])
|
||||||
|
self.chunkOffset = end
|
||||||
|
break
|
||||||
|
# If the whole remainder of the chunk matched,
|
||||||
|
# use it all and read the next chunk
|
||||||
|
rv.append(self.chunk[self.chunkOffset:])
|
||||||
|
if not self.readChunk():
|
||||||
|
# Reached EOF
|
||||||
|
break
|
||||||
|
|
||||||
rv = u"".join(self.queue[:i])
|
r = u"".join(rv)
|
||||||
self.queue = self.queue[i:]
|
return r
|
||||||
|
|
||||||
#Calculate where we now are in the stream
|
|
||||||
#One possible optimisation would be to store all read characters and
|
|
||||||
#Calculate this on an as-needed basis (perhaps flushing the read data
|
|
||||||
#every time we read a new chunk) rather than once per call here and
|
|
||||||
#in .char()
|
|
||||||
|
|
||||||
#XXX Temporarily disable this because there is a bug
|
|
||||||
|
|
||||||
#lines = rv.split("\n")
|
|
||||||
#
|
|
||||||
#if lines:
|
|
||||||
# #Add number of lines passed onto positon
|
|
||||||
# oldCol = self.col
|
|
||||||
# self.line += len(lines)-1
|
|
||||||
# if len(lines) > 1:
|
|
||||||
# self.col = len(lines[-1])
|
|
||||||
# else:
|
|
||||||
# self.col += len(lines[0])
|
|
||||||
#
|
|
||||||
# if self.lineLengths and oldCol > 0:
|
|
||||||
# self.lineLengths[-1] += len(lines[0])
|
|
||||||
# lines = lines[1:-1]
|
|
||||||
# else:
|
|
||||||
# lines = lines[:-1]
|
|
||||||
#
|
|
||||||
# for line in lines:
|
|
||||||
# self.lineLengths.append(len(line))
|
|
||||||
#
|
|
||||||
|
|
||||||
return rv
|
|
||||||
|
|
||||||
def unget(self, chars):
|
def unget(self, char):
|
||||||
if chars:
|
# Only one character is allowed to be ungotten at once - it must
|
||||||
self.queue = list(chars) + self.queue
|
# be consumed again before any further call to unget
|
||||||
#Alter the current line, col position
|
|
||||||
for c in chars[::-1]:
|
if char is not None:
|
||||||
if c == '\n':
|
if self.chunkOffset == 0:
|
||||||
self.line -= 1
|
# unget is called quite rarely, so it's a good idea to do
|
||||||
self.col = self.lineLengths[self.line]
|
# more work here if it saves a bit of work in the frequently
|
||||||
else:
|
# called char and charsUntil.
|
||||||
self.col -= 1
|
# So, just prepend the ungotten character onto the current
|
||||||
|
# chunk:
|
||||||
|
self.chunk = char + self.chunk
|
||||||
|
self.chunkSize += 1
|
||||||
|
else:
|
||||||
|
self.chunkOffset -= 1
|
||||||
|
assert self.chunk[self.chunkOffset] == char
|
||||||
|
|
||||||
class EncodingBytes(str):
|
class EncodingBytes(str):
|
||||||
"""String-like object with an assosiated position and various extra methods
|
"""String-like object with an associated position and various extra methods
|
||||||
If the position is ever greater than the string length then an exception is
|
If the position is ever greater than the string length then an exception is
|
||||||
raised"""
|
raised"""
|
||||||
|
def __new__(self, value):
    """Build the underlying ``str`` instance from ``value``."""
    instance = str.__new__(self, value)
    return instance
|
||||||
|
|
||||||
def __init__(self, value):
|
def __init__(self, value):
|
||||||
str.__init__(self, value)
|
|
||||||
self._position=-1
|
self._position=-1
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
self._position += 1
|
p = self._position = self._position + 1
|
||||||
rv = self[self.position]
|
if p >= len(self):
|
||||||
return rv
|
raise StopIteration
|
||||||
|
elif p < 0:
|
||||||
|
raise TypeError
|
||||||
|
return self[p]
|
||||||
|
|
||||||
|
def previous(self):
    """Step the position back by one and return the byte found there.

    Raises StopIteration when the current position is at or beyond the end
    of the data and TypeError when it is negative.
    """
    current = self._position
    if current >= len(self):
        raise StopIteration
    if current < 0:
        raise TypeError
    current -= 1
    self._position = current
    return self[current]
|
||||||
|
|
||||||
def setPosition(self, position):
|
def setPosition(self, position):
|
||||||
if self._position >= len(self):
|
if self._position >= len(self):
|
||||||
@ -362,20 +516,39 @@ class EncodingBytes(str):
|
|||||||
|
|
||||||
currentByte = property(getCurrentByte)
|
currentByte = property(getCurrentByte)
|
||||||
|
|
||||||
def skip(self, chars=spaceCharacters):
|
def skip(self, chars=spaceCharactersBytes):
|
||||||
"""Skip past a list of characters"""
|
"""Skip past a list of characters"""
|
||||||
while self.currentByte in chars:
|
p = self.position # use property for the error-checking
|
||||||
self.position += 1
|
while p < len(self):
|
||||||
|
c = self[p]
|
||||||
|
if c not in chars:
|
||||||
|
self._position = p
|
||||||
|
return c
|
||||||
|
p += 1
|
||||||
|
self._position = p
|
||||||
|
return None
|
||||||
|
|
||||||
|
def skipUntil(self, chars):
    """Advance to the first byte that is in ``chars`` and return it.

    Scanning starts at the current position.  If no such byte exists the
    position is left at the end of the data and None is returned.
    """
    index = self.position
    limit = len(self)
    found = None
    while index < limit and found is None:
        candidate = self[index]
        if candidate in chars:
            found = candidate
        else:
            index += 1
    self._position = index
    return found
|
||||||
|
|
||||||
def matchBytes(self, bytes, lower=False):
|
def matchBytes(self, bytes, lower=False):
|
||||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||||
are found return True and advance the position to the byte after the
|
are found return True and advance the position to the byte after the
|
||||||
match. Otherwise return False and leave the position alone"""
|
match. Otherwise return False and leave the position alone"""
|
||||||
data = self[self.position:self.position+len(bytes)]
|
p = self.position
|
||||||
|
data = self[p:p+len(bytes)]
|
||||||
if lower:
|
if lower:
|
||||||
data = data.lower()
|
data = data.lower()
|
||||||
rv = data.startswith(bytes)
|
rv = data.startswith(bytes)
|
||||||
if rv == True:
|
if rv:
|
||||||
self.position += len(bytes)
|
self.position += len(bytes)
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
@ -388,12 +561,6 @@ class EncodingBytes(str):
|
|||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
def findNext(self, byteList):
|
|
||||||
"""Move the pointer so it points to the next byte in a set of possible
|
|
||||||
bytes"""
|
|
||||||
while (self.currentByte not in byteList):
|
|
||||||
self.position += 1
|
|
||||||
|
|
||||||
class EncodingParser(object):
|
class EncodingParser(object):
|
||||||
"""Mini parser for detecting character encoding from meta elements"""
|
"""Mini parser for detecting character encoding from meta elements"""
|
||||||
@ -423,8 +590,7 @@ class EncodingParser(object):
|
|||||||
break
|
break
|
||||||
if not keepParsing:
|
if not keepParsing:
|
||||||
break
|
break
|
||||||
if self.encoding is not None:
|
|
||||||
self.encoding = self.encoding.strip()
|
|
||||||
return self.encoding
|
return self.encoding
|
||||||
|
|
||||||
def handleComment(self):
|
def handleComment(self):
|
||||||
@ -432,7 +598,7 @@ class EncodingParser(object):
|
|||||||
return self.data.jumpTo("-->")
|
return self.data.jumpTo("-->")
|
||||||
|
|
||||||
def handleMeta(self):
|
def handleMeta(self):
|
||||||
if self.data.currentByte not in spaceCharacters:
|
if self.data.currentByte not in spaceCharactersBytes:
|
||||||
#if we have <meta not followed by a space so just keep going
|
#if we have <meta not followed by a space so just keep going
|
||||||
return True
|
return True
|
||||||
#We have a valid meta element we want to search for attributes
|
#We have a valid meta element we want to search for attributes
|
||||||
@ -444,38 +610,41 @@ class EncodingParser(object):
|
|||||||
else:
|
else:
|
||||||
if attr[0] == "charset":
|
if attr[0] == "charset":
|
||||||
tentativeEncoding = attr[1]
|
tentativeEncoding = attr[1]
|
||||||
if isValidEncoding(tentativeEncoding):
|
codec = codecName(tentativeEncoding)
|
||||||
self.encoding = tentativeEncoding
|
if codec is not None:
|
||||||
|
self.encoding = codec
|
||||||
return False
|
return False
|
||||||
elif attr[0] == "content":
|
elif attr[0] == "content":
|
||||||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||||
tentativeEncoding = contentParser.parse()
|
tentativeEncoding = contentParser.parse()
|
||||||
if isValidEncoding(tentativeEncoding):
|
codec = codecName(tentativeEncoding)
|
||||||
self.encoding = tentativeEncoding
|
if codec is not None:
|
||||||
|
self.encoding = codec
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def handlePossibleStartTag(self):
|
def handlePossibleStartTag(self):
|
||||||
return self.handlePossibleTag(False)
|
return self.handlePossibleTag(False)
|
||||||
|
|
||||||
def handlePossibleEndTag(self):
|
def handlePossibleEndTag(self):
|
||||||
self.data.position+=1
|
self.data.next()
|
||||||
return self.handlePossibleTag(True)
|
return self.handlePossibleTag(True)
|
||||||
|
|
||||||
def handlePossibleTag(self, endTag):
|
def handlePossibleTag(self, endTag):
|
||||||
if self.data.currentByte not in asciiLetters:
|
data = self.data
|
||||||
|
if data.currentByte not in asciiLettersBytes:
|
||||||
#If the next byte is not an ascii letter either ignore this
|
#If the next byte is not an ascii letter either ignore this
|
||||||
#fragment (possible start tag case) or treat it according to
|
#fragment (possible start tag case) or treat it according to
|
||||||
#handleOther
|
#handleOther
|
||||||
if endTag:
|
if endTag:
|
||||||
self.data.position -= 1
|
data.previous()
|
||||||
self.handleOther()
|
self.handleOther()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
self.data.findNext(list(spaceCharacters) + ["<", ">"])
|
c = data.skipUntil(spacesAngleBrackets)
|
||||||
if self.data.currentByte == "<":
|
if c == "<":
|
||||||
#return to the first step in the overall "two step" algorithm
|
#return to the first step in the overall "two step" algorithm
|
||||||
#reprocessing the < byte
|
#reprocessing the < byte
|
||||||
self.data.position -= 1
|
data.previous()
|
||||||
else:
|
else:
|
||||||
#Read all attributes
|
#Read all attributes
|
||||||
attr = self.getAttribute()
|
attr = self.getAttribute()
|
||||||
@ -489,73 +658,75 @@ class EncodingParser(object):
|
|||||||
def getAttribute(self):
|
def getAttribute(self):
|
||||||
"""Return a name,value pair for the next attribute in the stream,
|
"""Return a name,value pair for the next attribute in the stream,
|
||||||
if one is found, or None"""
|
if one is found, or None"""
|
||||||
self.data.skip(list(spaceCharacters)+["/"])
|
data = self.data
|
||||||
if self.data.currentByte == "<":
|
c = data.skip(spaceCharactersBytes | frozenset("/"))
|
||||||
self.data.position -= 1
|
if c == "<":
|
||||||
|
data.previous()
|
||||||
return None
|
return None
|
||||||
elif self.data.currentByte == ">":
|
elif c == ">" or c is None:
|
||||||
return None
|
return None
|
||||||
attrName = []
|
attrName = []
|
||||||
attrValue = []
|
attrValue = []
|
||||||
spaceFound = False
|
spaceFound = False
|
||||||
#Step 5 attribute name
|
#Step 5 attribute name
|
||||||
while True:
|
while True:
|
||||||
if self.data.currentByte == "=" and attrName:
|
if c == "=" and attrName:
|
||||||
break
|
break
|
||||||
elif self.data.currentByte in spaceCharacters:
|
elif c in spaceCharactersBytes:
|
||||||
spaceFound=True
|
spaceFound=True
|
||||||
break
|
break
|
||||||
elif self.data.currentByte in ("/", "<", ">"):
|
elif c in ("/", "<", ">"):
|
||||||
return "".join(attrName), ""
|
return "".join(attrName), ""
|
||||||
elif self.data.currentByte in asciiUppercase:
|
elif c in asciiUppercaseBytes:
|
||||||
attrName.extend(self.data.currentByte.lower())
|
attrName.append(c.lower())
|
||||||
else:
|
else:
|
||||||
attrName.extend(self.data.currentByte)
|
attrName.append(c)
|
||||||
#Step 6
|
#Step 6
|
||||||
self.data.position += 1
|
c = data.next()
|
||||||
#Step 7
|
#Step 7
|
||||||
if spaceFound:
|
if spaceFound:
|
||||||
self.data.skip()
|
c = data.skip()
|
||||||
#Step 8
|
#Step 8
|
||||||
if self.data.currentByte != "=":
|
if c != "=":
|
||||||
self.data.position -= 1
|
data.previous()
|
||||||
return "".join(attrName), ""
|
return "".join(attrName), ""
|
||||||
#XXX need to advance position in both spaces and value case
|
#XXX need to advance position in both spaces and value case
|
||||||
#Step 9
|
#Step 9
|
||||||
self.data.position += 1
|
data.next()
|
||||||
#Step 10
|
#Step 10
|
||||||
self.data.skip()
|
c = data.skip()
|
||||||
#Step 11
|
#Step 11
|
||||||
if self.data.currentByte in ("'", '"'):
|
if c in ("'", '"'):
|
||||||
#11.1
|
#11.1
|
||||||
quoteChar = self.data.currentByte
|
quoteChar = c
|
||||||
while True:
|
while True:
|
||||||
self.data.position+=1
|
|
||||||
#11.3
|
#11.3
|
||||||
if self.data.currentByte == quoteChar:
|
c = data.next()
|
||||||
self.data.position += 1
|
if c == quoteChar:
|
||||||
|
data.next()
|
||||||
return "".join(attrName), "".join(attrValue)
|
return "".join(attrName), "".join(attrValue)
|
||||||
#11.4
|
#11.4
|
||||||
elif self.data.currentByte in asciiUppercase:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.extend(self.data.currentByte.lower())
|
attrValue.append(c.lower())
|
||||||
#11.5
|
#11.5
|
||||||
else:
|
else:
|
||||||
attrValue.extend(self.data.currentByte)
|
attrValue.append(c)
|
||||||
elif self.data.currentByte in (">", '<'):
|
elif c in (">", "<"):
|
||||||
return "".join(attrName), ""
|
return "".join(attrName), ""
|
||||||
elif self.data.currentByte in asciiUppercase:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.extend(self.data.currentByte.lower())
|
attrValue.append(c.lower())
|
||||||
|
elif c is None:
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
attrValue.extend(self.data.currentByte)
|
attrValue.append(c)
|
||||||
while True:
|
while True:
|
||||||
self.data.position +=1
|
c = data.next()
|
||||||
if self.data.currentByte in (
|
if c in spacesAngleBrackets:
|
||||||
list(spaceCharacters) + [">", '<']):
|
|
||||||
return "".join(attrName), "".join(attrValue)
|
return "".join(attrName), "".join(attrValue)
|
||||||
elif self.data.currentByte in asciiUppercase:
|
elif c in asciiUppercaseBytes:
|
||||||
attrValue.extend(self.data.currentByte.lower())
|
attrValue.append(c.lower())
|
||||||
else:
|
else:
|
||||||
attrValue.extend(self.data.currentByte)
|
attrValue.append(c)
|
||||||
|
|
||||||
|
|
||||||
class ContentAttrParser(object):
|
class ContentAttrParser(object):
|
||||||
@ -588,7 +759,7 @@ class ContentAttrParser(object):
|
|||||||
#Unquoted value
|
#Unquoted value
|
||||||
oldPosition = self.data.position
|
oldPosition = self.data.position
|
||||||
try:
|
try:
|
||||||
self.data.findNext(spaceCharacters)
|
self.data.skipUntil(spaceCharactersBytes)
|
||||||
return self.data[oldPosition:self.data.position]
|
return self.data[oldPosition:self.data.position]
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
#Return the whole remaining value
|
#Return the whole remaining value
|
||||||
@ -596,7 +767,12 @@ class ContentAttrParser(object):
|
|||||||
except StopIteration:
|
except StopIteration:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def isValidEncoding(encoding):
|
|
||||||
"""Determine if a string is a supported encoding"""
|
def codecName(encoding):
|
||||||
return (encoding is not None and type(encoding) == types.StringType and
|
"""Return the python codec name corresponding to an encoding or None if the
|
||||||
encoding.lower().strip() in encodings)
|
string doesn't correspond to a valid encoding."""
|
||||||
|
if (encoding is not None and type(encoding) in types.StringTypes):
|
||||||
|
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
||||||
|
return encodings.get(canonicalName, None)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
147
planet/vendor/html5lib/liberalxmlparser.py
vendored
147
planet/vendor/html5lib/liberalxmlparser.py
vendored
@ -1,147 +0,0 @@
|
|||||||
"""
|
|
||||||
Warning: this module is experimental and subject to change and even removal
|
|
||||||
at any time.
|
|
||||||
|
|
||||||
For background/rationale, see:
|
|
||||||
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
|
||||||
* http://tinyurl.com/ylfj8k (and follow-ups)
|
|
||||||
|
|
||||||
References:
|
|
||||||
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
|
||||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
|
||||||
|
|
||||||
@@TODO:
|
|
||||||
* Selectively lowercase only XHTML, but not foreign markup
|
|
||||||
"""
|
|
||||||
|
|
||||||
import html5parser
|
|
||||||
from constants import voidElements, contentModelFlags
|
|
||||||
|
|
||||||
from xml.dom import XHTML_NAMESPACE
|
|
||||||
from xml.sax.saxutils import unescape
|
|
||||||
|
|
||||||
class XMLParser(html5parser.HTMLParser):
|
|
||||||
""" liberal XML parser """
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
|
||||||
|
|
||||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
|
||||||
|
|
||||||
if token["type"] in ("StartTag", "EmptyTag"):
|
|
||||||
token["data"] = dict(token["data"][::-1])
|
|
||||||
|
|
||||||
# For EmptyTags, process both a Start and an End tag
|
|
||||||
if token["type"] == "EmptyTag":
|
|
||||||
save = self.tokenizer.contentModelFlag
|
|
||||||
self.phase.processStartTag(token["name"], token["data"])
|
|
||||||
self.tokenizer.contentModelFlag = save
|
|
||||||
token["data"] = {}
|
|
||||||
token["type"] = "EndTag"
|
|
||||||
|
|
||||||
elif token["type"] == "Characters":
|
|
||||||
# un-escape rcdataElements (e.g. style, script)
|
|
||||||
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
|
|
||||||
token["data"] = unescape(token["data"])
|
|
||||||
|
|
||||||
elif token["type"] == "Comment":
|
|
||||||
# Rescue CDATA from the comments
|
|
||||||
if (token["data"].startswith("[CDATA[") and
|
|
||||||
token["data"].endswith("]]")):
|
|
||||||
token["type"] = "Characters"
|
|
||||||
token["data"] = token["data"][7:-2]
|
|
||||||
|
|
||||||
return token
|
|
||||||
|
|
||||||
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
|
|
||||||
**kwargs):
|
|
||||||
|
|
||||||
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
|
|
||||||
encoding, lowercaseElementName=False,
|
|
||||||
lowercaseAttrName=False)
|
|
||||||
|
|
||||||
class XHTMLParser(XMLParser):
|
|
||||||
""" liberal XMTHML parser """
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
|
||||||
self.phases["initial"] = XmlInitialPhase(self, self.tree)
|
|
||||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
|
||||||
token = XMLParser.normalizeToken(self, token)
|
|
||||||
|
|
||||||
# ensure that non-void XHTML elements have content so that separate
|
|
||||||
# open and close tags are emitted
|
|
||||||
if token["type"] == "EndTag":
|
|
||||||
if token["name"] in voidElements:
|
|
||||||
if not self.tree.openElements or \
|
|
||||||
self.tree.openElements[-1].name != token["name"]:
|
|
||||||
token["type"] = "EmptyTag"
|
|
||||||
if not token.has_key("data"): token["data"] = {}
|
|
||||||
else:
|
|
||||||
if token["name"] == self.tree.openElements[-1].name and \
|
|
||||||
not self.tree.openElements[-1].hasContent():
|
|
||||||
for e in self.tree.openElements:
|
|
||||||
if 'xmlns' in e.attributes.keys():
|
|
||||||
if e.attributes['xmlns'] != XHTML_NAMESPACE:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
self.tree.insertText('')
|
|
||||||
|
|
||||||
return token
|
|
||||||
|
|
||||||
class XhmlRootPhase(html5parser.RootElementPhase):
|
|
||||||
def insertHtmlElement(self):
|
|
||||||
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
|
||||||
self.tree.openElements.append(element)
|
|
||||||
self.tree.document.appendChild(element)
|
|
||||||
self.parser.phase = self.parser.phases["beforeHead"]
|
|
||||||
|
|
||||||
class XmlInitialPhase(html5parser.InitialPhase):
|
|
||||||
""" Consume XML Prologs """
|
|
||||||
def processComment(self, data):
|
|
||||||
if not data.startswith('?xml') or not data.endswith('?'):
|
|
||||||
html5parser.InitialPhase.processComment(self, data)
|
|
||||||
|
|
||||||
class XmlRootPhase(html5parser.Phase):
|
|
||||||
""" Consume XML Prologs """
|
|
||||||
def processComment(self, data):
|
|
||||||
print repr(data)
|
|
||||||
if not data.startswith('?xml') or not data.endswith('?'):
|
|
||||||
html5parser.InitialPhase.processComment(self, data)
|
|
||||||
|
|
||||||
""" Prime the Xml parser """
|
|
||||||
def __getattr__(self, name):
|
|
||||||
self.tree.openElements.append(self.tree.document)
|
|
||||||
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
|
||||||
return getattr(self.parser.phase, name)
|
|
||||||
|
|
||||||
class XmlElementPhase(html5parser.Phase):
|
|
||||||
""" Generic handling for all XML elements """
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
html5parser.Phase.__init__(self, *args, **kwargs)
|
|
||||||
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
|
||||||
self.startTagHandler.default = self.startTagOther
|
|
||||||
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
|
||||||
self.endTagHandler.default = self.endTagOther
|
|
||||||
|
|
||||||
def startTagOther(self, name, attributes):
|
|
||||||
element = self.tree.createElement(name, attributes)
|
|
||||||
self.tree.openElements[-1].appendChild(element)
|
|
||||||
self.tree.openElements.append(element)
|
|
||||||
|
|
||||||
def endTagOther(self, name):
|
|
||||||
for node in self.tree.openElements[::-1]:
|
|
||||||
if node.name == name:
|
|
||||||
while self.tree.openElements.pop() != node:
|
|
||||||
pass
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
self.parser.parseError()
|
|
||||||
|
|
||||||
def processCharacters(self, data):
|
|
||||||
self.tree.insertText(data)
|
|
58
planet/vendor/html5lib/sanitizer.py
vendored
58
planet/vendor/html5lib/sanitizer.py
vendored
@ -1,6 +1,8 @@
|
|||||||
import re
|
import re
|
||||||
from xml.sax.saxutils import escape, unescape
|
from xml.sax.saxutils import escape, unescape
|
||||||
|
|
||||||
from tokenizer import HTMLTokenizer
|
from tokenizer import HTMLTokenizer
|
||||||
|
from constants import tokenTypes
|
||||||
|
|
||||||
class HTMLSanitizerMixin(object):
|
class HTMLSanitizerMixin(object):
|
||||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||||
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
|
|||||||
|
|
||||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||||
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
|
|||||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
|
||||||
'font-family', 'font-size', 'font-stretch', 'font-style',
|
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
|
||||||
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
||||||
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
||||||
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
||||||
@ -82,6 +84,13 @@ class HTMLSanitizerMixin(object):
|
|||||||
|
|
||||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||||
'xlink:href', 'xml:base']
|
'xlink:href', 'xml:base']
|
||||||
|
|
||||||
|
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||||
|
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
|
||||||
|
|
||||||
|
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
|
||||||
|
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
|
||||||
|
'radialGradient', 'textpath', 'tref', 'set', 'use']
|
||||||
|
|
||||||
acceptable_css_properties = ['azimuth', 'background-color',
|
acceptable_css_properties = ['azimuth', 'background-color',
|
||||||
'border-bottom-color', 'border-collapse', 'border-color',
|
'border-bottom-color', 'border-collapse', 'border-color',
|
||||||
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
|
|||||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||||
# => <a>Click here for $100</a>
|
# => <a>Click here for $100</a>
|
||||||
def sanitize_token(self, token):
|
def sanitize_token(self, token):
|
||||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||||
|
tokenTypes["EmptyTag"]):
|
||||||
if token["name"] in self.allowed_elements:
|
if token["name"] in self.allowed_elements:
|
||||||
if token.has_key("data"):
|
if token.has_key("data"):
|
||||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
attrs = dict([(name,val) for name,val in
|
||||||
|
token["data"][::-1]
|
||||||
|
if name in self.allowed_attributes])
|
||||||
for attr in self.attr_val_is_uri:
|
for attr in self.attr_val_is_uri:
|
||||||
if not attrs.has_key(attr): continue
|
if not attrs.has_key(attr):
|
||||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
continue
|
||||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||||
|
unescape(attrs[attr])).lower()
|
||||||
|
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
|
||||||
|
(val_unescaped.split(':')[0] not in
|
||||||
|
self.allowed_protocols)):
|
||||||
del attrs[attr]
|
del attrs[attr]
|
||||||
|
for attr in self.svg_attr_val_allows_ref:
|
||||||
|
if attr in attrs:
|
||||||
|
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||||
|
' ',
|
||||||
|
unescape(attrs[attr]))
|
||||||
|
if (token["name"] in self.svg_allow_local_href and
|
||||||
|
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
||||||
|
attrs['xlink:href'])):
|
||||||
|
del attrs['xlink:href']
|
||||||
if attrs.has_key('style'):
|
if attrs.has_key('style'):
|
||||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||||
return token
|
return token
|
||||||
else:
|
else:
|
||||||
if token["type"] == "EndTag":
|
if token["type"] == tokenTypes["EndTag"]:
|
||||||
token["data"] = "</%s>" % token["name"]
|
token["data"] = "</%s>" % token["name"]
|
||||||
elif token["data"]:
|
elif token["data"]:
|
||||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||||
else:
|
else:
|
||||||
token["data"] = "<%s>" % token["name"]
|
token["data"] = "<%s>" % token["name"]
|
||||||
if token["type"] == "EmptyTag":
|
if token["type"] == tokenTypes["EmptyTag"]:
|
||||||
token["data"]=token["data"][:-1] + "/>"
|
token["data"]=token["data"][:-1] + "/>"
|
||||||
token["type"] = "Characters"
|
token["type"] = tokenTypes["Characters"]
|
||||||
del token["name"]
|
del token["name"]
|
||||||
return token
|
return token
|
||||||
elif token["type"] == "Comment":
|
elif token["type"] == tokenTypes["Comment"]:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
return token
|
return token
|
||||||
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
|
|||||||
|
|
||||||
# gauntlet
|
# gauntlet
|
||||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
||||||
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
|
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
|
||||||
|
|
||||||
clean = []
|
clean = []
|
||||||
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
||||||
if not value: continue
|
if not value: continue
|
||||||
if prop.lower() in self.allowed_css_properties:
|
if prop.lower() in self.allowed_css_properties:
|
||||||
clean.append(prop + ': ' + value + ';')
|
clean.append(prop + ': ' + value + ';')
|
||||||
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
|
elif prop.split('-')[0].lower() in ['background','border','margin',
|
||||||
|
'padding']:
|
||||||
for keyword in value.split():
|
for keyword in value.split():
|
||||||
if not keyword in self.acceptable_css_keywords and \
|
if not keyword in self.acceptable_css_keywords and \
|
||||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
||||||
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
|
|||||||
return ' '.join(clean)
|
return ' '.join(clean)
|
||||||
|
|
||||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||||
def __init__(self, stream, encoding=None, parseMeta=True,
|
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||||
lowercaseElementName=False, lowercaseAttrName=False):
|
lowercaseElementName=False, lowercaseAttrName=False):
|
||||||
#Change case matching defaults as we only output lowercase html anyway
|
#Change case matching defaults as we only output lowercase html anyway
|
||||||
#This solution doesn't seem ideal...
|
#This solution doesn't seem ideal...
|
||||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
|
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||||
lowercaseElementName, lowercaseAttrName)
|
lowercaseElementName, lowercaseAttrName)
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
14
planet/vendor/html5lib/serializer/__init__.py
vendored
14
planet/vendor/html5lib/serializer/__init__.py
vendored
@ -1,3 +1,17 @@
|
|||||||
|
|
||||||
|
from html5lib import treewalkers
|
||||||
|
|
||||||
from htmlserializer import HTMLSerializer
|
from htmlserializer import HTMLSerializer
|
||||||
from xhtmlserializer import XHTMLSerializer
|
from xhtmlserializer import XHTMLSerializer
|
||||||
|
|
||||||
|
def serialize(input, tree="simpletree", format="html", encoding=None,
|
||||||
|
**serializer_opts):
|
||||||
|
# XXX: Should we cache this?
|
||||||
|
walker = treewalkers.getTreeWalker(tree)
|
||||||
|
if format == "html":
|
||||||
|
s = HTMLSerializer(**serializer_opts)
|
||||||
|
elif format == "xhtml":
|
||||||
|
s = XHTMLSerializer(**serializer_opts)
|
||||||
|
else:
|
||||||
|
raise ValueError, "type must be either html or xhtml"
|
||||||
|
return s.render(walker(input), encoding)
|
||||||
|
@ -147,7 +147,7 @@ class HTMLSerializer(object):
|
|||||||
quote_attr = True
|
quote_attr = True
|
||||||
else:
|
else:
|
||||||
quote_attr = reduce(lambda x,y: x or (y in v),
|
quote_attr = reduce(lambda x,y: x or (y in v),
|
||||||
spaceCharacters + "<>\"'", False)
|
spaceCharacters + ">\"'=", False)
|
||||||
v = v.replace("&", "&")
|
v = v.replace("&", "&")
|
||||||
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
||||||
if encoding:
|
if encoding:
|
||||||
|
1066
planet/vendor/html5lib/tokenizer.py
vendored
1066
planet/vendor/html5lib/tokenizer.py
vendored
File diff suppressed because it is too large
Load Diff
28
planet/vendor/html5lib/treebuilders/__init__.py
vendored
28
planet/vendor/html5lib/treebuilders/__init__.py
vendored
@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
|
|||||||
|
|
||||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||||
more pythonic idioms.
|
more pythonic idioms.
|
||||||
"dom" - The xml.dom.minidom DOM implementation
|
"dom" - A generic builder for DOM implementations, defaulting to
|
||||||
|
a xml.dom.minidom based implementation for the sake of
|
||||||
|
backwards compatibility (as releases up until 0.10 had a
|
||||||
|
builder called "dom" that was a minidom implemenation).
|
||||||
"etree" - A generic builder for tree implementations exposing an
|
"etree" - A generic builder for tree implementations exposing an
|
||||||
elementtree-like interface (known to work with
|
elementtree-like interface (known to work with
|
||||||
ElementTree, cElementTree and lxml.etree).
|
ElementTree, cElementTree and lxml.etree).
|
||||||
"beautifulsoup" - Beautiful soup (if installed)
|
"beautifulsoup" - Beautiful soup (if installed)
|
||||||
|
|
||||||
implementation - (Currently applies to the "etree" tree type only). A module
|
implementation - (Currently applies to the "etree" and "dom" tree types). A
|
||||||
implementing the tree type e.g. xml.etree.ElementTree or
|
module implementing the tree type e.g.
|
||||||
lxml.etree."""
|
xml.etree.ElementTree or lxml.etree."""
|
||||||
|
|
||||||
treeType = treeType.lower()
|
treeType = treeType.lower()
|
||||||
if treeType not in treeBuilderCache:
|
if treeType not in treeBuilderCache:
|
||||||
if treeType in ("dom", "simpletree"):
|
if treeType == "dom":
|
||||||
mod = __import__(treeType, globals())
|
import dom
|
||||||
treeBuilderCache[treeType] = mod.TreeBuilder
|
# XXX: Keep backwards compatibility by using minidom if no implementation is given
|
||||||
|
if implementation == None:
|
||||||
|
from xml.dom import minidom
|
||||||
|
implementation = minidom
|
||||||
|
# XXX: NEVER cache here, caching is done in the dom submodule
|
||||||
|
return dom.getDomModule(implementation, **kwargs).TreeBuilder
|
||||||
|
elif treeType == "simpletree":
|
||||||
|
import simpletree
|
||||||
|
treeBuilderCache[treeType] = simpletree.TreeBuilder
|
||||||
elif treeType == "beautifulsoup":
|
elif treeType == "beautifulsoup":
|
||||||
import soup
|
import soup
|
||||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||||
|
elif treeType == "lxml":
|
||||||
|
import etree_lxml
|
||||||
|
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||||
elif treeType == "etree":
|
elif treeType == "etree":
|
||||||
import etree
|
import etree
|
||||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||||
|
85
planet/vendor/html5lib/treebuilders/_base.py
vendored
85
planet/vendor/html5lib/treebuilders/_base.py
vendored
@ -1,3 +1,4 @@
|
|||||||
|
import warnings
|
||||||
from html5lib.constants import scopingElements, tableInsertModeElements
|
from html5lib.constants import scopingElements, tableInsertModeElements
|
||||||
try:
|
try:
|
||||||
frozenset
|
frozenset
|
||||||
@ -11,9 +12,6 @@ except NameError:
|
|||||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||||
Marker = None
|
Marker = None
|
||||||
|
|
||||||
#XXX - TODO; make the default interface more ElementTree-like
|
|
||||||
# rather than DOM-like
|
|
||||||
|
|
||||||
class Node(object):
|
class Node(object):
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
"""Node representing an item in the tree.
|
"""Node representing an item in the tree.
|
||||||
@ -43,7 +41,7 @@ class Node(object):
|
|||||||
return "<%s>"%(self.name)
|
return "<%s>"%(self.name)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "<%s %s>" % (self.__class__, self.name)
|
return "<%s>" % (self.name)
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
"""Insert node as a child of the current node
|
"""Insert node as a child of the current node
|
||||||
@ -112,7 +110,12 @@ class TreeBuilder(object):
|
|||||||
#Fragment class
|
#Fragment class
|
||||||
fragmentClass = None
|
fragmentClass = None
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, namespaceHTMLElements):
|
||||||
|
if namespaceHTMLElements:
|
||||||
|
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||||
|
else:
|
||||||
|
self.defaultNamespace = None
|
||||||
|
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@ -140,7 +143,8 @@ class TreeBuilder(object):
|
|||||||
return True
|
return True
|
||||||
elif node.name == "table":
|
elif node.name == "table":
|
||||||
return False
|
return False
|
||||||
elif not tableVariant and node.name in scopingElements:
|
elif (not tableVariant and (node.nameTuple in
|
||||||
|
scopingElements)):
|
||||||
return False
|
return False
|
||||||
elif node.name == "html":
|
elif node.name == "html":
|
||||||
return False
|
return False
|
||||||
@ -179,7 +183,10 @@ class TreeBuilder(object):
|
|||||||
clone = self.activeFormattingElements[i].cloneNode()
|
clone = self.activeFormattingElements[i].cloneNode()
|
||||||
|
|
||||||
# Step 9
|
# Step 9
|
||||||
element = self.insertElement(clone.name, clone.attributes)
|
element = self.insertElement({"type":"StartTag",
|
||||||
|
"name":clone.name,
|
||||||
|
"namespace":clone.namespace,
|
||||||
|
"data":clone.attributes})
|
||||||
|
|
||||||
# Step 10
|
# Step 10
|
||||||
self.activeFormattingElements[i] = element
|
self.activeFormattingElements[i] = element
|
||||||
@ -207,21 +214,30 @@ class TreeBuilder(object):
|
|||||||
return item
|
return item
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def insertDoctype(self, name, publicId, systemId):
|
def insertRoot(self, token):
|
||||||
doctype = self.doctypeClass(name)
|
element = self.createElement(token)
|
||||||
doctype.publicId = publicId
|
self.openElements.append(element)
|
||||||
doctype.systemId = systemId
|
self.document.appendChild(element)
|
||||||
|
|
||||||
|
def insertDoctype(self, token):
|
||||||
|
name = token["name"]
|
||||||
|
publicId = token["publicId"]
|
||||||
|
systemId = token["systemId"]
|
||||||
|
|
||||||
|
doctype = self.doctypeClass(name, publicId, systemId)
|
||||||
self.document.appendChild(doctype)
|
self.document.appendChild(doctype)
|
||||||
|
|
||||||
def insertComment(self, data, parent=None):
|
def insertComment(self, token, parent=None):
|
||||||
if parent is None:
|
if parent is None:
|
||||||
parent = self.openElements[-1]
|
parent = self.openElements[-1]
|
||||||
parent.appendChild(self.commentClass(data))
|
parent.appendChild(self.commentClass(token["data"]))
|
||||||
|
|
||||||
def createElement(self, name, attributes):
|
def createElement(self, token):
|
||||||
"""Create an element but don't insert it anywhere"""
|
"""Create an element but don't insert it anywhere"""
|
||||||
element = self.elementClass(name)
|
name = token["name"]
|
||||||
element.attributes = attributes
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
|
element = self.elementClass(name, namespace)
|
||||||
|
element.attributes = token["data"]
|
||||||
return element
|
return element
|
||||||
|
|
||||||
def _getInsertFromTable(self):
|
def _getInsertFromTable(self):
|
||||||
@ -238,19 +254,20 @@ class TreeBuilder(object):
|
|||||||
|
|
||||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||||
|
|
||||||
def insertElementNormal(self, name, attributes):
|
def insertElementNormal(self, token):
|
||||||
element = self.elementClass(name)
|
name = token["name"]
|
||||||
element.attributes = attributes
|
namespace = token.get("namespace", self.defaultNamespace)
|
||||||
|
element = self.elementClass(name, namespace)
|
||||||
|
element.attributes = token["data"]
|
||||||
self.openElements[-1].appendChild(element)
|
self.openElements[-1].appendChild(element)
|
||||||
self.openElements.append(element)
|
self.openElements.append(element)
|
||||||
return element
|
return element
|
||||||
|
|
||||||
def insertElementTable(self, name, attributes):
|
def insertElementTable(self, token):
|
||||||
"""Create an element and insert it into the tree"""
|
"""Create an element and insert it into the tree"""
|
||||||
element = self.elementClass(name)
|
element = self.createElement(token)
|
||||||
element.attributes = attributes
|
|
||||||
if self.openElements[-1].name not in tableInsertModeElements:
|
if self.openElements[-1].name not in tableInsertModeElements:
|
||||||
return self.insertElementNormal(name, attributes)
|
return self.insertElementNormal(token)
|
||||||
else:
|
else:
|
||||||
#We should be in the InTable mode. This means we want to do
|
#We should be in the InTable mode. This means we want to do
|
||||||
#special magic element rearranging
|
#special magic element rearranging
|
||||||
@ -267,32 +284,32 @@ class TreeBuilder(object):
|
|||||||
if parent is None:
|
if parent is None:
|
||||||
parent = self.openElements[-1]
|
parent = self.openElements[-1]
|
||||||
|
|
||||||
if (not(self.insertFromTable) or (self.insertFromTable and
|
if (not self.insertFromTable or (self.insertFromTable and
|
||||||
self.openElements[-1].name not in
|
self.openElements[-1].name
|
||||||
tableInsertModeElements)):
|
not in tableInsertModeElements)):
|
||||||
parent.insertText(data)
|
parent.insertText(data)
|
||||||
else:
|
else:
|
||||||
#We should be in the InTable mode. This means we want to do
|
# We should be in the InTable mode. This means we want to do
|
||||||
#special magic element rearranging
|
# special magic element rearranging
|
||||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
parent.insertText(data, insertBefore)
|
parent.insertText(data, insertBefore)
|
||||||
|
|
||||||
def getTableMisnestedNodePosition(self):
|
def getTableMisnestedNodePosition(self):
|
||||||
"""Get the foster parent element, and sibling to insert before
|
"""Get the foster parent element, and sibling to insert before
|
||||||
(or None) when inserting a misnested table node"""
|
(or None) when inserting a misnested table node"""
|
||||||
#The foster parent element is the one which comes before the most
|
# The foster parent element is the one which comes before the most
|
||||||
#recently opened table element
|
# recently opened table element
|
||||||
#XXX - this is really inelegant
|
# XXX - this is really inelegant
|
||||||
lastTable=None
|
lastTable=None
|
||||||
fosterParent = None
|
fosterParent = None
|
||||||
insertBefore = None
|
insertBefore = None
|
||||||
for elm in self.openElements[::-1]:
|
for elm in self.openElements[::-1]:
|
||||||
if elm.name == u"table":
|
if elm.name == "table":
|
||||||
lastTable = elm
|
lastTable = elm
|
||||||
break
|
break
|
||||||
if lastTable:
|
if lastTable:
|
||||||
#XXX - we should really check that this parent is actually a
|
# XXX - we should really check that this parent is actually a
|
||||||
#node here
|
# node here
|
||||||
if lastTable.parent:
|
if lastTable.parent:
|
||||||
fosterParent = lastTable.parent
|
fosterParent = lastTable.parent
|
||||||
insertBefore = lastTable
|
insertBefore = lastTable
|
||||||
|
469
planet/vendor/html5lib/treebuilders/dom.py
vendored
469
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -1,203 +1,292 @@
|
|||||||
import _base
|
|
||||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||||
|
import new
|
||||||
import re
|
import re
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
|
||||||
|
|
||||||
class AttrList:
|
import _base
|
||||||
def __init__(self, element):
|
from html5lib import constants, ihatexml
|
||||||
self.element = element
|
from html5lib.constants import namespaces
|
||||||
def __iter__(self):
|
|
||||||
return self.element.attributes.items().__iter__()
|
|
||||||
def __setitem__(self, name, value):
|
|
||||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
|
||||||
self.element.setAttribute(name, value)
|
|
||||||
def items(self):
|
|
||||||
return self.element.attributes.items()
|
|
||||||
def keys(self):
|
|
||||||
return self.element.attributes.keys()
|
|
||||||
def __getitem__(self, name):
|
|
||||||
return self.element.getAttribute(name)
|
|
||||||
|
|
||||||
class NodeBuilder(_base.Node):
|
moduleCache = {}
|
||||||
def __init__(self, element):
|
|
||||||
_base.Node.__init__(self, element.nodeName)
|
|
||||||
self.element = element
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
def getDomModule(DomImplementation):
|
||||||
node.parent = self
|
name = "_" + DomImplementation.__name__+"builder"
|
||||||
self.element.appendChild(node.element)
|
if name in moduleCache:
|
||||||
|
return moduleCache[name]
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
|
||||||
text = self.element.ownerDocument.createTextNode(data)
|
|
||||||
if insertBefore:
|
|
||||||
self.element.insertBefore(text, insertBefore.element)
|
|
||||||
else:
|
|
||||||
self.element.appendChild(text)
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
self.element.insertBefore(node.element, refNode.element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
if node.element.parentNode == self.element:
|
|
||||||
self.element.removeChild(node.element)
|
|
||||||
node.parent = None
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
while self.element.hasChildNodes():
|
|
||||||
child = self.element.firstChild
|
|
||||||
self.element.removeChild(child)
|
|
||||||
newParent.element.appendChild(child)
|
|
||||||
self.childNodes = []
|
|
||||||
|
|
||||||
def getAttributes(self):
|
|
||||||
return AttrList(self.element)
|
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
|
||||||
if attributes:
|
|
||||||
for name, value in attributes.items():
|
|
||||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
|
||||||
self.element.setAttribute(name, value)
|
|
||||||
|
|
||||||
attributes = property(getAttributes, setAttributes)
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
return NodeBuilder(self.element.cloneNode(False))
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
return self.element.hasChildNodes()
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
def documentClass(self):
|
|
||||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def insertDoctype(self, name, publicId, systemId):
|
|
||||||
domimpl = minidom.getDOMImplementation()
|
|
||||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
|
||||||
self.document.appendChild(NodeBuilder(doctype))
|
|
||||||
doctype.ownerDocument = self.dom
|
|
||||||
|
|
||||||
def elementClass(self, name):
|
|
||||||
return NodeBuilder(self.dom.createElement(name))
|
|
||||||
|
|
||||||
def commentClass(self, data):
|
|
||||||
return NodeBuilder(self.dom.createComment(data))
|
|
||||||
|
|
||||||
def fragmentClass(self):
|
|
||||||
return NodeBuilder(self.dom.createDocumentFragment())
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
self.dom.appendChild(node.element)
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
return self.dom
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
return _base.TreeBuilder.getFragment(self).element
|
|
||||||
|
|
||||||
def insertText(self, data, parent=None):
|
|
||||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
|
||||||
if parent <> self:
|
|
||||||
_base.TreeBuilder.insertText(self, data, parent)
|
|
||||||
else:
|
|
||||||
# HACK: allow text nodes as children of the document node
|
|
||||||
if hasattr(self.dom, '_child_node_types'):
|
|
||||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
|
||||||
self.dom._child_node_types=list(self.dom._child_node_types)
|
|
||||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
|
||||||
self.dom.appendChild(self.dom.createTextNode(data))
|
|
||||||
|
|
||||||
name = None
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
element.normalize()
|
|
||||||
rv = []
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
|
||||||
if element.name:
|
|
||||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
|
|
||||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
|
||||||
rv.append("#document")
|
|
||||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
|
||||||
rv.append("#document-fragment")
|
|
||||||
elif element.nodeType == Node.COMMENT_NODE:
|
|
||||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
|
||||||
elif element.nodeType == Node.TEXT_NODE:
|
|
||||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
|
||||||
if element.hasAttributes():
|
|
||||||
for name, value in element.attributes.items():
|
|
||||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
|
||||||
indent += 2
|
|
||||||
for child in element.childNodes:
|
|
||||||
serializeElement(child, indent)
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
||||||
|
|
||||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
|
||||||
if node.nodeType == Node.ELEMENT_NODE:
|
|
||||||
if not nsmap:
|
|
||||||
handler.startElement(node.nodeName, node.attributes)
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
handler.endElement(node.nodeName)
|
|
||||||
else:
|
else:
|
||||||
attributes = dict(node.attributes.itemsNS())
|
mod = new.module(name)
|
||||||
|
objs = getDomBuilder(DomImplementation)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
moduleCache[name] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
# gather namespace declarations
|
def getDomBuilder(DomImplementation):
|
||||||
prefixes = []
|
Dom = DomImplementation
|
||||||
for attrname in node.attributes.keys():
|
infoset_filter = ihatexml.InfosetFilter()
|
||||||
attr = node.getAttributeNode(attrname)
|
class AttrList:
|
||||||
if (attr.namespaceURI == XMLNS_NAMESPACE or
|
def __init__(self, element):
|
||||||
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
|
self.element = element
|
||||||
prefix = (attr.localName != 'xmlns' and attr.localName or None)
|
def __iter__(self):
|
||||||
handler.startPrefixMapping(prefix, attr.nodeValue)
|
return self.element.attributes.items().__iter__()
|
||||||
prefixes.append(prefix)
|
def __setitem__(self, name, value):
|
||||||
nsmap = nsmap.copy()
|
self.element.setAttribute(infoset_filter.coerceAttribute(name),
|
||||||
nsmap[prefix] = attr.nodeValue
|
infoset_filter.coerceCharacters(value))
|
||||||
del attributes[(attr.namespaceURI, attr.localName)]
|
def items(self):
|
||||||
|
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
|
||||||
|
self.element.attributes.items()]
|
||||||
|
def keys(self):
|
||||||
|
return [infoset_filter.fromXmlName(item) for item in
|
||||||
|
self.element.attributes.keys()]
|
||||||
|
def __getitem__(self, name):
|
||||||
|
name = infoset_filter.toXmlName(name)
|
||||||
|
return self.element.getAttribute(name)
|
||||||
|
|
||||||
# apply namespace declarations
|
def __contains__(self, name):
|
||||||
for attrname in node.attributes.keys():
|
if isinstance(name, tuple):
|
||||||
attr = node.getAttributeNode(attrname)
|
raise NotImplementedError
|
||||||
if attr.namespaceURI == None and ':' in attr.nodeName:
|
else:
|
||||||
prefix = attr.nodeName.split(':')[0]
|
return self.element.hasAttribute(infoset_filter.toXmlName(name))
|
||||||
if nsmap.has_key(prefix):
|
|
||||||
del attributes[(attr.namespaceURI, attr.localName)]
|
class NodeBuilder(_base.Node):
|
||||||
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
|
def __init__(self, element):
|
||||||
|
_base.Node.__init__(self, element.localName)
|
||||||
|
self.element = element
|
||||||
|
|
||||||
# SAX events
|
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
|
||||||
ns = node.namespaceURI or nsmap.get(None,None)
|
and self.element.namespaceURI or None)
|
||||||
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
|
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
handler.endElementNS((ns, node.nodeName), node.nodeName)
|
|
||||||
for prefix in prefixes: handler.endPrefixMapping(prefix)
|
|
||||||
|
|
||||||
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
|
def appendChild(self, node):
|
||||||
handler.characters(node.nodeValue)
|
node.parent = self
|
||||||
|
self.element.appendChild(node.element)
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
data=infoset_filter.coerceCharacters(data)
|
||||||
|
text = self.element.ownerDocument.createTextNode(data)
|
||||||
|
if insertBefore:
|
||||||
|
self.element.insertBefore(text, insertBefore.element)
|
||||||
|
else:
|
||||||
|
self.element.appendChild(text)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
self.element.insertBefore(node.element, refNode.element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
if node.element.parentNode == self.element:
|
||||||
|
self.element.removeChild(node.element)
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
while self.element.hasChildNodes():
|
||||||
|
child = self.element.firstChild
|
||||||
|
self.element.removeChild(child)
|
||||||
|
newParent.element.appendChild(child)
|
||||||
|
self.childNodes = []
|
||||||
|
|
||||||
|
def getAttributes(self):
|
||||||
|
return AttrList(self.element)
|
||||||
|
|
||||||
|
def setAttributes(self, attributes):
|
||||||
|
if attributes:
|
||||||
|
for name, value in attributes.items():
|
||||||
|
if isinstance(name, tuple):
|
||||||
|
if name[0] is not None:
|
||||||
|
qualifiedName = (name[0] + ":" +
|
||||||
|
infoset_filter.coerceAttribute(
|
||||||
|
name[1]))
|
||||||
|
else:
|
||||||
|
qualifiedName = infoset_filter.coerceAttribute(
|
||||||
|
name[1])
|
||||||
|
self.element.setAttributeNS(name[2], qualifiedName,
|
||||||
|
value)
|
||||||
|
else:
|
||||||
|
self.element.setAttribute(
|
||||||
|
infoset_filter.coerceAttribute(name), value)
|
||||||
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
return NodeBuilder(self.element.cloneNode(False))
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
return self.element.hasChildNodes()
|
||||||
|
|
||||||
elif node.nodeType == Node.DOCUMENT_NODE:
|
def getNameTuple(self):
|
||||||
handler.startDocument()
|
if self.namespace == None:
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
return namespaces["html"], self.name
|
||||||
handler.endDocument()
|
else:
|
||||||
|
return self.namespace, self.name
|
||||||
|
|
||||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
nameTuple = property(getNameTuple)
|
||||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
|
||||||
|
|
||||||
else:
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
# ATTRIBUTE_NODE
|
def documentClass(self):
|
||||||
# ENTITY_NODE
|
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
|
||||||
# PROCESSING_INSTRUCTION_NODE
|
return self
|
||||||
# COMMENT_NODE
|
|
||||||
# DOCUMENT_TYPE_NODE
|
def insertDoctype(self, token):
|
||||||
# NOTATION_NODE
|
name = token["name"]
|
||||||
pass
|
publicId = token["publicId"]
|
||||||
|
systemId = token["systemId"]
|
||||||
|
|
||||||
|
domimpl = Dom.getDOMImplementation()
|
||||||
|
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||||
|
self.document.appendChild(NodeBuilder(doctype))
|
||||||
|
if Dom == minidom:
|
||||||
|
doctype.ownerDocument = self.dom
|
||||||
|
|
||||||
|
def elementClass(self, name, namespace=None):
|
||||||
|
if namespace is None and self.defaultNamespace is None:
|
||||||
|
node = self.dom.createElement(name)
|
||||||
|
else:
|
||||||
|
node = self.dom.createElementNS(namespace, name)
|
||||||
|
|
||||||
|
return NodeBuilder(node)
|
||||||
|
|
||||||
|
def commentClass(self, data):
|
||||||
|
return NodeBuilder(self.dom.createComment(data))
|
||||||
|
|
||||||
|
def fragmentClass(self):
|
||||||
|
return NodeBuilder(self.dom.createDocumentFragment())
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self.dom.appendChild(node.element)
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.dom
|
||||||
|
|
||||||
|
def getFragment(self):
|
||||||
|
return _base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def insertText(self, data, parent=None):
|
||||||
|
data=infoset_filter.coerceCharacters(data)
|
||||||
|
if parent <> self:
|
||||||
|
_base.TreeBuilder.insertText(self, data, parent)
|
||||||
|
else:
|
||||||
|
# HACK: allow text nodes as children of the document node
|
||||||
|
if hasattr(self.dom, '_child_node_types'):
|
||||||
|
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||||
|
self.dom._child_node_types=list(self.dom._child_node_types)
|
||||||
|
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||||
|
self.dom.appendChild(self.dom.createTextNode(data))
|
||||||
|
|
||||||
|
name = None
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
element.normalize()
|
||||||
|
rv = []
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||||
|
if element.name:
|
||||||
|
if element.publicId or element.systemId:
|
||||||
|
publicId = element.publicId or ""
|
||||||
|
systemId = element.systemId or ""
|
||||||
|
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
|
||||||
|
' '*indent, element.name, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
|
||||||
|
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||||
|
rv.append("#document")
|
||||||
|
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||||
|
rv.append("#document-fragment")
|
||||||
|
elif element.nodeType == Node.COMMENT_NODE:
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||||
|
elif element.nodeType == Node.TEXT_NODE:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||||
|
else:
|
||||||
|
if (hasattr(element, "namespaceURI") and
|
||||||
|
element.namespaceURI not in (None,
|
||||||
|
constants.namespaces["html"])):
|
||||||
|
name = "%s %s"%(constants.prefixes[element.namespaceURI],
|
||||||
|
element.nodeName)
|
||||||
|
else:
|
||||||
|
name = element.nodeName
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, name))
|
||||||
|
if element.hasAttributes():
|
||||||
|
i = 0
|
||||||
|
attr = element.attributes.item(i)
|
||||||
|
while attr:
|
||||||
|
name = infoset_filter.fromXmlName(attr.localName)
|
||||||
|
value = attr.value
|
||||||
|
ns = attr.namespaceURI
|
||||||
|
if ns:
|
||||||
|
name = "%s %s"%(constants.prefixes[ns], name)
|
||||||
|
i += 1
|
||||||
|
attr = element.attributes.item(i)
|
||||||
|
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.childNodes:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||||
|
if node.nodeType == Node.ELEMENT_NODE:
|
||||||
|
if not nsmap:
|
||||||
|
handler.startElement(node.nodeName, node.attributes)
|
||||||
|
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||||
|
handler.endElement(node.nodeName)
|
||||||
|
else:
|
||||||
|
attributes = dict(node.attributes.itemsNS())
|
||||||
|
|
||||||
|
# gather namespace declarations
|
||||||
|
prefixes = []
|
||||||
|
for attrname in node.attributes.keys():
|
||||||
|
attr = node.getAttributeNode(attrname)
|
||||||
|
if (attr.namespaceURI == XMLNS_NAMESPACE or
|
||||||
|
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
|
||||||
|
prefix = (attr.localName != 'xmlns' and attr.localName or None)
|
||||||
|
handler.startPrefixMapping(prefix, attr.nodeValue)
|
||||||
|
prefixes.append(prefix)
|
||||||
|
nsmap = nsmap.copy()
|
||||||
|
nsmap[prefix] = attr.nodeValue
|
||||||
|
del attributes[(attr.namespaceURI, attr.localName)]
|
||||||
|
|
||||||
|
# apply namespace declarations
|
||||||
|
for attrname in node.attributes.keys():
|
||||||
|
attr = node.getAttributeNode(attrname)
|
||||||
|
if attr.namespaceURI == None and ':' in attr.nodeName:
|
||||||
|
prefix = attr.nodeName.split(':')[0]
|
||||||
|
if nsmap.has_key(prefix):
|
||||||
|
del attributes[(attr.namespaceURI, attr.localName)]
|
||||||
|
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
|
||||||
|
|
||||||
|
# SAX events
|
||||||
|
ns = node.namespaceURI or nsmap.get(None,None)
|
||||||
|
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
|
||||||
|
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||||
|
handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||||
|
for prefix in prefixes: handler.endPrefixMapping(prefix)
|
||||||
|
|
||||||
|
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
|
||||||
|
handler.characters(node.nodeValue)
|
||||||
|
|
||||||
|
elif node.nodeType == Node.DOCUMENT_NODE:
|
||||||
|
handler.startDocument()
|
||||||
|
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||||
|
handler.endDocument()
|
||||||
|
|
||||||
|
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||||
|
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# ATTRIBUTE_NODE
|
||||||
|
# ENTITY_NODE
|
||||||
|
# PROCESSING_INSTRUCTION_NODE
|
||||||
|
# COMMENT_NODE
|
||||||
|
# DOCUMENT_TYPE_NODE
|
||||||
|
# NOTATION_NODE
|
||||||
|
pass
|
||||||
|
|
||||||
|
return locals()
|
||||||
|
|
||||||
|
# Keep backwards compatibility with things that directly load
|
||||||
|
# classes/functions from this module
|
||||||
|
for key, value in getDomModule(minidom).__dict__.items():
|
||||||
|
globals()[key] = value
|
||||||
|
100
planet/vendor/html5lib/treebuilders/etree.py
vendored
100
planet/vendor/html5lib/treebuilders/etree.py
vendored
@ -1,5 +1,12 @@
|
|||||||
import _base
|
|
||||||
import new
|
import new
|
||||||
|
import re
|
||||||
|
|
||||||
|
import _base
|
||||||
|
from html5lib import ihatexml
|
||||||
|
from html5lib import constants
|
||||||
|
from html5lib.constants import namespaces
|
||||||
|
|
||||||
|
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||||
|
|
||||||
moduleCache = {}
|
moduleCache = {}
|
||||||
|
|
||||||
@ -17,20 +24,43 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
|
|||||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||||
ElementTree = ElementTreeImplementation
|
ElementTree = ElementTreeImplementation
|
||||||
class Element(_base.Node):
|
class Element(_base.Node):
|
||||||
def __init__(self, name):
|
def __init__(self, name, namespace=None):
|
||||||
self._element = ElementTree.Element(name)
|
self._name = name
|
||||||
self.name = name
|
self._namespace = namespace
|
||||||
|
self._element = ElementTree.Element(self._getETreeTag(name,
|
||||||
|
namespace))
|
||||||
|
if namespace is None:
|
||||||
|
self.nameTuple = namespaces["html"], self._name
|
||||||
|
else:
|
||||||
|
self.nameTuple = self._namespace, self._name
|
||||||
self.parent = None
|
self.parent = None
|
||||||
self._childNodes = []
|
self._childNodes = []
|
||||||
self._flags = []
|
self._flags = []
|
||||||
|
|
||||||
|
def _getETreeTag(self, name, namespace):
|
||||||
|
if namespace is None:
|
||||||
|
etree_tag = name
|
||||||
|
else:
|
||||||
|
etree_tag = "{%s}%s"%(namespace, name)
|
||||||
|
return etree_tag
|
||||||
|
|
||||||
def _setName(self, name):
|
def _setName(self, name):
|
||||||
self._element.tag = name
|
self._name = name
|
||||||
|
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||||
|
|
||||||
def _getName(self):
|
def _getName(self):
|
||||||
return self._element.tag
|
return self._name
|
||||||
|
|
||||||
name = property(_getName, _setName)
|
name = property(_getName, _setName)
|
||||||
|
|
||||||
|
def _setNamespace(self, namespace):
|
||||||
|
self._namespace = namespace
|
||||||
|
self._element.tag = self._getETreeTag(self._name, self._namespace)
|
||||||
|
|
||||||
|
def _getNamespace(self):
|
||||||
|
return self._namespace
|
||||||
|
|
||||||
|
namespace = property(_getNamespace, _setNamespace)
|
||||||
|
|
||||||
def _getAttributes(self):
|
def _getAttributes(self):
|
||||||
return self._element.attrib
|
return self._element.attrib
|
||||||
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
for key in self._element.attrib.keys():
|
for key in self._element.attrib.keys():
|
||||||
del self._element.attrib[key]
|
del self._element.attrib[key]
|
||||||
for key, value in attributes.iteritems():
|
for key, value in attributes.iteritems():
|
||||||
self._element.set(key, value)
|
if isinstance(key, tuple):
|
||||||
|
name = "{%s}%s"%(key[2], key[1])
|
||||||
|
else:
|
||||||
|
name = key
|
||||||
|
self._element.set(name, value)
|
||||||
|
|
||||||
attributes = property(_getAttributes, _setAttributes)
|
attributes = property(_getAttributes, _setAttributes)
|
||||||
|
|
||||||
def _getChildNodes(self):
|
def _getChildNodes(self):
|
||||||
return self._childNodes
|
return self._childNodes
|
||||||
|
|
||||||
def _setChildNodes(self, value):
|
def _setChildNodes(self, value):
|
||||||
del self._element[:]
|
del self._element[:]
|
||||||
self._childNodes = []
|
self._childNodes = []
|
||||||
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
data = property(_getData, _setData)
|
data = property(_getData, _setData)
|
||||||
|
|
||||||
class DocumentType(Element):
|
class DocumentType(Element):
|
||||||
def __init__(self, name):
|
def __init__(self, name, publicId, systemId):
|
||||||
Element.__init__(self, "<!DOCTYPE>")
|
Element.__init__(self, "<!DOCTYPE>")
|
||||||
self._element.text = name
|
self._element.text = name
|
||||||
|
self.publicId = publicId
|
||||||
|
self.systemId = systemId
|
||||||
|
|
||||||
def _getPublicId(self):
|
def _getPublicId(self):
|
||||||
return self._element.get(u"publicId", None)
|
return self._element.get(u"publicId", "")
|
||||||
|
|
||||||
def _setPublicId(self, value):
|
def _setPublicId(self, value):
|
||||||
if value is not None:
|
if value is not None:
|
||||||
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
publicId = property(_getPublicId, _setPublicId)
|
publicId = property(_getPublicId, _setPublicId)
|
||||||
|
|
||||||
def _getSystemId(self):
|
def _getSystemId(self):
|
||||||
return self._element.get(u"systemId", None)
|
return self._element.get(u"systemId", "")
|
||||||
|
|
||||||
def _setSystemId(self, value):
|
def _setSystemId(self, value):
|
||||||
if value is not None:
|
if value is not None:
|
||||||
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
if not(hasattr(element, "tag")):
|
if not(hasattr(element, "tag")):
|
||||||
element = element.getroot()
|
element = element.getroot()
|
||||||
if element.tag == "<!DOCTYPE>":
|
if element.tag == "<!DOCTYPE>":
|
||||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
if element.get("publicId") or element.get("systemId"):
|
||||||
|
publicId = element.get("publicId") or ""
|
||||||
|
systemId = element.get("systemId") or ""
|
||||||
|
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
|
||||||
|
element.text, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||||
elif element.tag == "<DOCUMENT_ROOT>":
|
elif element.tag == "<DOCUMENT_ROOT>":
|
||||||
rv.append("#document")
|
rv.append("#document")
|
||||||
if element.text:
|
if element.text:
|
||||||
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
elif type(element.tag) == type(ElementTree.Comment):
|
elif type(element.tag) == type(ElementTree.Comment):
|
||||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||||
else:
|
else:
|
||||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
nsmatch = tag_regexp.match(element.tag)
|
||||||
|
|
||||||
|
if nsmatch is None:
|
||||||
|
name = element.tag
|
||||||
|
else:
|
||||||
|
ns, name = nsmatch.groups()
|
||||||
|
prefix = constants.prefixes[ns]
|
||||||
|
if prefix != "html":
|
||||||
|
name = "%s %s"%(prefix, name)
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, name))
|
||||||
|
|
||||||
if hasattr(element, "attrib"):
|
if hasattr(element, "attrib"):
|
||||||
for name, value in element.attrib.iteritems():
|
for name, value in element.attrib.iteritems():
|
||||||
|
nsmatch = tag_regexp.match(name)
|
||||||
|
if nsmatch is not None:
|
||||||
|
ns, name = nsmatch.groups()
|
||||||
|
prefix = constants.prefixes[ns]
|
||||||
|
name = "%s %s"%(prefix, name)
|
||||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||||
@ -201,12 +257,19 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
"""Serialize an element and its child nodes to a string"""
|
"""Serialize an element and its child nodes to a string"""
|
||||||
rv = []
|
rv = []
|
||||||
finalText = None
|
finalText = None
|
||||||
|
filter = ihatexml.InfosetFilter()
|
||||||
def serializeElement(element):
|
def serializeElement(element):
|
||||||
if type(element) == type(ElementTree.ElementTree):
|
if type(element) == type(ElementTree.ElementTree):
|
||||||
element = element.getroot()
|
element = element.getroot()
|
||||||
|
|
||||||
if element.tag == "<!DOCTYPE>":
|
if element.tag == "<!DOCTYPE>":
|
||||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
if element.get("publicId") or element.get("systemId"):
|
||||||
|
publicId = element.get("publicId") or ""
|
||||||
|
systemId = element.get("systemId") or ""
|
||||||
|
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
|
||||||
|
element.text, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||||
elif element.tag == "<DOCUMENT_ROOT>":
|
elif element.tag == "<DOCUMENT_ROOT>":
|
||||||
if element.text:
|
if element.text:
|
||||||
rv.append(element.text)
|
rv.append(element.text)
|
||||||
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||||||
else:
|
else:
|
||||||
#This is assumed to be an ordinary element
|
#This is assumed to be an ordinary element
|
||||||
if not element.attrib:
|
if not element.attrib:
|
||||||
rv.append("<%s>"%(element.tag,))
|
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
|
||||||
else:
|
else:
|
||||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
attr = " ".join(["%s=\"%s\""%(
|
||||||
|
filter.fromXmlName(name), value)
|
||||||
for name, value in element.attrib.iteritems()])
|
for name, value in element.attrib.iteritems()])
|
||||||
rv.append("<%s %s>"%(element.tag, attr))
|
rv.append("<%s %s>"%(element.tag, attr))
|
||||||
if element.text:
|
if element.text:
|
||||||
|
331
planet/vendor/html5lib/treebuilders/etree_lxml.py
vendored
Normal file
331
planet/vendor/html5lib/treebuilders/etree_lxml.py
vendored
Normal file
@ -0,0 +1,331 @@
|
|||||||
|
import new
|
||||||
|
import warnings
|
||||||
|
import re
|
||||||
|
|
||||||
|
import _base
|
||||||
|
from html5lib.constants import DataLossWarning
|
||||||
|
import html5lib.constants as constants
|
||||||
|
import etree as etree_builders
|
||||||
|
from html5lib import ihatexml
|
||||||
|
|
||||||
|
try:
|
||||||
|
import lxml.etree as etree
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
fullTree = True
|
||||||
|
|
||||||
|
"""Module for supporting the lxml.etree library. The idea here is to use as much
|
||||||
|
of the native library as possible, without using fragile hacks like custom element
|
||||||
|
names that break between releases. The downside of this is that we cannot represent
|
||||||
|
all possible trees; specifically the following are known to cause problems:
|
||||||
|
|
||||||
|
Text or comments as siblings of the root element
|
||||||
|
Docypes with no name
|
||||||
|
|
||||||
|
When any of these things occur, we emit a DataLossWarning
|
||||||
|
"""
|
||||||
|
|
||||||
|
class DocumentType(object):
|
||||||
|
def __init__(self, name, publicId, systemId):
|
||||||
|
self.name = name
|
||||||
|
self.publicId = publicId
|
||||||
|
self.systemId = systemId
|
||||||
|
|
||||||
|
class Document(object):
|
||||||
|
def __init__(self):
|
||||||
|
self._elementTree = None
|
||||||
|
self._childNodes = []
|
||||||
|
|
||||||
|
def appendChild(self, element):
|
||||||
|
self._elementTree.getroot().addnext(element._element)
|
||||||
|
|
||||||
|
def _getChildNodes(self):
|
||||||
|
return self._childNodes
|
||||||
|
|
||||||
|
childNodes = property(_getChildNodes)
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
filter = ihatexml.InfosetFilter()
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if not hasattr(element, "tag"):
|
||||||
|
if hasattr(element, "getroot"):
|
||||||
|
#Full tree case
|
||||||
|
rv.append("#document")
|
||||||
|
if element.docinfo.internalDTD:
|
||||||
|
if not (element.docinfo.public_id or
|
||||||
|
element.docinfo.system_url):
|
||||||
|
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
|
||||||
|
else:
|
||||||
|
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
|
||||||
|
element.docinfo.root_name,
|
||||||
|
element.docinfo.public_id,
|
||||||
|
element.docinfo.system_url)
|
||||||
|
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
|
||||||
|
next_element = element.getroot()
|
||||||
|
while next_element.getprevious() is not None:
|
||||||
|
next_element = next_element.getprevious()
|
||||||
|
while next_element is not None:
|
||||||
|
serializeElement(next_element, indent+2)
|
||||||
|
next_element = next_element.getnext()
|
||||||
|
elif isinstance(element, basestring):
|
||||||
|
#Text in a fragment
|
||||||
|
rv.append("|%s\"%s\""%(' '*indent, element))
|
||||||
|
else:
|
||||||
|
#Fragment case
|
||||||
|
rv.append("#document-fragment")
|
||||||
|
for next_element in element:
|
||||||
|
serializeElement(next_element, indent+2)
|
||||||
|
elif type(element.tag) == type(etree.Comment):
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||||
|
else:
|
||||||
|
nsmatch = etree_builders.tag_regexp.match(element.tag)
|
||||||
|
if nsmatch is not None:
|
||||||
|
ns = nsmatch.group(1)
|
||||||
|
tag = nsmatch.group(2)
|
||||||
|
prefix = constants.prefixes[ns]
|
||||||
|
if prefix != "html":
|
||||||
|
rv.append("|%s<%s %s>"%(' '*indent, prefix,
|
||||||
|
filter.fromXmlName(tag)))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent,
|
||||||
|
filter.fromXmlName(tag)))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent,
|
||||||
|
filter.fromXmlName(element.tag)))
|
||||||
|
|
||||||
|
if hasattr(element, "attrib"):
|
||||||
|
for name, value in element.attrib.iteritems():
|
||||||
|
nsmatch = etree_builders.tag_regexp.match(name)
|
||||||
|
if nsmatch:
|
||||||
|
ns = nsmatch.group(1)
|
||||||
|
name = nsmatch.group(2)
|
||||||
|
prefix = constants.prefixes[ns]
|
||||||
|
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
|
||||||
|
prefix,
|
||||||
|
filter.fromXmlName(name),
|
||||||
|
value))
|
||||||
|
else:
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2),
|
||||||
|
filter.fromXmlName(name),
|
||||||
|
value))
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||||
|
indent += 2
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child, indent)
|
||||||
|
if hasattr(element, "tail") and element.tail:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
def tostring(element):
    """Serialize an element and its child nodes to a string

    ``element`` may be either a single node (anything with a ``tag``
    attribute) or a whole tree (no ``tag`` attribute, but ``docinfo`` and
    ``getroot()``).  For a tree, the doctype is emitted first when
    ``docinfo.internalDTD`` is set, followed by the root element.

    Text, tails and attribute values are appended exactly as stored; no
    escaping is applied here.  Every ordinary element is written with an
    explicit start and end tag.

    Returns the pieces joined into a single string.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            #No tag: this is the tree wrapper, not a node
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    #No doctype string available; build a minimal one from
                    #the root element name
                    dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif type(element.tag) == type(etree.Comment):
            #Comment nodes carry the comment factory function as their tag
            rv.append("<!--%s-->"%(element.text,))

        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                                 for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)

            #Iterating an element yields its children in document order
            for child in element:
                serializeElement(child)

            rv.append("</%s>"%(element.tag,))

        #Text following the node's end tag belongs to the node itself
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
|
||||||
|
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces an lxml tree

    Element and comment wrapper classes are created per instance in
    ``__init__`` because they close over that instance's ``InfosetFilter``,
    which coerces names, text and comments before they are handed to lxml.

    Comments and the doctype seen before the root element are only
    recorded; ``insertRoot`` replays them when it creates the document.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, namespaceHTMLElements, fullTree = False):
        """Create the per-instance node classes and initialise the builder

        namespaceHTMLElements - passed through to the base TreeBuilder
        fullTree - passed through to etree_builders.getETreeModule
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        filter = self.filter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def etreeAttributeName(key):
            #Tuple keys are namespaced attributes: the local name is at
            #index 1 and the namespace at index 2
            if isinstance(key, tuple):
                return "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
            else:
                return filter.coerceAttribute(key)

        class Attributes(dict):
            """dict that mirrors every item into the wrapped lxml element"""
            def __init__(self, element, value=None):
                self._element = element
                #Avoid a shared mutable default argument
                if value is None:
                    value = {}
                dict.__init__(self, value)
                for key, attrValue in self.items():
                    name = etreeAttributeName(key)
                    self._element._element.attrib[name] = attrValue

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                name = etreeAttributeName(key)
                self._element._element.attrib[name] = value

        class Element(builder.Element):
            """Element wrapper that coerces names and text through the filter"""
            def __init__(self, name, namespace):
                name = filter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = filter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return self._name

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = filter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            """Comment wrapper that coerces its data through the filter"""
            def __init__(self, data):
                data = filter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = filter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        #Register the coercing Comment defined above.  Registering
        #builder.Comment instead left that class unused, so comment data
        #reached lxml without passing through the filter.
        self.commentClass = Comment
        #self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Reset the builder and go back to recording pre-root comments"""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        """Return the module-level testSerializer output for element"""
        return testSerializer(element)

    def getDocument(self):
        """Return the built document, either the whole tree or its root"""
        #NOTE(review): fullTree here is not the __init__ argument; it is
        #looked up as a module-level name that is not visible in this part
        #of the file - confirm it is defined and is the intended switch
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's text, children and tail as a list"""
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype described by token for use in insertRoot"""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name or ihatexml.nonXmlBMPRegexp.search(name):
            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Record a comment seen before the root element exists

        insertRoot reads the "data" item of each recorded entry.
        """
        self.initial_comments.append(data)

    def insertRoot(self, token):
        """Create the document root"""
        #Because of the way libxml2 works, it doesn't seem to be possible to
        #alter information like the doctype after the tree has been parsed.
        #Therefore we need to use the built-in parser to create our initial
        #tree, after which we can add elements like normal
        docStr = ""
        if self.doctype and self.doctype.name:
            docStr += "<!DOCTYPE %s"%self.doctype.name
            if (self.doctype.publicId is not None or
                self.doctype.systemId is not None):
                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
                                               self.doctype.systemId or "")
            docStr += ">"
        #TODO - this needs to work when elements are not put into the default ns
        docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"

        try:
            root = etree.fromstring(docStr)
        except etree.XMLSyntaxError:
            #Show the string that failed to parse before re-raising
            print(docStr)
            raise

        #Append the initial comments:
        for comment_token in self.initial_comments:
            root.addprevious(etree.Comment(comment_token["data"]))

        #Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        #Add the root element to the internal child/open data structures
        namespace = token.get("namespace", None)
        root_element = self.elementClass(token["name"], namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        #Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment
|
@ -1,5 +1,5 @@
|
|||||||
import _base
|
import _base
|
||||||
from html5lib.constants import voidElements
|
from html5lib.constants import voidElements, namespaces, prefixes
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
# Really crappy basic implementation of a DOM-core like thing
|
# Really crappy basic implementation of a DOM-core like thing
|
||||||
@ -63,6 +63,8 @@ class Node(_base.Node):
|
|||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
newNode = type(self)(self.name)
|
newNode = type(self)(self.name)
|
||||||
|
if hasattr(self, 'namespace'):
|
||||||
|
newNode.namespace = self.namespace
|
||||||
if hasattr(self, 'attributes'):
|
if hasattr(self, 'attributes'):
|
||||||
for attr, value in self.attributes.iteritems():
|
for attr, value in self.attributes.iteritems():
|
||||||
newNode.attributes[attr] = value
|
newNode.attributes[attr] = value
|
||||||
@ -73,6 +75,14 @@ class Node(_base.Node):
|
|||||||
"""Return true if the node has children or text"""
|
"""Return true if the node has children or text"""
|
||||||
return bool(self.childNodes)
|
return bool(self.childNodes)
|
||||||
|
|
||||||
|
def getNameTuple(self):
|
||||||
|
if self.namespace == None:
|
||||||
|
return namespaces["html"], self.name
|
||||||
|
else:
|
||||||
|
return self.namespace, self.name
|
||||||
|
|
||||||
|
nameTuple = property(getNameTuple)
|
||||||
|
|
||||||
class Document(Node):
|
class Document(Node):
|
||||||
type = 1
|
type = 1
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -81,6 +91,9 @@ class Document(Node):
|
|||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return "#document"
|
return "#document"
|
||||||
|
|
||||||
|
def appendChild(self, child):
|
||||||
|
Node.appendChild(self, child)
|
||||||
|
|
||||||
def toxml(self, encoding="utf=8"):
|
def toxml(self, encoding="utf=8"):
|
||||||
result = ""
|
result = ""
|
||||||
for child in self.childNodes:
|
for child in self.childNodes:
|
||||||
@ -106,13 +119,21 @@ class DocumentFragment(Document):
|
|||||||
|
|
||||||
class DocumentType(Node):
|
class DocumentType(Node):
|
||||||
type = 3
|
type = 3
|
||||||
def __init__(self, name):
|
def __init__(self, name, publicId, systemId):
|
||||||
Node.__init__(self, name)
|
Node.__init__(self, name)
|
||||||
self.publicId = u""
|
self.publicId = publicId
|
||||||
self.systemId = u""
|
self.systemId = systemId
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return u"<!DOCTYPE %s>" % self.name
|
if self.publicId or self.systemId:
|
||||||
|
publicId = self.publicId or ""
|
||||||
|
systemId = self.systemId or ""
|
||||||
|
return """<!DOCTYPE %s "%s" "%s">"""%(
|
||||||
|
self.name, publicId, systemId)
|
||||||
|
|
||||||
|
else:
|
||||||
|
return u"<!DOCTYPE %s>" % self.name
|
||||||
|
|
||||||
|
|
||||||
toxml = __unicode__
|
toxml = __unicode__
|
||||||
|
|
||||||
@ -135,12 +156,16 @@ class TextNode(Node):
|
|||||||
|
|
||||||
class Element(Node):
|
class Element(Node):
|
||||||
type = 5
|
type = 5
|
||||||
def __init__(self, name):
|
def __init__(self, name, namespace=None):
|
||||||
Node.__init__(self, name)
|
Node.__init__(self, name)
|
||||||
|
self.namespace = namespace
|
||||||
self.attributes = {}
|
self.attributes = {}
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return u"<%s>" % self.name
|
if self.namespace in (None, namespaces["html"]):
|
||||||
|
return u"<%s>" % self.name
|
||||||
|
else:
|
||||||
|
return u"<%s %s>"%(prefixes[self.namespace], self.name)
|
||||||
|
|
||||||
def toxml(self):
|
def toxml(self):
|
||||||
result = '<' + self.name
|
result = '<' + self.name
|
||||||
@ -174,6 +199,8 @@ class Element(Node):
|
|||||||
indent += 2
|
indent += 2
|
||||||
if self.attributes:
|
if self.attributes:
|
||||||
for name, value in self.attributes.iteritems():
|
for name, value in self.attributes.iteritems():
|
||||||
|
if isinstance(name, tuple):
|
||||||
|
name = "%s %s"%(name[0], name[1])
|
||||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||||
for child in self.childNodes:
|
for child in self.childNodes:
|
||||||
tree += child.printTree(indent)
|
tree += child.printTree(indent)
|
||||||
|
107
planet/vendor/html5lib/treebuilders/soup.py
vendored
107
planet/vendor/html5lib/treebuilders/soup.py
vendored
@ -1,6 +1,9 @@
|
|||||||
|
import warnings
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||||
|
|
||||||
import _base
|
import _base
|
||||||
|
from html5lib.constants import namespaces, DataLossWarning
|
||||||
|
|
||||||
class AttrList(object):
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
@ -22,22 +25,39 @@ class AttrList(object):
|
|||||||
|
|
||||||
|
|
||||||
class Element(_base.Node):
|
class Element(_base.Node):
|
||||||
def __init__(self, element, soup):
|
def __init__(self, element, soup, namespace):
|
||||||
_base.Node.__init__(self, element.name)
|
_base.Node.__init__(self, element.name)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup=soup
|
self.soup = soup
|
||||||
|
self.namespace = namespace
|
||||||
|
|
||||||
|
def _nodeIndex(self, node, refNode):
|
||||||
|
# Finds a node by identity rather than equality
|
||||||
|
for index in range(len(self.element.contents)):
|
||||||
|
if id(self.element.contents[index]) == id(refNode.element):
|
||||||
|
return index
|
||||||
|
return None
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
if (node.element.__class__ == NavigableString and self.element.contents
|
if (node.element.__class__ == NavigableString and self.element.contents
|
||||||
and self.element.contents[-1].__class__ == NavigableString):
|
and self.element.contents[-1].__class__ == NavigableString):
|
||||||
newNode = TextNode(NavigableString(
|
# Concatenate new text onto old text node
|
||||||
self.element.contents[-1]+node.element), self.soup)
|
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
|
||||||
self.element.contents[-1].extract()
|
newStr = NavigableString(self.element.contents[-1]+node.element)
|
||||||
self.appendChild(newNode)
|
|
||||||
|
# Remove the old text node
|
||||||
|
# (Can't simply use .extract() by itself, because it fails if
|
||||||
|
# an equal text node exists within the parent node)
|
||||||
|
oldElement = self.element.contents[-1]
|
||||||
|
del self.element.contents[-1]
|
||||||
|
oldElement.parent = None
|
||||||
|
oldElement.extract()
|
||||||
|
|
||||||
|
self.element.insert(len(self.element.contents), newStr)
|
||||||
else:
|
else:
|
||||||
self.element.insert(len(self.element.contents), node.element)
|
self.element.insert(len(self.element.contents), node.element)
|
||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
@ -56,18 +76,25 @@ class Element(_base.Node):
|
|||||||
self.appendChild(text)
|
self.appendChild(text)
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self.element.contents.index(refNode.element)
|
index = self._nodeIndex(node, refNode)
|
||||||
if (node.element.__class__ == NavigableString and self.element.contents
|
if (node.element.__class__ == NavigableString and self.element.contents
|
||||||
and self.element.contents[index-1].__class__ == NavigableString):
|
and self.element.contents[index-1].__class__ == NavigableString):
|
||||||
newNode = TextNode(NavigableString(
|
# (See comments in appendChild)
|
||||||
self.element.contents[index-1]+node.element), self.soup)
|
newStr = NavigableString(self.element.contents[index-1]+node.element)
|
||||||
self.element.contents[index-1].extract()
|
oldNode = self.element.contents[index-1]
|
||||||
self.insertBefore(newNode, refNode)
|
del self.element.contents[index-1]
|
||||||
|
oldNode.parent = None
|
||||||
|
oldNode.extract()
|
||||||
|
|
||||||
|
self.element.insert(index-1, newStr)
|
||||||
else:
|
else:
|
||||||
self.element.insert(index, node.element)
|
self.element.insert(index, node.element)
|
||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
def removeChild(self, node):
|
def removeChild(self, node):
|
||||||
|
index = self._nodeIndex(node.parent, node)
|
||||||
|
del node.parent.element.contents[index]
|
||||||
|
node.element.parent = None
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
node.parent = None
|
node.parent = None
|
||||||
|
|
||||||
@ -76,12 +103,12 @@ class Element(_base.Node):
|
|||||||
child = self.element.contents[0]
|
child = self.element.contents[0]
|
||||||
child.extract()
|
child.extract()
|
||||||
if isinstance(child, Tag):
|
if isinstance(child, Tag):
|
||||||
newParent.appendChild(Element(child, self.soup))
|
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
|
||||||
else:
|
else:
|
||||||
newParent.appendChild(TextNode(child, self.soup))
|
newParent.appendChild(TextNode(child, self.soup))
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
node = Element(Tag(self.soup, self.element.name), self.soup)
|
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
|
||||||
for key,value in self.attributes:
|
for key,value in self.attributes:
|
||||||
node.attributes[key] = value
|
node.attributes[key] = value
|
||||||
return node
|
return node
|
||||||
@ -89,11 +116,19 @@ class Element(_base.Node):
|
|||||||
def hasContent(self):
|
def hasContent(self):
|
||||||
return self.element.contents
|
return self.element.contents
|
||||||
|
|
||||||
|
def getNameTuple(self):
|
||||||
|
if self.namespace == None:
|
||||||
|
return namespaces["html"], self.name
|
||||||
|
else:
|
||||||
|
return self.namespace, self.name
|
||||||
|
|
||||||
|
nameTuple = property(getNameTuple)
|
||||||
|
|
||||||
class TextNode(Element):
|
class TextNode(Element):
|
||||||
def __init__(self, element, soup):
|
def __init__(self, element, soup):
|
||||||
_base.Node.__init__(self, None)
|
_base.Node.__init__(self, None)
|
||||||
self.element = element
|
self.element = element
|
||||||
self.soup=soup
|
self.soup = soup
|
||||||
|
|
||||||
def cloneNode(self):
|
def cloneNode(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
@ -101,13 +136,25 @@ class TextNode(Element):
|
|||||||
class TreeBuilder(_base.TreeBuilder):
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
self.soup = BeautifulSoup("")
|
||||||
return Element(self.soup, self.soup)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
def insertDoctype(self, name, publicId, systemId):
|
def insertDoctype(self, token):
|
||||||
self.soup.insert(0, Declaration(name))
|
name = token["name"]
|
||||||
|
publicId = token["publicId"]
|
||||||
|
systemId = token["systemId"]
|
||||||
|
|
||||||
|
if publicId:
|
||||||
|
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
||||||
|
elif systemId:
|
||||||
|
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
|
||||||
|
(name, systemId)))
|
||||||
|
else:
|
||||||
|
self.soup.insert(0, Declaration(name))
|
||||||
|
|
||||||
def elementClass(self, name):
|
def elementClass(self, name, namespace):
|
||||||
return Element(Tag(self.soup, name), self.soup)
|
if namespace not in (None, namespaces["html"]):
|
||||||
|
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
|
||||||
|
return Element(Tag(self.soup, name), self.soup, namespace)
|
||||||
|
|
||||||
def commentClass(self, data):
|
def commentClass(self, data):
|
||||||
return TextNode(Comment(data), self.soup)
|
return TextNode(Comment(data), self.soup)
|
||||||
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
def fragmentClass(self):
|
def fragmentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
self.soup = BeautifulSoup("")
|
||||||
self.soup.name = "[document_fragment]"
|
self.soup.name = "[document_fragment]"
|
||||||
return Element(self.soup, self.soup)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
self.soup.insert(len(self.soup.contents), node.element)
|
self.soup.insert(len(self.soup.contents), node.element)
|
||||||
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
return _base.TreeBuilder.getFragment(self).element
|
return _base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
def testSerializer(element):
|
def testSerializer(element):
|
||||||
|
import re
|
||||||
rv = []
|
rv = []
|
||||||
def serializeElement(element, indent=0):
|
def serializeElement(element, indent=0):
|
||||||
if isinstance(element, Declaration):
|
if isinstance(element, Declaration):
|
||||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
|
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
||||||
|
m = re.compile(doctype_regexp).match(element.string)
|
||||||
|
assert m is not None, "DOCTYPE did not match expected format"
|
||||||
|
name = m.group('name')
|
||||||
|
publicId = m.group('publicId')
|
||||||
|
if publicId is not None:
|
||||||
|
systemId = m.group('systemId1') or ""
|
||||||
|
else:
|
||||||
|
systemId = m.group('systemId2')
|
||||||
|
|
||||||
|
if publicId is not None or systemId is not None:
|
||||||
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
|
||||||
|
(' '*indent, name, publicId or "", systemId or ""))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
|
||||||
|
|
||||||
elif isinstance(element, BeautifulSoup):
|
elif isinstance(element, BeautifulSoup):
|
||||||
if element.name == "[document_fragment]":
|
if element.name == "[document_fragment]":
|
||||||
rv.append("#document-fragment")
|
rv.append("#document-fragment")
|
||||||
|
38
planet/vendor/html5lib/treewalkers/_base.py
vendored
38
planet/vendor/html5lib/treewalkers/_base.py
vendored
@ -21,18 +21,24 @@ class TreeWalker(object):
|
|||||||
attrs = attrs.items()
|
attrs = attrs.items()
|
||||||
return [(unicode(name),unicode(value)) for name,value in attrs]
|
return [(unicode(name),unicode(value)) for name,value in attrs]
|
||||||
|
|
||||||
def emptyTag(self, name, attrs, hasChildren=False):
|
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||||
yield {"type": "EmptyTag", "name": unicode(name), \
|
yield {"type": "EmptyTag", "name": unicode(name),
|
||||||
"data": self.normalizeAttrs(attrs)}
|
"namespace":unicode(namespace),
|
||||||
|
"data": self.normalizeAttrs(attrs)}
|
||||||
if hasChildren:
|
if hasChildren:
|
||||||
yield self.error(_("Void element has children"))
|
yield self.error(_("Void element has children"))
|
||||||
|
|
||||||
def startTag(self, name, attrs):
|
def startTag(self, namespace, name, attrs):
|
||||||
return {"type": "StartTag", "name": unicode(name), \
|
return {"type": "StartTag",
|
||||||
"data": self.normalizeAttrs(attrs)}
|
"name": unicode(name),
|
||||||
|
"namespace":unicode(namespace),
|
||||||
|
"data": self.normalizeAttrs(attrs)}
|
||||||
|
|
||||||
def endTag(self, name):
|
def endTag(self, namespace, name):
|
||||||
return {"type": "EndTag", "name": unicode(name), "data": []}
|
return {"type": "EndTag",
|
||||||
|
"name": unicode(name),
|
||||||
|
"namespace":unicode(namespace),
|
||||||
|
"data": []}
|
||||||
|
|
||||||
def text(self, data):
|
def text(self, data):
|
||||||
data = unicode(data)
|
data = unicode(data)
|
||||||
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
|
|||||||
def walkChildren(self, node):
|
def walkChildren(self, node):
|
||||||
raise NodeImplementedError
|
raise NodeImplementedError
|
||||||
|
|
||||||
def element(self, node, name, attrs, hasChildren):
|
def element(self, node, namespace, name, attrs, hasChildren):
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
for token in self.emptyTag(name, attrs, hasChildren):
|
for token in self.emptyTag(namespace, name, attrs, hasChildren):
|
||||||
yield token
|
yield token
|
||||||
else:
|
else:
|
||||||
yield self.startTag(name, attrs)
|
yield self.startTag(name, attrs)
|
||||||
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||||||
details = self.getNodeDetails(currentNode)
|
details = self.getNodeDetails(currentNode)
|
||||||
type, details = details[0], details[1:]
|
type, details = details[0], details[1:]
|
||||||
hasChildren = False
|
hasChildren = False
|
||||||
|
endTag = None
|
||||||
|
|
||||||
if type == DOCTYPE:
|
if type == DOCTYPE:
|
||||||
yield self.doctype(*details)
|
yield self.doctype(*details)
|
||||||
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||||||
yield token
|
yield token
|
||||||
|
|
||||||
elif type == ELEMENT:
|
elif type == ELEMENT:
|
||||||
name, attributes, hasChildren = details
|
namespace, name, attributes, hasChildren = details
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
for token in self.emptyTag(name, attributes, hasChildren):
|
for token in self.emptyTag(namespace, name, attributes, hasChildren):
|
||||||
yield token
|
yield token
|
||||||
hasChildren = False
|
hasChildren = False
|
||||||
else:
|
else:
|
||||||
yield self.startTag(name, attributes)
|
endTag = name
|
||||||
|
yield self.startTag(namespace, name, attributes)
|
||||||
|
|
||||||
elif type == COMMENT:
|
elif type == COMMENT:
|
||||||
yield self.comment(details[0])
|
yield self.comment(details[0])
|
||||||
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||||||
details = self.getNodeDetails(currentNode)
|
details = self.getNodeDetails(currentNode)
|
||||||
type, details = details[0], details[1:]
|
type, details = details[0], details[1:]
|
||||||
if type == ELEMENT:
|
if type == ELEMENT:
|
||||||
name, attributes, hasChildren = details
|
namespace, name, attributes, hasChildren = details
|
||||||
if name not in voidElements:
|
if name not in voidElements:
|
||||||
yield self.endTag(name)
|
yield self.endTag(namespace, name)
|
||||||
nextSibling = self.getNextSibling(currentNode)
|
nextSibling = self.getNextSibling(currentNode)
|
||||||
if nextSibling is not None:
|
if nextSibling is not None:
|
||||||
currentNode = nextSibling
|
currentNode = nextSibling
|
||||||
|
3
planet/vendor/html5lib/treewalkers/dom.py
vendored
3
planet/vendor/html5lib/treewalkers/dom.py
vendored
@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
return _base.TEXT, node.nodeValue
|
return _base.TEXT, node.nodeValue
|
||||||
|
|
||||||
elif node.nodeType == Node.ELEMENT_NODE:
|
elif node.nodeType == Node.ELEMENT_NODE:
|
||||||
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
|
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||||
|
node.attributes.items(), node.hasChildNodes)
|
||||||
|
|
||||||
elif node.nodeType == Node.COMMENT_NODE:
|
elif node.nodeType == Node.COMMENT_NODE:
|
||||||
return _base.COMMENT, node.nodeValue
|
return _base.COMMENT, node.nodeValue
|
||||||
|
124
planet/vendor/html5lib/treewalkers/etree.py
vendored
124
planet/vendor/html5lib/treewalkers/etree.py
vendored
@ -3,10 +3,13 @@ _ = gettext.gettext
|
|||||||
|
|
||||||
import new
|
import new
|
||||||
import copy
|
import copy
|
||||||
|
import re
|
||||||
|
|
||||||
import _base
|
import _base
|
||||||
from html5lib.constants import voidElements
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
|
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||||
|
|
||||||
moduleCache = {}
|
moduleCache = {}
|
||||||
|
|
||||||
def getETreeModule(ElementTreeImplementation):
|
def getETreeModule(ElementTreeImplementation):
|
||||||
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||||||
to avoid using recursion, returns "nodes" as tuples with the following
|
to avoid using recursion, returns "nodes" as tuples with the following
|
||||||
content:
|
content:
|
||||||
|
|
||||||
1. An Element node serving as *context* (it cannot be called the parent
|
1. The current element
|
||||||
node due to the particular ``tail`` text nodes.
|
|
||||||
|
2. The index of the element relative to its parent
|
||||||
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
|
|
||||||
|
3. A stack of ancestor elements
|
||||||
3. A list used as a stack of all ancestor *context nodes*. It is a
|
|
||||||
pair tuple whose first item is an Element and second item is a child
|
4. A flag "text", "tail" or None to indicate if the current node is a
|
||||||
index.
|
text node; either the text or tail of the current element (1)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
if isinstance(node, tuple): # It might be the root Element
|
if isinstance(node, tuple): # It might be the root Element
|
||||||
elt, key, parents = node
|
elt, key, parents, flag = node
|
||||||
if key in ("text", "tail"):
|
if flag in ("text", "tail"):
|
||||||
return _base.TEXT, getattr(elt, key)
|
return _base.TEXT, getattr(elt, flag)
|
||||||
else:
|
else:
|
||||||
node = elt[int(key)]
|
node = elt
|
||||||
|
|
||||||
if not(hasattr(node, "tag")):
|
if not(hasattr(node, "tag")):
|
||||||
node = node.getroot()
|
node = node.getroot()
|
||||||
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||||||
return (_base.DOCUMENT,)
|
return (_base.DOCUMENT,)
|
||||||
|
|
||||||
elif node.tag == "<!DOCTYPE>":
|
elif node.tag == "<!DOCTYPE>":
|
||||||
return _base.DOCTYPE, node.text
|
return (_base.DOCTYPE, node.text,
|
||||||
|
node.get("publicId"), node.get("systemId"))
|
||||||
|
|
||||||
elif type(node.tag) == type(ElementTree.Comment):
|
elif type(node.tag) == type(ElementTree.Comment):
|
||||||
return _base.COMMENT, node.text
|
return _base.COMMENT, node.text
|
||||||
|
|
||||||
else:
|
else:
|
||||||
#This is assumed to be an ordinary element
|
#This is assumed to be an ordinary element
|
||||||
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
|
match = tag_regexp.match(node.tag)
|
||||||
|
if match:
|
||||||
|
namespace, tag = match.groups()
|
||||||
|
else:
|
||||||
|
namespace = None
|
||||||
|
tag = node.tag
|
||||||
|
return (_base.ELEMENT, namespace, tag,
|
||||||
|
node.attrib.items(), len(node) or node.text)
|
||||||
|
|
||||||
def getFirstChild(self, node):
|
def getFirstChild(self, node):
|
||||||
if isinstance(node, tuple): # It might be the root Element
|
if isinstance(node, tuple):
|
||||||
elt, key, parents = node
|
element, key, parents, flag = node
|
||||||
assert key not in ("text", "tail"), "Text nodes have no children"
|
|
||||||
parents.append((elt, int(key)))
|
|
||||||
node = elt[int(key)]
|
|
||||||
else:
|
else:
|
||||||
parents = []
|
element, key, parents, flag = node, None, [], None
|
||||||
|
|
||||||
assert len(node) or node.text, "Node has no children"
|
if flag in ("text", "tail"):
|
||||||
if node.text:
|
return None
|
||||||
return (node, "text", parents)
|
|
||||||
else:
|
else:
|
||||||
return (node, 0, parents)
|
if element.text:
|
||||||
|
return element, key, parents, "text"
|
||||||
|
elif len(element):
|
||||||
|
parents.append(element)
|
||||||
|
return element[0], 0, parents, None
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def getNextSibling(self, node):
|
def getNextSibling(self, node):
|
||||||
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
if isinstance(node, tuple):
|
||||||
|
element, key, parents, flag = node
|
||||||
elt, key, parents = node
|
|
||||||
if key == "text":
|
|
||||||
key = -1
|
|
||||||
elif key == "tail":
|
|
||||||
elt, key = parents.pop()
|
|
||||||
else:
|
|
||||||
# Look for "tail" of the "revisited" node
|
|
||||||
child = elt[key]
|
|
||||||
if child.tail:
|
|
||||||
parents.append((elt, key))
|
|
||||||
return (child, "tail", parents)
|
|
||||||
|
|
||||||
# case where key were "text" or "tail" or elt[key] had a tail
|
|
||||||
key += 1
|
|
||||||
if len(elt) > key:
|
|
||||||
return (elt, key, parents)
|
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if flag == "text":
|
||||||
|
if len(element):
|
||||||
|
parents.append(element)
|
||||||
|
return element[0], 0, parents, None
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
if element.tail and flag != "tail":
|
||||||
|
return element, key, parents, "tail"
|
||||||
|
elif key < len(parents[-1]) - 1:
|
||||||
|
return parents[-1][key+1], key+1, parents, None
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def getParentNode(self, node):
|
def getParentNode(self, node):
|
||||||
assert isinstance(node, tuple)
|
if isinstance(node, tuple):
|
||||||
elt, key, parents = node
|
element, key, parents, flag = node
|
||||||
if parents:
|
|
||||||
elt, key = parents.pop()
|
|
||||||
return elt, key, parents
|
|
||||||
else:
|
else:
|
||||||
# HACK: We could return ``elt`` but None will stop the algorithm the same way
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if flag == "text":
|
||||||
|
if not parents:
|
||||||
|
return element
|
||||||
|
else:
|
||||||
|
return element, key, parents, None
|
||||||
|
else:
|
||||||
|
parent = parents.pop()
|
||||||
|
if not parents:
|
||||||
|
return parent
|
||||||
|
else:
|
||||||
|
return parent, list(parents[-1]).index(parent), parents, None
|
||||||
|
|
||||||
return locals()
|
return locals()
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
|
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
|
||||||
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||||
from genshi.output import NamespaceFlattener
|
from genshi.output import NamespaceFlattener
|
||||||
|
|
||||||
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
depth = 0
|
depth = 0
|
||||||
ignore_until = None
|
ignore_until = None
|
||||||
previous = None
|
previous = None
|
||||||
for event in NamespaceFlattener(prefixes={
|
for event in self.tree:
|
||||||
'http://www.w3.org/1999/xhtml': ''
|
|
||||||
})(self.tree):
|
|
||||||
if previous is not None:
|
if previous is not None:
|
||||||
if previous[0] == START:
|
if previous[0] == START:
|
||||||
depth += 1
|
depth += 1
|
||||||
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
kind, data, pos = event
|
kind, data, pos = event
|
||||||
if kind == START:
|
if kind == START:
|
||||||
tag, attrib = data
|
tag, attrib = data
|
||||||
|
name = tag.localname
|
||||||
|
namespace = tag.namespace
|
||||||
if tag in voidElements:
|
if tag in voidElements:
|
||||||
for token in self.emptyTag(tag, list(attrib), \
|
for token in self.emptyTag(namespace, name, list(attrib),
|
||||||
not next or next[0] != END or next[1] != tag):
|
not next or next[0] != END
|
||||||
|
or next[1] != tag):
|
||||||
yield token
|
yield token
|
||||||
else:
|
else:
|
||||||
yield self.startTag(tag, list(attrib))
|
yield self.startTag(namespace, name, list(attrib))
|
||||||
|
|
||||||
elif kind == END:
|
elif kind == END:
|
||||||
if data not in voidElements:
|
name = data.localname
|
||||||
yield self.endTag(data)
|
namespace = data.namespace
|
||||||
|
if (namespace, name) not in voidElements:
|
||||||
|
yield self.endTag(namespace, name)
|
||||||
|
|
||||||
elif kind == COMMENT:
|
elif kind == COMMENT:
|
||||||
yield self.comment(data)
|
yield self.comment(data)
|
||||||
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
elif kind == DOCTYPE:
|
elif kind == DOCTYPE:
|
||||||
yield self.doctype(*data)
|
yield self.doctype(*data)
|
||||||
|
|
||||||
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
|
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
|
||||||
START_CDATA, END_CDATA, PI):
|
START_CDATA, END_CDATA, PI):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
175
planet/vendor/html5lib/treewalkers/lxmletree.py
vendored
Normal file
175
planet/vendor/html5lib/treewalkers/lxmletree.py
vendored
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
from lxml import etree
|
||||||
|
from html5lib.treebuilders.etree import tag_regexp
|
||||||
|
|
||||||
|
from gettext import gettext
|
||||||
|
_ = gettext
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements
|
||||||
|
from html5lib import ihatexml
|
||||||
|
|
||||||
|
class Root(object):
    """Synthetic document node placed above the top-level nodes of a tree.

    ``children`` holds, in order, an optional ``Doctype`` (only when the
    tree's docinfo reports an internal DTD) followed by every top-level
    sibling of the root element, from the first one that precedes it to
    the last one that follows it.
    """

    def __init__(self, et):
        # NOTE(review): ``et`` is presumably an lxml ElementTree; this code
        # only relies on ``docinfo``, ``getroot()`` and the nodes'
        # ``getprevious()`` / ``getnext()`` methods.
        self.elementtree = et
        self.children = []

        docinfo = et.docinfo
        if docinfo.internalDTD:
            doctype = Doctype(self, docinfo.root_name,
                              docinfo.public_id,
                              docinfo.system_url)
            self.children.append(doctype)

        # Rewind from the root element to the first top-level sibling.
        current = et.getroot()
        earlier = current.getprevious()
        while earlier is not None:
            current = earlier
            earlier = current.getprevious()

        # Then collect every top-level sibling going forwards.
        while current is not None:
            self.children.append(current)
            current = current.getnext()

        # The document node itself carries no text.
        self.text = None
        self.tail = None

    def __getitem__(self, key):
        """Return the child stored at position ``key``."""
        return self.children[key]

    def getnext(self):
        """The document node has no following sibling."""
        return None

    def __len__(self):
        # Always reports 1, regardless of how many children were collected.
        return 1
|
||||||
|
|
||||||
|
class Doctype(object):
    """Stand-in node describing a document type declaration.

    Stores the doctype name, public identifier and system identifier
    together with a reference to the root node that owns it.
    """

    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name, self.public_id, self.system_id = name, public_id, system_id

        # A doctype carries neither leading text nor tail text.
        self.text = self.tail = None

    def getnext(self):
        """Return the second entry of the owning root's ``children``."""
        siblings = self.root_node.children
        return siblings[1]
|
||||||
|
|
||||||
|
class FragmentRoot(Root):
    """Root node for a fragment given as a list of top-level items.

    Every item is wrapped in a ``FragmentWrapper`` that points back at
    this root.  ``Root.__init__`` is deliberately not called.
    """

    def __init__(self, children):
        wrapped = []
        for child in children:
            wrapped.append(FragmentWrapper(self, child))
        self.children = wrapped

        # The fragment root itself carries no text.
        self.text = None
        self.tail = None

    def getnext(self):
        """The fragment root has no following sibling."""
        return None
|
||||||
|
|
||||||
|
class FragmentWrapper(object):
    """Proxy around one top-level item of a fragment list.

    The wrapper records the owning fragment root so that ``getnext`` can
    find the following sibling, and forwards every attribute it does not
    define itself to the wrapped object.
    """

    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj

        # Mirror the wrapped object's text/tail, defaulting to None when
        # the object has no such attribute.
        if not hasattr(self.obj, 'text'):
            self.text = None
        else:
            self.text = self.obj.text

        if not hasattr(self.obj, 'tail'):
            self.tail = None
        else:
            self.tail = self.obj.tail

        # True when the wrapped item is a string rather than a node.
        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        # Only reached for attributes not found on the wrapper itself.
        return getattr(self.obj, name)

    def getnext(self):
        """Return the wrapper following this one under the root, or None."""
        siblings = self.root_node.children
        position = siblings.index(self)
        if position >= len(siblings) - 1:
            return None
        return siblings[position + 1]

    def __getitem__(self, key):
        return self.obj[key]

    def __nonzero__(self):
        # Python 2 truth hook: truthiness follows the wrapped object.
        return bool(self.obj)

    def getparent(self):
        """Top-level fragment items report no parent."""
        return None

    def __str__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)
|
||||||
|
|
||||||
|
|
||||||
|
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive walker over lxml trees and fragment lists.

    Element nodes are passed around as themselves; text is addressed as
    a ``(node, "text")`` or ``(node, "tail")`` tuple naming the attribute
    of ``node`` that holds the characters.
    """

    def __init__(self, tree):
        # Anything exposing getroot() is wrapped in a Root; a plain list
        # is treated as a fragment; everything else is used unchanged.
        if hasattr(tree, "getroot"):
            wrapped = Root(tree)
        elif isinstance(tree, list):
            wrapped = FragmentRoot(tree)
        else:
            wrapped = tree
        _base.NonRecursiveTreeWalker.__init__(self, wrapped)
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        """Return a tuple starting with a ``_base`` node-type constant."""
        if isinstance(node, tuple): # Text node
            element, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(element, key)

        if isinstance(node, Root):
            return (_base.DOCUMENT,)

        if isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

        if isinstance(node, FragmentWrapper) and node.isstring:
            return _base.TEXT, node

        if node.tag == etree.Comment:
            return _base.COMMENT, node.text

        #This is assumed to be an ordinary element
        matched = tag_regexp.match(node.tag)
        if matched:
            namespace, tag = matched.groups()
        else:
            namespace, tag = None, node.tag

        # Names are passed through the infoset filter's fromXmlName.
        element_name = self.filter.fromXmlName(tag)
        attributes = []
        for name, value in node.attrib.iteritems():
            attributes.append((self.filter.fromXmlName(name), value))
        return (_base.ELEMENT, namespace, element_name, attributes,
                len(node) > 0 or node.text)

    def getFirstChild(self, node):
        """Return the leading text tuple if ``node`` has text, else its first child."""
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) or node.text, "Node has no children"
        if not node.text:
            return node[0]
        return (node, "text")

    def getNextSibling(self, node):
        """Return what follows ``node`` in document order, or None."""
        if not isinstance(node, tuple):
            # An element is followed by its tail text when it has one.
            if node.tail:
                return (node, "tail")
            return node.getnext()

        element, key = node
        assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
        if key != "text":
            # Tail text is followed by the element's next sibling.
            return element.getnext()

        # Leading text is followed by the first child element, if any.
        # len() is used rather than truth-testing element[0] because a
        # childless element may itself evaluate to False.
        if len(element):
            return element[0]
        return None

    def getParentNode(self, node):
        """Return the parent of an element or text tuple."""
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # Leading text belongs to the element that holds it.
                return node
            # Tail text shares its parent with the element it follows.

        return node.getparent()
|
12
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
12
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
|
|||||||
type, node = event
|
type, node = event
|
||||||
if type == START_ELEMENT:
|
if type == START_ELEMENT:
|
||||||
name = node.nodeName
|
name = node.nodeName
|
||||||
|
namespace = node.namespaceURI
|
||||||
if name in voidElements:
|
if name in voidElements:
|
||||||
for token in self.emptyTag(name, \
|
for token in self.emptyTag(namespace,
|
||||||
node.attributes.items(), not next or next[1] is not node):
|
name,
|
||||||
|
node.attributes.items(),
|
||||||
|
not next or next[1] is not node):
|
||||||
yield token
|
yield token
|
||||||
else:
|
else:
|
||||||
yield self.startTag(name, node.attributes.items())
|
yield self.startTag(namespace, name, node.attributes.items())
|
||||||
|
|
||||||
elif type == END_ELEMENT:
|
elif type == END_ELEMENT:
|
||||||
name = node.nodeName
|
name = node.nodeName
|
||||||
|
namespace = node.namespaceURI
|
||||||
if name not in voidElements:
|
if name not in voidElements:
|
||||||
yield self.endTag(name)
|
yield self.endTag(namespace, name)
|
||||||
|
|
||||||
elif type == COMMENT:
|
elif type == COMMENT:
|
||||||
yield self.comment(node.nodeValue)
|
yield self.comment(node.nodeValue)
|
||||||
|
@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||||||
return _base.TEXT, node.value
|
return _base.TEXT, node.value
|
||||||
|
|
||||||
elif node.type == 5: # Element
|
elif node.type == 5: # Element
|
||||||
return _base.ELEMENT, node.name, \
|
return (_base.ELEMENT, node.namespace, node.name,
|
||||||
node.attributes.items(), node.hasContent()
|
node.attributes.items(), node.hasContent())
|
||||||
|
|
||||||
elif node.type == 6: # CommentNode
|
elif node.type == 6: # CommentNode
|
||||||
return _base.COMMENT, node.data
|
return _base.COMMENT, node.data
|
||||||
|
29
planet/vendor/html5lib/treewalkers/soup.py
vendored
29
planet/vendor/html5lib/treewalkers/soup.py
vendored
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
|||||||
import _base
|
import _base
|
||||||
|
|
||||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
|
doctype_regexp = re.compile(
|
||||||
|
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
||||||
def getNodeDetails(self, node):
|
def getNodeDetails(self, node):
|
||||||
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
||||||
return (_base.DOCUMENT,)
|
return (_base.DOCUMENT,)
|
||||||
|
|
||||||
elif isinstance(node, Declaration): # DocumentType
|
elif isinstance(node, Declaration): # DocumentType
|
||||||
#Slice needed to remove markup added during unicode conversion
|
string = unicode(node.string)
|
||||||
return _base.DOCTYPE, unicode(node.string)[2:-1]
|
#Slice needed to remove markup added during unicode conversion,
|
||||||
|
#but only in some versions of BeautifulSoup/Python
|
||||||
|
if string.startswith('<!') and string.endswith('>'):
|
||||||
|
string = string[2:-1]
|
||||||
|
m = self.doctype_regexp.match(string)
|
||||||
|
#This regexp approach seems wrong and fragile
|
||||||
|
#but beautiful soup stores the doctype as a single thing and we want the separate bits
|
||||||
|
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
|
||||||
|
#been modified at all
|
||||||
|
#We could just feed to it a html5lib tokenizer, I guess...
|
||||||
|
assert m is not None, "DOCTYPE did not match expected format"
|
||||||
|
name = m.group('name')
|
||||||
|
publicId = m.group('publicId')
|
||||||
|
if publicId is not None:
|
||||||
|
systemId = m.group('systemId1')
|
||||||
|
else:
|
||||||
|
systemId = m.group('systemId2')
|
||||||
|
return _base.DOCTYPE, name, publicId or "", systemId or ""
|
||||||
|
|
||||||
elif isinstance(node, Comment):
|
elif isinstance(node, Comment):
|
||||||
return _base.COMMENT, unicode(node.string)[4:-3]
|
string = unicode(node.string)
|
||||||
|
if string.startswith('<!--') and string.endswith('-->'):
|
||||||
|
string = string[4:-3]
|
||||||
|
return _base.COMMENT, string
|
||||||
|
|
||||||
elif isinstance(node, unicode): # TextNode
|
elif isinstance(node, unicode): # TextNode
|
||||||
return _base.TEXT, node
|
return _base.TEXT, node
|
||||||
|
120
planet/vendor/html5lib/utils.py
vendored
120
planet/vendor/html5lib/utils.py
vendored
@ -34,3 +34,123 @@ class MethodDispatcher(dict):
|
|||||||
|
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
return dict.get(self, key, self.default)
|
return dict.get(self, key, self.default)
|
||||||
|
|
||||||
|
#Pure python implementation of deque taken from the ASPN Python Cookbook
|
||||||
|
#Original code by Raymond Hettinger
|
||||||
|
|
||||||
|
class deque(object):
    """Pure-Python double-ended queue backed by a dict.

    Items live in ``self.data`` under consecutive integer keys in the
    half-open range ``[self.left, self.right)``.  When ``maxsize`` is not
    -1 the deque is bounded: appending at one end of a full deque drops
    an item from the other end.
    """

    def __init__(self, iterable=(), maxsize=-1):
        # __setstate__ and __deepcopy__ call __init__ again on an existing
        # instance; storage is only created the first time.
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)

    def append(self, x):
        """Add ``x`` on the right, dropping the leftmost item if over ``maxsize``."""
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()

    def appendleft(self, x):
        """Add ``x`` on the left, dropping the rightmost item if over ``maxsize``."""
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()

    def pop(self):
        """Remove and return the rightmost item; IndexError when empty."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem

    def popleft(self):
        """Remove and return the leftmost item; IndexError when empty."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem

    def clear(self):
        """Remove every item and reset both end markers."""
        self.data.clear()
        self.left = self.right = 0

    def extend(self, iterable):
        """Append every item of ``iterable`` on the right, in order."""
        for elem in iterable:
            self.append(elem)

    def extendleft(self, iterable):
        """Append every item of ``iterable`` on the left (reversing their order)."""
        for elem in iterable:
            self.appendleft(elem)

    def rotate(self, n=1):
        """Move ``n`` items (modulo the length) from the right end to the left."""
        if self:
            n %= len(self)
            while n > 0:
                self.appendleft(self.pop())
                n -= 1

    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            raise IndexError('deque index out of range')

    def __setitem__(self, i, value):
        # Assigning into a dict never raises KeyError, so the bounds must
        # be checked explicitly; otherwise an out-of-range index would
        # silently plant a stray key outside [left, right).
        size = len(self)
        if not (-size <= i < size):
            raise IndexError('deque index out of range')
        if i < 0:
            i += size
        self.data[i + self.left] = value

    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError('deque index out of range')
        data = self.data
        if i < 0:
            i += size
        # Shift everything after position i one slot to the left, then
        # discard the now-duplicated last slot.
        j = self.left + i
        last = self.right - 1
        while j < last:
            data[j] = data[j + 1]
            j += 1
        self.pop()

    def __len__(self):
        return self.right - self.left

    def __cmp__(self, other):
        # Python 2 comparison hook; it is not consulted by Python 3.
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))

    def __repr__(self, _track=[]):
        # ``_track`` is deliberately a shared list: it records the deques
        # currently being rendered so a self-containing deque prints
        # '...' instead of recursing forever.
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        try:
            return 'deque(%r)' % (list(self),)
        finally:
            # Always unregister, even if an element's repr raises.
            _track.remove(id(self))

    def __getstate__(self):
        # maxsize travels with the items so a bounded deque stays bounded.
        return (tuple(self), self.maxsize)

    def __setstate__(self, s):
        self.__init__(s[0])
        # States written before maxsize was recorded hold one element only.
        if len(s) > 1:
            self.maxsize = s[1]

    def __hash__(self):
        # Mutable container: not hashable.
        raise TypeError

    def __copy__(self):
        result = self.__class__(self)
        result.maxsize = self.maxsize
        return result

    def __deepcopy__(self, memo=None):
        from copy import deepcopy
        # A fresh memo per top-level call; a shared default dict would
        # make later copies alias objects produced by earlier ones.
        if memo is None:
            memo = {}
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        result.maxsize = self.maxsize
        return result
|
@ -1,6 +1,6 @@
|
|||||||
<!--
|
<!--
|
||||||
Description: illegal control character
|
Description: illegal control character
|
||||||
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
Expect: content[0].value == u'Page 1 Page 2'
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<feed xmns="http://www.w3.org/2005/Atom">
|
<feed xmns="http://www.w3.org/2005/Atom">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user