Update to the latest html5lib; replace feedparser's sanitizer with html5lib's
Sam Ruby 2009-09-09 10:54:21 -04:00
parent 63fa05e556
commit 6f0f23dd36
32 changed files with 4868 additions and 2386 deletions

View File

@ -16,7 +16,7 @@ Todo:
import re, time, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
-from html5lib import liberalxmlparser
+from html5lib import html5parser
from html5lib.treebuilders import dom
import planet, config
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
        bozo=1
    if detail.type.find('xhtml')<0 or bozo:
-        parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
+        parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
        html = parser.parse(xdiv % detail.value, encoding="utf-8")
        for body in html.documentElement.childNodes:
            if body.nodeType != Node.ELEMENT_NODE: continue
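
The change above swaps html5lib's old liberalxmlparser for its regular HTML parser when (x)html content is suspect. A minimal sketch of that call path, outside the diff, might look like the following; the xdiv wrapper string is an assumption standing in for the one defined elsewhere in this module.

# Sketch only; xdiv is assumed to match the wrapper used by this module.
from xml.dom import Node
from html5lib import html5parser
from html5lib.treebuilders import dom

xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % '<p>Hello <b>world</b></p>', encoding="utf-8")

# html5parser always builds a full html/head/body document, so the loop in
# the hunk above is really picking the <body> element out of documentElement.
for body in html.documentElement.childNodes:
    if body.nodeType != Node.ELEMENT_NODE: continue
    print body.toxml()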

View File

@ -128,5 +128,11 @@ def scrub(feed_uri, data):
                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)
-                node['value'] = feedparser._sanitizeHTML(
-                    node.value, 'utf-8', node.type)
+                # Run this through HTML5's serializer
+                from html5lib import html5parser, sanitizer, treewalkers, serializer
+                p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
+                doc = p.parseFragment(node.value, encoding='utf-8')
+                walker = treewalkers.getTreeWalker('simpletree')
+                xhtml = serializer.XHTMLSerializer()
+                tree = xhtml.serialize(walker(doc), encoding='utf-8')
+                node['value'] = ''.join([n for n in tree])
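
Pulled out of scrub() as a standalone sketch, the new sanitization path reads roughly as below: the sanitizing tokenizer strips or escapes the markup feedparser used to filter, and the XHTML serializer turns the resulting fragment back into a string.

from html5lib import html5parser, sanitizer, treewalkers, serializer

def sanitize_html(fragment):
    # Tokenize through the sanitizer, build a fragment, then re-serialize.
    p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    doc = p.parseFragment(fragment, encoding='utf-8')
    walker = treewalkers.getTreeWalker('simpletree')
    xhtml = serializer.XHTMLSerializer()
    return ''.join(xhtml.serialize(walker(doc), encoding='utf-8'))

# Disallowed attributes such as onclick are dropped; non-whitelisted elements
# like <script> are escaped rather than passed through.
print sanitize_html('<p onclick="evil()">hi<script>alert(1)</script></p>')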

View File

@ -11,5 +11,6 @@ f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
"""
-from html5parser import HTMLParser
-from liberalxmlparser import XMLParser, XHTMLParser
+from html5parser import HTMLParser, parse
+from treebuilders import getTreeBuilder
+from serializer import serialize
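
After this change the package root exposes parse(), getTreeBuilder() and serialize() in place of the removed liberalxmlparser classes. A quick sketch of the resulting public API:

import html5lib

# One-shot helper
doc = html5lib.parse("<p>Hello</p>")

# Equivalent explicit form, selecting a tree builder by name
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
dom_doc = parser.parse("<p>Hello</p>")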

View File

@ -1,4 +1,5 @@
-import string
+import string, gettext
+_ = gettext.gettext
try:
    frozenset
@ -9,6 +10,260 @@ except NameError:
EOF = None
E = {
"null-character":
_(u"Null character in input stream, replaced with U+FFFD."),
"invalid-character":
_(u"Invalid codepoint in stream."),
"incorrectly-placed-solidus":
_(u"Solidus (/) incorrectly placed in tag."),
"incorrect-cr-newline-entity":
_(u"Incorrect CR newline entity, replaced with LF."),
"illegal-windows-1252-entity":
_(u"Entity used with illegal number (windows-1252 reference)."),
"cant-convert-numeric-entity":
_(u"Numeric entity couldn't be converted to character "
u"(codepoint U+%(charAsInt)08x)."),
"illegal-codepoint-for-numeric-entity":
_(u"Numeric entity represents an illegal codepoint: "
u"U+%(charAsInt)08x."),
"numeric-entity-without-semicolon":
_(u"Numeric entity didn't end with ';'."),
"expected-numeric-entity-but-got-eof":
_(u"Numeric entity expected. Got end of file instead."),
"expected-numeric-entity":
_(u"Numeric entity expected but none found."),
"named-entity-without-semicolon":
_(u"Named entity didn't end with ';'."),
"expected-named-entity":
_(u"Named entity expected. Got none."),
"attributes-in-end-tag":
_(u"End tag contains unexpected attributes."),
"expected-tag-name-but-got-right-bracket":
_(u"Expected tag name. Got '>' instead."),
"expected-tag-name-but-got-question-mark":
_(u"Expected tag name. Got '?' instead. (HTML doesn't "
u"support processing instructions.)"),
"expected-tag-name":
_(u"Expected tag name. Got something else instead"),
"expected-closing-tag-but-got-right-bracket":
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
"expected-closing-tag-but-got-eof":
_(u"Expected closing tag. Unexpected end of file."),
"expected-closing-tag-but-got-char":
_(u"Expected closing tag. Unexpected character '%(data)s' found."),
"eof-in-tag-name":
_(u"Unexpected end of file in the tag name."),
"expected-attribute-name-but-got-eof":
_(u"Unexpected end of file. Expected attribute name instead."),
"eof-in-attribute-name":
_(u"Unexpected end of file in attribute name."),
"invalid-character-in-attribute-name":
_(u"Invalid chracter in attribute name"),
"duplicate-attribute":
_(u"Dropped duplicate attribute on tag."),
"expected-end-of-tag-name-but-got-eof":
_(u"Unexpected end of file. Expected = or end of tag."),
"expected-attribute-value-but-got-eof":
_(u"Unexpected end of file. Expected attribute value."),
"expected-attribute-value-but-got-right-bracket":
_(u"Expected attribute value. Got '>' instead."),
"eof-in-attribute-value-double-quote":
_(u"Unexpected end of file in attribute value (\")."),
"eof-in-attribute-value-single-quote":
_(u"Unexpected end of file in attribute value (')."),
"eof-in-attribute-value-no-quotes":
_(u"Unexpected end of file in attribute value."),
"unexpected-EOF-after-solidus-in-tag":
_(u"Unexpected end of file in tag. Expected >"),
"unexpected-character-after-soldius-in-tag":
_(u"Unexpected character after / in tag. Expected >"),
"expected-dashes-or-doctype":
_(u"Expected '--' or 'DOCTYPE'. Not found."),
"incorrect-comment":
_(u"Incorrect comment."),
"eof-in-comment":
_(u"Unexpected end of file in comment."),
"eof-in-comment-end-dash":
_(u"Unexpected end of file in comment (-)"),
"unexpected-dash-after-double-dash-in-comment":
_(u"Unexpected '-' after '--' found in comment."),
"eof-in-comment-double-dash":
_(u"Unexpected end of file in comment (--)."),
"unexpected-char-in-comment":
_(u"Unexpected character in comment found."),
"need-space-after-doctype":
_(u"No space after literal string 'DOCTYPE'."),
"expected-doctype-name-but-got-right-bracket":
_(u"Unexpected > character. Expected DOCTYPE name."),
"expected-doctype-name-but-got-eof":
_(u"Unexpected end of file. Expected DOCTYPE name."),
"eof-in-doctype-name":
_(u"Unexpected end of file in DOCTYPE name."),
"eof-in-doctype":
_(u"Unexpected end of file in DOCTYPE."),
"expected-space-or-right-bracket-in-doctype":
_(u"Expected space or '>'. Got '%(data)s'"),
"unexpected-end-of-doctype":
_(u"Unexpected end of DOCTYPE."),
"unexpected-char-in-doctype":
_(u"Unexpected character in DOCTYPE."),
"eof-in-innerhtml":
_(u"XXX innerHTML EOF"),
"unexpected-doctype":
_(u"Unexpected DOCTYPE. Ignored."),
"non-html-root":
_(u"html needs to be the first start tag."),
"expected-doctype-but-got-eof":
_(u"Unexpected End of file. Expected DOCTYPE."),
"unknown-doctype":
_(u"Erroneous DOCTYPE."),
"expected-doctype-but-got-chars":
_(u"Unexpected non-space characters. Expected DOCTYPE."),
"expected-doctype-but-got-start-tag":
_(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
"expected-doctype-but-got-end-tag":
_(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
"end-tag-after-implied-root":
_(u"Unexpected end tag (%(name)s) after the (implied) root element."),
"expected-named-closing-tag-but-got-eof":
_(u"Unexpected end of file. Expected end tag (%(name)s)."),
"two-heads-are-not-better-than-one":
_(u"Unexpected start tag head in existing head. Ignored."),
"unexpected-end-tag":
_(u"Unexpected end tag (%(name)s). Ignored."),
"unexpected-start-tag-out-of-my-head":
_(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
"unexpected-start-tag":
_(u"Unexpected start tag (%(name)s)."),
"missing-end-tag":
_(u"Missing end tag (%(name)s)."),
"missing-end-tags":
_(u"Missing end tags (%(name)s)."),
"unexpected-start-tag-implies-end-tag":
_(u"Unexpected start tag (%(startName)s) "
u"implies end tag (%(endName)s)."),
"unexpected-start-tag-treated-as":
_(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"deprecated-tag":
_(u"Unexpected start tag %(name)s. Don't use it!"),
"unexpected-start-tag-ignored":
_(u"Unexpected start tag %(name)s. Ignored."),
"expected-one-end-tag-but-got-another":
_(u"Unexpected end tag (%(gotName)s). "
u"Missing end tag (%(expectedName)s)."),
"end-tag-too-early":
_(u"End tag (%(name)s) seen too early. Expected other end tag."),
"end-tag-too-early-named":
_(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"end-tag-too-early-ignored":
_(u"End tag (%(name)s) seen too early. Ignored."),
"adoption-agency-1.1":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 1 of the adoption agency algorithm."),
"adoption-agency-1.2":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 2 of the adoption agency algorithm."),
"adoption-agency-1.3":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 3 of the adoption agency algorithm."),
"unexpected-end-tag-treated-as":
_(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"no-end-tag":
_(u"This element (%(name)s) has no end tag."),
"unexpected-implied-end-tag-in-table":
_(u"Unexpected implied end tag (%(name)s) in the table phase."),
"unexpected-implied-end-tag-in-table-body":
_(u"Unexpected implied end tag (%(name)s) in the table body phase."),
"unexpected-char-implies-table-voodoo":
_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-end-tag-implies-table-voodoo":
_(u"Unexpected end tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-cell-in-table-body":
_(u"Unexpected table cell start tag (%(name)s) "
u"in the table body phase."),
"unexpected-cell-end-tag":
_(u"Got table cell end tag (%(name)s) "
u"while required end tags are missing."),
"unexpected-end-tag-in-table-body":
_(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"unexpected-implied-end-tag-in-table-row":
_(u"Unexpected implied end tag (%(name)s) in the table row phase."),
"unexpected-end-tag-in-table-row":
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"unexpected-select-in-select":
_(u"Unexpected select start tag in the select phase "
u"treated as select end tag."),
"unexpected-input-in-select":
_(u"Unexpected input start tag in the select phase."),
"unexpected-start-tag-in-select":
_(u"Unexpected start tag token (%(name)s in the select phase. "
u"Ignored."),
"unexpected-end-tag-in-select":
_(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
"unexpected-table-element-start-tag-in-select-in-table":
_(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
"unexpected-table-element-end-tag-in-select-in-table":
_(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
"unexpected-char-after-body":
_(u"Unexpected non-space characters in the after body phase."),
"unexpected-start-tag-after-body":
_(u"Unexpected start tag token (%(name)s)"
u" in the after body phase."),
"unexpected-end-tag-after-body":
_(u"Unexpected end tag token (%(name)s)"
u" in the after body phase."),
"unexpected-char-in-frameset":
_(u"Unepxected characters in the frameset phase. Characters ignored."),
"unexpected-start-tag-in-frameset":
_(u"Unexpected start tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-frameset-in-frameset-innerhtml":
_(u"Unexpected end tag token (frameset) "
u"in the frameset phase (innerHTML)."),
"unexpected-end-tag-in-frameset":
_(u"Unexpected end tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-char-after-frameset":
_(u"Unexpected non-space characters in the "
u"after frameset phase. Ignored."),
"unexpected-start-tag-after-frameset":
_(u"Unexpected start tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-frameset":
_(u"Unexpected end tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
_(u"Unexpected end tag after body(innerHtml)"),
"expected-eof-but-got-char":
_(u"Unexpected non-space characters. Expected end of file."),
"expected-eof-but-got-start-tag":
_(u"Unexpected start tag (%(name)s)"
u". Expected end of file."),
"expected-eof-but-got-end-tag":
_(u"Unexpected end tag (%(name)s)"
u". Expected end of file."),
"eof-in-table":
_(u"Unexpected end of file. Expected table content."),
"eof-in-select":
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
contentModelFlags = {
    "PCDATA":0,
    "RCDATA":1,
@ -16,101 +271,126 @@ contentModelFlags = {
    "PLAINTEXT":3
}
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
"svg":"http://www.w3.org/2000/svg",
"xlink":"http://www.w3.org/1999/xlink",
"xml":"http://www.w3.org/XML/1998/namespace",
"xmlns":"http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
-    "button",
-    "caption",
-    "html",
-    "marquee",
-    "object",
-    "table",
-    "td",
-    "th"
+    (namespaces["html"], "applet"),
+    (namespaces["html"], "button"),
+    (namespaces["html"], "caption"),
+    (namespaces["html"], "html"),
+    (namespaces["html"], "marquee"),
+    (namespaces["html"], "object"),
+    (namespaces["html"], "table"),
+    (namespaces["html"], "td"),
+    (namespaces["html"], "th"),
+    (namespaces["svg"], "foreignObject")
))
formattingElements = frozenset((
-    "a",
-    "b",
-    "big",
-    "em",
-    "font",
-    "i",
-    "nobr",
-    "s",
-    "small",
-    "strike",
-    "strong",
-    "tt",
-    "u"
+    (namespaces["html"], "a"),
+    (namespaces["html"], "b"),
+    (namespaces["html"], "big"),
+    (namespaces["html"], "code"),
+    (namespaces["html"], "em"),
+    (namespaces["html"], "font"),
+    (namespaces["html"], "i"),
+    (namespaces["html"], "nobr"),
+    (namespaces["html"], "s"),
+    (namespaces["html"], "small"),
+    (namespaces["html"], "strike"),
+    (namespaces["html"], "strong"),
+    (namespaces["html"], "tt"),
+    (namespaces["html"], "u")
))
specialElements = frozenset(( specialElements = frozenset((
"address", (namespaces["html"], "address"),
"area", (namespaces["html"], "area"),
"base", (namespaces["html"], "article"),
"basefont", (namespaces["html"], "aside"),
"bgsound", (namespaces["html"], "base"),
"blockquote", (namespaces["html"], "basefont"),
"body", (namespaces["html"], "bgsound"),
"br", (namespaces["html"], "blockquote"),
"center", (namespaces["html"], "body"),
"col", (namespaces["html"], "br"),
"colgroup", (namespaces["html"], "center"),
"dd", (namespaces["html"], "col"),
"dir", (namespaces["html"], "colgroup"),
"div", (namespaces["html"], "command"),
"dl", (namespaces["html"], "datagrid"),
"dt", (namespaces["html"], "dd"),
"embed", (namespaces["html"], "details"),
"fieldset", (namespaces["html"], "dialog"),
"form", (namespaces["html"], "dir"),
"frame", (namespaces["html"], "div"),
"frameset", (namespaces["html"], "dl"),
"h1", (namespaces["html"], "dt"),
"h2", (namespaces["html"], "embed"),
"h3", (namespaces["html"], "event-source"),
"h4", (namespaces["html"], "fieldset"),
"h5", (namespaces["html"], "figure"),
"h6", (namespaces["html"], "footer"),
"head", (namespaces["html"], "form"),
"hr", (namespaces["html"], "frame"),
"iframe", (namespaces["html"], "frameset"),
"image", (namespaces["html"], "h1"),
"img", (namespaces["html"], "h2"),
"input", (namespaces["html"], "h3"),
"isindex", (namespaces["html"], "h4"),
"li", (namespaces["html"], "h5"),
"link", (namespaces["html"], "h6"),
"listing", (namespaces["html"], "head"),
"menu", (namespaces["html"], "header"),
"meta", (namespaces["html"], "hr"),
"noembed", (namespaces["html"], "iframe"),
"noframes", # Note that image is commented out in the spec as "this isn't an
"noscript", # element that can end up on the stack, so it doesn't matter,"
"ol", (namespaces["html"], "image"),
"optgroup", (namespaces["html"], "img"),
"option", (namespaces["html"], "input"),
"p", (namespaces["html"], "isindex"),
"param", (namespaces["html"], "li"),
"plaintext", (namespaces["html"], "link"),
"pre", (namespaces["html"], "listing"),
"script", (namespaces["html"], "menu"),
"select", (namespaces["html"], "meta"),
"spacer", (namespaces["html"], "nav"),
"style", (namespaces["html"], "noembed"),
"tbody", (namespaces["html"], "noframes"),
"textarea", (namespaces["html"], "noscript"),
"tfoot", (namespaces["html"], "ol"),
"thead", (namespaces["html"], "optgroup"),
"title", (namespaces["html"], "option"),
"tr", (namespaces["html"], "p"),
"ul", (namespaces["html"], "param"),
"wbr" (namespaces["html"], "plaintext"),
(namespaces["html"], "pre"),
(namespaces["html"], "script"),
(namespaces["html"], "section"),
(namespaces["html"], "select"),
(namespaces["html"], "spacer"),
(namespaces["html"], "style"),
(namespaces["html"], "tbody"),
(namespaces["html"], "textarea"),
(namespaces["html"], "tfoot"),
(namespaces["html"], "thead"),
(namespaces["html"], "title"),
(namespaces["html"], "tr"),
(namespaces["html"], "ul"),
(namespaces["html"], "wbr")
)) ))
spaceCharacters = frozenset((
    u"\t",
    u"\n",
-    u"\u000B",
    u"\u000C",
    u" ",
    u"\r"
@ -143,9 +423,10 @@ headingElements (
    "h6"
)
-# XXX What about event-source and command?
voidElements = frozenset((
    "base",
+    "command",
+    "event-source",
    "link",
    "meta",
    "hr",
@ -155,7 +436,8 @@ voidElements = frozenset((
    "param",
    "area",
    "col",
-    "input"
+    "input",
+    "source"
))
cdataElements = frozenset(('title', 'textarea'))
@ -440,7 +722,7 @@ entities = {
    "kappa;": u"\u03BA",
    "lArr;": u"\u21D0",
    "lambda;": u"\u03BB",
-    "lang;": u"\u3008",
+    "lang;": u"\u27E8",
    "laquo;": u"\u00AB",
    "laquo": u"\u00AB",
    "larr;": u"\u2190",
@ -520,7 +802,7 @@ entities = {
    "quot": u"\u0022",
    "rArr;": u"\u21D2",
    "radic;": u"\u221A",
-    "rang;": u"\u3009",
+    "rang;": u"\u27E9",
    "raquo;": u"\u00BB",
    "raquo": u"\u00BB",
    "rarr;": u"\u2192",
@ -596,221 +878,255 @@ entities = {
    "zwnj;": u"\u200C"
}
encodings = frozenset(( encodings = {
"ansi_x3.4-1968", '437': 'cp437',
"iso-ir-6", '850': 'cp850',
"ansi_x3.4-1986", '852': 'cp852',
"iso_646.irv:1991", '855': 'cp855',
"ascii", '857': 'cp857',
"iso646-us", '860': 'cp860',
"us-ascii", '861': 'cp861',
"us", '862': 'cp862',
"ibm367", '863': 'cp863',
"cp367", '865': 'cp865',
"csascii", '866': 'cp866',
"ks_c_5601-1987", '869': 'cp869',
"korean", 'ansix341968': 'ascii',
"iso-2022-kr", 'ansix341986': 'ascii',
"csiso2022kr", 'arabic': 'iso8859-6',
"euc-kr", 'ascii': 'ascii',
"iso-2022-jp", 'asmo708': 'iso8859-6',
"csiso2022jp", 'big5': 'big5',
"iso-2022-jp-2", 'big5hkscs': 'big5hkscs',
"iso-ir-58", 'chinese': 'gbk',
"chinese", 'cp037': 'cp037',
"csiso58gb231280", 'cp1026': 'cp1026',
"iso_8859-1:1987", 'cp154': 'ptcp154',
"iso-ir-100", 'cp367': 'ascii',
"iso_8859-1", 'cp424': 'cp424',
"iso-8859-1", 'cp437': 'cp437',
"latin1", 'cp500': 'cp500',
"l1", 'cp775': 'cp775',
"ibm819", 'cp819': 'windows-1252',
"cp819", 'cp850': 'cp850',
"csisolatin1", 'cp852': 'cp852',
"iso_8859-2:1987", 'cp855': 'cp855',
"iso-ir-101", 'cp857': 'cp857',
"iso_8859-2", 'cp860': 'cp860',
"iso-8859-2", 'cp861': 'cp861',
"latin2", 'cp862': 'cp862',
"l2", 'cp863': 'cp863',
"csisolatin2", 'cp864': 'cp864',
"iso_8859-3:1988", 'cp865': 'cp865',
"iso-ir-109", 'cp866': 'cp866',
"iso_8859-3", 'cp869': 'cp869',
"iso-8859-3", 'cp936': 'gbk',
"latin3", 'cpgr': 'cp869',
"l3", 'cpis': 'cp861',
"csisolatin3", 'csascii': 'ascii',
"iso_8859-4:1988", 'csbig5': 'big5',
"iso-ir-110", 'cseuckr': 'cp949',
"iso_8859-4", 'cseucpkdfmtjapanese': 'euc_jp',
"iso-8859-4", 'csgb2312': 'gbk',
"latin4", 'cshproman8': 'hp-roman8',
"l4", 'csibm037': 'cp037',
"csisolatin4", 'csibm1026': 'cp1026',
"iso_8859-6:1987", 'csibm424': 'cp424',
"iso-ir-127", 'csibm500': 'cp500',
"iso_8859-6", 'csibm855': 'cp855',
"iso-8859-6", 'csibm857': 'cp857',
"ecma-114", 'csibm860': 'cp860',
"asmo-708", 'csibm861': 'cp861',
"arabic", 'csibm863': 'cp863',
"csisolatinarabic", 'csibm864': 'cp864',
"iso_8859-7:1987", 'csibm865': 'cp865',
"iso-ir-126", 'csibm866': 'cp866',
"iso_8859-7", 'csibm869': 'cp869',
"iso-8859-7", 'csiso2022jp': 'iso2022_jp',
"elot_928", 'csiso2022jp2': 'iso2022_jp_2',
"ecma-118", 'csiso2022kr': 'iso2022_kr',
"greek", 'csiso58gb231280': 'gbk',
"greek8", 'csisolatin1': 'windows-1252',
"csisolatingreek", 'csisolatin2': 'iso8859-2',
"iso_8859-8:1988", 'csisolatin3': 'iso8859-3',
"iso-ir-138", 'csisolatin4': 'iso8859-4',
"iso_8859-8", 'csisolatin5': 'windows-1254',
"iso-8859-8", 'csisolatin6': 'iso8859-10',
"hebrew", 'csisolatinarabic': 'iso8859-6',
"csisolatinhebrew", 'csisolatincyrillic': 'iso8859-5',
"iso_8859-5:1988", 'csisolatingreek': 'iso8859-7',
"iso-ir-144", 'csisolatinhebrew': 'iso8859-8',
"iso_8859-5", 'cskoi8r': 'koi8-r',
"iso-8859-5", 'csksc56011987': 'cp949',
"cyrillic", 'cspc775baltic': 'cp775',
"csisolatincyrillic", 'cspc850multilingual': 'cp850',
"iso_8859-9:1989", 'cspc862latinhebrew': 'cp862',
"iso-ir-148", 'cspc8codepage437': 'cp437',
"iso_8859-9", 'cspcp852': 'cp852',
"iso-8859-9", 'csptcp154': 'ptcp154',
"latin5", 'csshiftjis': 'shift_jis',
"l5", 'csunicode11utf7': 'utf-7',
"csisolatin5", 'cyrillic': 'iso8859-5',
"iso-8859-10", 'cyrillicasian': 'ptcp154',
"iso-ir-157", 'ebcdiccpbe': 'cp500',
"l6", 'ebcdiccpca': 'cp037',
"iso_8859-10:1992", 'ebcdiccpch': 'cp500',
"csisolatin6", 'ebcdiccphe': 'cp424',
"latin6", 'ebcdiccpnl': 'cp037',
"hp-roman8", 'ebcdiccpus': 'cp037',
"roman8", 'ebcdiccpwt': 'cp037',
"r8", 'ecma114': 'iso8859-6',
"ibm037", 'ecma118': 'iso8859-7',
"cp037", 'elot928': 'iso8859-7',
"csibm037", 'eucjp': 'euc_jp',
"ibm424", 'euckr': 'cp949',
"cp424", 'extendedunixcodepackedformatforjapanese': 'euc_jp',
"csibm424", 'gb18030': 'gb18030',
"ibm437", 'gb2312': 'gbk',
"cp437", 'gb231280': 'gbk',
"437", 'gbk': 'gbk',
"cspc8codepage437", 'greek': 'iso8859-7',
"ibm500", 'greek8': 'iso8859-7',
"cp500", 'hebrew': 'iso8859-8',
"csibm500", 'hproman8': 'hp-roman8',
"ibm775", 'hzgb2312': 'hz',
"cp775", 'ibm037': 'cp037',
"cspc775baltic", 'ibm1026': 'cp1026',
"ibm850", 'ibm367': 'ascii',
"cp850", 'ibm424': 'cp424',
"850", 'ibm437': 'cp437',
"cspc850multilingual", 'ibm500': 'cp500',
"ibm852", 'ibm775': 'cp775',
"cp852", 'ibm819': 'windows-1252',
"852", 'ibm850': 'cp850',
"cspcp852", 'ibm852': 'cp852',
"ibm855", 'ibm855': 'cp855',
"cp855", 'ibm857': 'cp857',
"855", 'ibm860': 'cp860',
"csibm855", 'ibm861': 'cp861',
"ibm857", 'ibm862': 'cp862',
"cp857", 'ibm863': 'cp863',
"857", 'ibm864': 'cp864',
"csibm857", 'ibm865': 'cp865',
"ibm860", 'ibm866': 'cp866',
"cp860", 'ibm869': 'cp869',
"860", 'iso2022jp': 'iso2022_jp',
"csibm860", 'iso2022jp2': 'iso2022_jp_2',
"ibm861", 'iso2022kr': 'iso2022_kr',
"cp861", 'iso646irv1991': 'ascii',
"861", 'iso646us': 'ascii',
"cp-is", 'iso88591': 'windows-1252',
"csibm861", 'iso885910': 'iso8859-10',
"ibm862", 'iso8859101992': 'iso8859-10',
"cp862", 'iso885911987': 'windows-1252',
"862", 'iso885913': 'iso8859-13',
"cspc862latinhebrew", 'iso885914': 'iso8859-14',
"ibm863", 'iso8859141998': 'iso8859-14',
"cp863", 'iso885915': 'iso8859-15',
"863", 'iso885916': 'iso8859-16',
"csibm863", 'iso8859162001': 'iso8859-16',
"ibm864", 'iso88592': 'iso8859-2',
"cp864", 'iso885921987': 'iso8859-2',
"csibm864", 'iso88593': 'iso8859-3',
"ibm865", 'iso885931988': 'iso8859-3',
"cp865", 'iso88594': 'iso8859-4',
"865", 'iso885941988': 'iso8859-4',
"csibm865", 'iso88595': 'iso8859-5',
"ibm866", 'iso885951988': 'iso8859-5',
"cp866", 'iso88596': 'iso8859-6',
"866", 'iso885961987': 'iso8859-6',
"csibm866", 'iso88597': 'iso8859-7',
"ibm869", 'iso885971987': 'iso8859-7',
"cp869", 'iso88598': 'iso8859-8',
"869", 'iso885981988': 'iso8859-8',
"cp-gr", 'iso88599': 'windows-1254',
"csibm869", 'iso885991989': 'windows-1254',
"ibm1026", 'isoceltic': 'iso8859-14',
"cp1026", 'isoir100': 'windows-1252',
"csibm1026", 'isoir101': 'iso8859-2',
"koi8-r", 'isoir109': 'iso8859-3',
"cskoi8r", 'isoir110': 'iso8859-4',
"koi8-u", 'isoir126': 'iso8859-7',
"big5-hkscs", 'isoir127': 'iso8859-6',
"ptcp154", 'isoir138': 'iso8859-8',
"csptcp154", 'isoir144': 'iso8859-5',
"pt154", 'isoir148': 'windows-1254',
"cp154", 'isoir149': 'cp949',
"utf-7", 'isoir157': 'iso8859-10',
"utf-16be", 'isoir199': 'iso8859-14',
"utf-16le", 'isoir226': 'iso8859-16',
"utf-16", 'isoir58': 'gbk',
"utf-8", 'isoir6': 'ascii',
"iso-8859-13", 'koi8r': 'koi8-r',
"iso-8859-14", 'koi8u': 'koi8-u',
"iso-ir-199", 'korean': 'cp949',
"iso_8859-14:1998", 'ksc5601': 'cp949',
"iso_8859-14", 'ksc56011987': 'cp949',
"latin8", 'ksc56011989': 'cp949',
"iso-celtic", 'l1': 'windows-1252',
"l8", 'l10': 'iso8859-16',
"iso-8859-15", 'l2': 'iso8859-2',
"iso_8859-15", 'l3': 'iso8859-3',
"iso-8859-16", 'l4': 'iso8859-4',
"iso-ir-226", 'l5': 'windows-1254',
"iso_8859-16:2001", 'l6': 'iso8859-10',
"iso_8859-16", 'l8': 'iso8859-14',
"latin10", 'latin1': 'windows-1252',
"l10", 'latin10': 'iso8859-16',
"gbk", 'latin2': 'iso8859-2',
"cp936", 'latin3': 'iso8859-3',
"ms936", 'latin4': 'iso8859-4',
"gb18030", 'latin5': 'windows-1254',
"shift_jis", 'latin6': 'iso8859-10',
"ms_kanji", 'latin8': 'iso8859-14',
"csshiftjis", 'latin9': 'iso8859-15',
"euc-jp", 'ms936': 'gbk',
"gb2312", 'mskanji': 'shift_jis',
"big5", 'pt154': 'ptcp154',
"csbig5", 'ptcp154': 'ptcp154',
"windows-1250", 'r8': 'hp-roman8',
"windows-1251", 'roman8': 'hp-roman8',
"windows-1252", 'shiftjis': 'shift_jis',
"windows-1253", 'tis620': 'cp874',
"windows-1254", 'unicode11utf7': 'utf-7',
"windows-1255", 'us': 'ascii',
"windows-1256", 'usascii': 'ascii',
"windows-1257", 'utf16': 'utf-16',
"windows-1258", 'utf16be': 'utf-16-be',
"tis-620", 'utf16le': 'utf-16-le',
"hz-gb-2312", 'utf8': 'utf-8',
)) 'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype":0,
"Characters":1,
"SpaceCharacters":2,
"StartTag":3,
"EndTag":4,
"EmptyTag":5,
"Comment":6,
"ParseError":7
}
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]))
prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
pass
class ReparseException(Exception):
pass
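
A small illustration, not part of the commit, of how the new constants are meant to be used: parser phases look up a key in E and interpolate the token's data, and prefixes maps namespace URIs back to their conventional prefixes.

from html5lib.constants import E, tokenTypes, prefixes

print E["unexpected-start-tag"] % {"name": u"div"}
# -> Unexpected start tag (div).

token = {"type": tokenTypes["StartTag"], "name": u"math",
         "namespace": u"http://www.w3.org/1998/Math/MathML"}
print prefixes[token["namespace"]]   # -> math (overridden above from "mathml")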

View File

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token
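
A hypothetical usage sketch for the new form-filling filter (its path is not shown in this view; upstream html5lib ships it as html5lib.filters.formfiller). FakeFieldStorage is a stand-in for cgi.FieldStorage, which provides the same getlist() interface.

import html5lib
from html5lib import treewalkers, serializer
from html5lib.filters.formfiller import SimpleFilter   # assumed module path

class FakeFieldStorage(dict):
    def getlist(self, name):
        return self.get(name, [])

doc = html5lib.parse('<form><input type=text name=q>'
                     '<input type=checkbox name=opt value=1></form>')
walker = treewalkers.getTreeWalker("simpletree")
values = FakeFieldStorage({"q": [u"hello"], "opt": [u"1"]})

# Seed the form with the submitted values, then re-serialize it.
stream = SimpleFilter(walker(doc), values)
print ''.join(serializer.HTMLSerializer().serialize(stream))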

View File

@ -14,7 +14,8 @@ class Filter(_base.Filter):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
-                if token["data"] or not self.is_optional_start(token["name"], previous, next):
+                if (token["data"] or
+                    not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
@ -31,7 +32,11 @@ class Filter(_base.Filter):
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
-            return type == "StartTag"
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
@ -52,7 +57,7 @@ class Filter(_base.Filter):
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceeded by another colgroup element whose
            # end tag has been omitted.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
@ -81,16 +86,13 @@ class Filter(_base.Filter):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
-        elif tagname in ('li', 'optgroup', 'option', 'tr'):
+        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
-            # An option element's end tag may be omitted if the option
-            # element is immediately followed by another option element,
-            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
@ -112,14 +114,39 @@ class Filter(_base.Filter):
            return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
-            # immediately followed by an address, blockquote, dl, fieldset,
-            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
-            # or ul element, or if there is no more content in the parent
+            # immediately followed by an address, article, aside,
+            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+            # nav, ol, p, pre, section, table, or ul, element, or if
+            # there is no more content in the parent element.
+            if type in ("StartTag", "EmptyTag"):
+                return next["name"] in ('address', 'article', 'aside',
+                                        'blockquote', 'datagrid', 'dialog',
+                                        'dir', 'div', 'dl', 'fieldset', 'footer',
+                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                        'header', 'hr', 'menu', 'nav', 'ol',
+                                        'p', 'pre', 'section', 'table', 'ul')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'option':
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if it is immediately followed by an <code>optgroup</code>
+            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
-                return next["name"] in ('address', 'blockquote', \
-                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
-                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+                return next["name"] in ('option', 'optgroup')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('rt', 'rp'):
+            # An rt element's end tag may be omitted if the rt element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            # An rp element's end tag may be omitted if the rp element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
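
These rules are what the serializer applies when omit_optional_tags is enabled, so the additions above (option, rt/rp, the longer p list) show up directly in its output. A small sketch, assuming the standard serializer options:

import html5lib
from html5lib import treewalkers, serializer

doc = html5lib.parse("<select><option>a<option>b</select>")
walker = treewalkers.getTreeWalker("simpletree")

s = serializer.HTMLSerializer(omit_optional_tags=True)
print ''.join(s.serialize(walker(doc)))
# </option> is omitted before a following <option>, per the rule added above.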

View File

@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
def __iter__(self):
for token in _base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token: yield token
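
This tiny filter wraps the same HTMLSanitizerMixin used by the tokenizer, so sanitization can also happen at serialization time. A sketch, assuming the module lands under html5lib.filters as in upstream html5lib:

import html5lib
from html5lib import treewalkers, serializer
from html5lib.filters import sanitizer as sanitizer_filter   # assumed path

doc = html5lib.parse('<p onmouseover="x()">ok</p>')
walker = treewalkers.getTreeWalker("simpletree")
clean = sanitizer_filter.Filter(walker(doc))

print ''.join(serializer.HTMLSerializer().serialize(clean))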

File diff suppressed because it is too large

planet/vendor/html5lib/ihatexml.py vendored normal file (170 additions)
View File

@ -0,0 +1,170 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
#Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1]
j += 1
i += j
return rv
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1]+1, charList[i+1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(intToUnicodeStr(item[0]))
else:
rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
return "[%s]"%"|".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def intToUnicodeStr(intValue):
#There must be a better (non-evil) way to do this
return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, r"\\" + char)
if char in string:
print string
return string
#output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
replaceRanges = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
if replaceRanges is not None or replaceChars is not None:
raise NotImplementedError
else:
self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
#Need a datalosswarning here
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
return None
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
data = data.replace("--", "- -")
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
data = data.replace("\x0C", " ")
#Other non-xml characters
return data
def toXmlName(self, name):
replaceChars = set(self.replaceCharsRegexp.findall(name))
for char in replaceChars:
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
name = name.replace(char, replacement)
return name
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return unichr(int(charcode[1:], 16))
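
A quick illustration, not part of the commit, of what InfosetFilter does with names that are legal in HTML but not in XML: offending characters are replaced with a reversible UXXXXX escape.

from html5lib.ihatexml import InfosetFilter

f = InfosetFilter()
print f.toXmlName(u"foo bar")          # space is not an XML NameChar -> u"fooU00020bar"
print f.fromXmlName(u"fooU00020bar")   # -> u"foo bar"
print f.coerceComment(u"a -- b")       # unchanged unless preventDoubleDashComments=True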

View File

@ -1,15 +1,109 @@
import codecs
import re
import types
+import sys
+from gettext import gettext
+_ = gettext
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings
+from constants import encodings, ReparseException
+from utils import MethodDispatcher
-class HTMLInputStream(object):
+#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream:
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1,0] #chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos < self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= pos
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
data = rv.append(bufferedData[bufferOffset:
bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return "".join(rv)
class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer. """Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing This class takes care of character encoding and removing or replacing
@ -17,11 +111,13 @@ class HTMLInputStream(object):
""" """
_defaultChunkSize = 10240
def __init__(self, source, encoding=None, parseMeta=True, chardet=True): def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream. """Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by the HTML5Lib. for use by html5lib.
source can be either a file-object, local filename or a string. source can be either a file-object, local filename or a string.
@ -33,10 +129,17 @@ class HTMLInputStream(object):
parseMeta - Look for a <meta> element containing encoding information parseMeta - Look for a <meta> element containing encoding information
""" """
#Craziness
if len(u"\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur # List of where new lines occur
self.newLines = [0] self.newLines = [0]
self.charEncoding = encoding self.charEncoding = (codecName(encoding), "certain")
# Raw Stream - for unicode objects this will encode to utf-8 and set # Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate # self.charEncoding as appropriate
@ -52,17 +155,25 @@ class HTMLInputStream(object):
self.defaultEncoding = "windows-1252" self.defaultEncoding = "windows-1252"
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if self.charEncoding is None or not isValidEncoding(self.charEncoding): if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
'replace')
self.queue = [] self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = [] self.errors = []
self.line = self.col = 0 # number of (complete) lines in previous chunks
self.lineLengths = [] self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
#Flag to indicate we may have a CR LF broken across a data chunk #Flag to indicate we may have a CR LF broken across a data chunk
self._lastChunkEndsWithCR = False self._lastChunkEndsWithCR = False
@ -80,22 +191,29 @@ class HTMLInputStream(object):
# Otherwise treat source as a string and convert to a file object # Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode): if isinstance(source, unicode):
source = source.encode('utf-8') source = source.encode('utf-8')
self.charEncoding = "utf-8" self.charEncoding = ("utf-8", "certain")
import cStringIO import cStringIO
stream = cStringIO.StringIO(str(source)) stream = cStringIO.StringIO(str(source))
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
stream is sys.stdin):
stream = BufferedStream(stream)
return stream return stream
def detectEncoding(self, parseMeta=True, chardet=True): def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM #First look for a BOM
#This will also read past the BOM if present #This will also read past the BOM if present
encoding = self.detectBOM() encoding = self.detectBOM()
confidence = "certain"
#If there is no BOM need to look for meta elements with encoding #If there is no BOM need to look for meta elements with encoding
#information #information
if encoding is None and parseMeta: if encoding is None and parseMeta:
encoding = self.detectEncodingMeta() encoding = self.detectEncodingMeta()
confidence = "tentative"
#Guess with chardet, if available #Guess with chardet, if available
if encoding is None and chardet: if encoding is None and chardet:
confidence = "tentative"
try: try:
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
buffers = [] buffers = []
@ -108,11 +226,12 @@ class HTMLInputStream(object):
detector.feed(buffer) detector.feed(buffer)
detector.close() detector.close()
encoding = detector.result['encoding'] encoding = detector.result['encoding']
self.seek("".join(buffers), 0) self.rawStream.seek(0)
except ImportError: except ImportError:
pass pass
# If all else fails use the default encoding # If all else fails use the default encoding
if encoding is None: if encoding is None:
confidence="tentative"
encoding = self.defaultEncoding encoding = self.defaultEncoding
#Substitute for equivalent encodings: #Substitute for equivalent encodings:
@ -121,8 +240,22 @@ class HTMLInputStream(object):
if encoding.lower() in encodingSub: if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()] encoding = encodingSub[encoding.lower()]
return encoding return encoding, confidence
def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
oldEncoding = self.charEncoding[0]
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException, "Encoding changed from %s to %s" % (oldEncoding, newEncoding)
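The reparse protocol implied by changeEncoding() is worth spelling out: the caller is expected to catch ReparseException, after which the raw stream has been rewound and charEncoding updated. A rough sketch (the import locations are assumptions, not part of this change):

    from html5lib.inputstream import HTMLInputStream
    from html5lib.constants import ReparseException   # assumed location of the exception

    stream = HTMLInputStream("<p>plain ascii text")
    try:
        # pretend a late <meta> disagreed with the detected encoding
        stream.changeEncoding("iso-8859-2")
    except ReparseException:
        # the raw stream has been rewound and charEncoding is now certain;
        # the caller starts parsing again from the top of the stream
        pass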
def detectBOM(self): def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If """Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the an encoding can be determined from the BOM return the name of the
@ -149,198 +282,219 @@ class HTMLInputStream(object):
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
self.seek(string, encoding and seek or 0) self.rawStream.seek(encoding and seek or 0)
return encoding return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self): def detectEncodingMeta(self):
"""Report the encoding declared by the meta element """Report the encoding declared by the meta element
""" """
buffer = self.rawStream.read(self.numBytesMeta) buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer) parser = EncodingParser(buffer)
self.seek(buffer, 0) self.rawStream.seek(0)
return parser.getEncoding() encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
return encoding
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count(u'\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind(u'\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self): def position(self):
"""Returns (line, col) of the current position in the stream.""" """Returns (line, col) of the current position in the stream."""
line, col = self.line, self.col line, col = self._position(self.chunkOffset)
return (line + 1, col) return (line+1, col)
def char(self): def char(self):
""" Read one character from the stream or queue if available. Return """ Read one character from the stream or queue if available. Return
EOF when EOF is reached. EOF when EOF is reached.
""" """
if not self.queue: # Read a new chunk from the input stream if necessary
self.readChunk() if self.chunkOffset >= self.chunkSize:
#If we still don't have a character we have reached EOF if not self.readChunk():
if not self.queue: return EOF
return EOF
chunkOffset = self.chunkOffset
char = self.queue.pop(0) char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
# update position in stream
if char == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return char return char
def readChunk(self, chunkSize=10240): def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize) data = self.dataStream.read(chunkSize)
if not data: if not data:
return return False
#Replace null characters
for i in xrange(data.count(u"\u0000")): self.reportCharacterErrors(data)
self.errors.append(_('null character found in input stream, '
'replaced with U+FFFD'))
data = data.replace(u"\u0000", u"\ufffd") data = data.replace(u"\u0000", u"\ufffd")
#Check for CR LF broken across chunks #Check for CR LF broken across chunks
if (self._lastChunkEndsWithCR and data[0] == "\n"): if (self._lastChunkEndsWithCR and data[0] == u"\n"):
data = data[1:] data = data[1:]
self._lastChunkEndsWithCR = data[-1] == "\r" # Stop if the chunk is now empty
data = data.replace("\r\n", "\n") if not data:
data = data.replace("\r", "\n") return False
self._lastChunkEndsWithCR = data[-1] == u"\r"
data = unicode(data) data = data.replace(u"\r\n", u"\n")
self.queue.extend([char for char in data]) data = data.replace(u"\r", u"\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
for i in xrange(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
#Someone picked the wrong compile option
#You lose
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
skip = False
import sys
for match in invalid_unicode_re.finditer(data):
if skip:
continue
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
#We have a surrogate pair!
#From a perl manpage
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
#This is still wrong if it is possible for a surrogate pair to break a
#chunk boundary
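For reference, the surrogate recombination above is the standard UTF-16 formula; a worked example:

    # The pair U+D83D U+DE00 recombines to the single supplementary codepoint U+1F600.
    high, low = 0xD83D, 0xDE00
    char_val = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
    assert char_val == 0x1F600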
def charsUntil(self, characters, opposite = False): def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not """ Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be including any character in 'characters' or EOF. 'characters' must be
any container that supports the in method being called on it. a container that supports the 'in' method and iteration over its
characters.
""" """
#This method is currently 40-50% of our total runtime and badly needs # Use a cache of regexps to find the required characters
#optimizing try:
#Possible improvements: chars = charsUntilRegEx[(characters, opposite)]
# - use regexp to find characters that match the required character set except KeyError:
# (with regexp cache since we do the same searches many many times) if __debug__:
# - improve EOF handling for fewer if statements for c in characters:
assert(ord(c) < 128)
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = u"^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
if not self.queue: rv = []
self.readChunk()
#Break if we have reached EOF while True:
if not self.queue or self.queue[0] == None: # Find the longest matching prefix
return u"" m = chars.match(self.chunk, self.chunkOffset)
if m is None:
i = 0 # If nothing matched, and it wasn't because we ran out of chunk,
while (self.queue[i] in characters) == opposite: # then stop
i += 1 if self.chunkOffset != self.chunkSize:
if i == len(self.queue): break
self.readChunk()
#If the queue doesn't grow we have reached EOF
if i == len(self.queue) or self.queue[i] is EOF:
break
#XXX- wallpaper over bug in calculation below
#Otherwise change the stream position
if self.queue[i] == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else: else:
self.col += 1 end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
rv = u"".join(self.queue[:i]) r = u"".join(rv)
self.queue = self.queue[i:] return r
#Calculate where we now are in the stream
#One possible optimisation would be to store all read characters and
#Calculate this on an as-needed basis (perhaps flushing the read data
#every time we read a new chunk) rather than once per call here and
#in .char()
#XXX Temporarily disable this because there is a bug
#lines = rv.split("\n")
#
#if lines:
# #Add number of lines passed onto positon
# oldCol = self.col
# self.line += len(lines)-1
# if len(lines) > 1:
# self.col = len(lines[-1])
# else:
# self.col += len(lines[0])
#
# if self.lineLengths and oldCol > 0:
# self.lineLengths[-1] += len(lines[0])
# lines = lines[1:-1]
# else:
# lines = lines[:-1]
#
# for line in lines:
# self.lineLengths.append(len(line))
#
return rv
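A minimal illustration of the new chunk-and-regexp charsUntil() (building the stream directly from a str is an assumed but typical usage, not something this change adds):

    from html5lib.inputstream import HTMLInputStream

    stream = HTMLInputStream("some text <b>bold</b>")
    print stream.charsUntil("<")   # everything up to, but not including, the "<"
    print stream.char()            # the "<" itself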
def unget(self, chars): def unget(self, char):
if chars: # Only one character is allowed to be ungotten at once - it must
self.queue = list(chars) + self.queue # be consumed again before any further call to unget
#Alter the current line, col position
for c in chars[::-1]: if char is not None:
if c == '\n': if self.chunkOffset == 0:
self.line -= 1 # unget is called quite rarely, so it's a good idea to do
self.col = self.lineLengths[self.line] # more work here if it saves a bit of work in the frequently
else: # called char and charsUntil.
self.col -= 1 # So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class EncodingBytes(str): class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods """String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is If the position is ever greater than the string length then an exception is
raised""" raised"""
def __new__(self, value):
return str.__new__(self, value)
def __init__(self, value): def __init__(self, value):
str.__init__(self, value)
self._position=-1 self._position=-1
def __iter__(self): def __iter__(self):
return self return self
def next(self): def next(self):
self._position += 1 p = self._position = self._position + 1
rv = self[self.position] if p >= len(self):
return rv raise StopIteration
elif p < 0:
raise TypeError
return self[p]
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p]
def setPosition(self, position): def setPosition(self, position):
if self._position >= len(self): if self._position >= len(self):
@ -362,20 +516,39 @@ class EncodingBytes(str):
currentByte = property(getCurrentByte) currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters): def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters""" """Skip past a list of characters"""
while self.currentByte in chars: p = self.position # use property for the error-checking
self.position += 1 while p < len(self):
c = self[p]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
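A small sketch of how the EncodingBytes cursor helpers compose (the results noted in the comments follow from the definitions above, not from a recorded run; the import path is an assumption):

    from html5lib.inputstream import EncodingBytes

    data = EncodingBytes("<meta charset=utf-8>")
    first = data.next()                    # "<" -- the internal position starts at -1
    eq = data.skipUntil(frozenset("="))    # advances to, and returns, the "=" byte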
def matchBytes(self, bytes, lower=False): def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes """Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone""" match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)] p = self.position
data = self[p:p+len(bytes)]
if lower: if lower:
data = data.lower() data = data.lower()
rv = data.startswith(bytes) rv = data.startswith(bytes)
if rv == True: if rv:
self.position += len(bytes) self.position += len(bytes)
return rv return rv
@ -388,12 +561,6 @@ class EncodingBytes(str):
return True return True
else: else:
raise StopIteration raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
class EncodingParser(object): class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements""" """Mini parser for detecting character encoding from meta elements"""
@ -423,8 +590,7 @@ class EncodingParser(object):
break break
if not keepParsing: if not keepParsing:
break break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding return self.encoding
def handleComment(self): def handleComment(self):
@ -432,7 +598,7 @@ class EncodingParser(object):
return self.data.jumpTo("-->") return self.data.jumpTo("-->")
def handleMeta(self): def handleMeta(self):
if self.data.currentByte not in spaceCharacters: if self.data.currentByte not in spaceCharactersBytes:
#if we have <meta not followed by a space, just keep going #if we have <meta not followed by a space, just keep going
return True return True
#We have a valid meta element we want to search for attributes #We have a valid meta element we want to search for attributes
@ -444,38 +610,41 @@ class EncodingParser(object):
else: else:
if attr[0] == "charset": if attr[0] == "charset":
tentativeEncoding = attr[1] tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding): codec = codecName(tentativeEncoding)
self.encoding = tentativeEncoding if codec is not None:
self.encoding = codec
return False return False
elif attr[0] == "content": elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1])) contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse() tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding): codec = codecName(tentativeEncoding)
self.encoding = tentativeEncoding if codec is not None:
self.encoding = codec
return False return False
def handlePossibleStartTag(self): def handlePossibleStartTag(self):
return self.handlePossibleTag(False) return self.handlePossibleTag(False)
def handlePossibleEndTag(self): def handlePossibleEndTag(self):
self.data.position+=1 self.data.next()
return self.handlePossibleTag(True) return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag): def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters: data = self.data
if data.currentByte not in asciiLettersBytes:
#If the next byte is not an ascii letter either ignore this #If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to #fragment (possible start tag case) or treat it according to
#handleOther #handleOther
if endTag: if endTag:
self.data.position -= 1 data.previous()
self.handleOther() self.handleOther()
return True return True
self.data.findNext(list(spaceCharacters) + ["<", ">"]) c = data.skipUntil(spacesAngleBrackets)
if self.data.currentByte == "<": if c == "<":
#return to the first step in the overall "two step" algorithm #return to the first step in the overall "two step" algorithm
#reprocessing the < byte #reprocessing the < byte
self.data.position -= 1 data.previous()
else: else:
#Read all attributes #Read all attributes
attr = self.getAttribute() attr = self.getAttribute()
@ -489,73 +658,75 @@ class EncodingParser(object):
def getAttribute(self): def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream, """Return a name,value pair for the next attribute in the stream,
if one is found, or None""" if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"]) data = self.data
if self.data.currentByte == "<": c = data.skip(spaceCharactersBytes | frozenset("/"))
self.data.position -= 1 if c == "<":
data.previous()
return None return None
elif self.data.currentByte == ">": elif c == ">" or c is None:
return None return None
attrName = [] attrName = []
attrValue = [] attrValue = []
spaceFound = False spaceFound = False
#Step 5 attribute name #Step 5 attribute name
while True: while True:
if self.data.currentByte == "=" and attrName: if c == "=" and attrName:
break break
elif self.data.currentByte in spaceCharacters: elif c in spaceCharactersBytes:
spaceFound=True spaceFound=True
break break
elif self.data.currentByte in ("/", "<", ">"): elif c in ("/", "<", ">"):
return "".join(attrName), "" return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrName.extend(self.data.currentByte.lower()) attrName.append(c.lower())
else: else:
attrName.extend(self.data.currentByte) attrName.append(c)
#Step 6 #Step 6
self.data.position += 1 c = data.next()
#Step 7 #Step 7
if spaceFound: if spaceFound:
self.data.skip() c = data.skip()
#Step 8 #Step 8
if self.data.currentByte != "=": if c != "=":
self.data.position -= 1 data.previous()
return "".join(attrName), "" return "".join(attrName), ""
#XXX need to advance position in both spaces and value case #XXX need to advance position in both spaces and value case
#Step 9 #Step 9
self.data.position += 1 data.next()
#Step 10 #Step 10
self.data.skip() c = data.skip()
#Step 11 #Step 11
if self.data.currentByte in ("'", '"'): if c in ("'", '"'):
#11.1 #11.1
quoteChar = self.data.currentByte quoteChar = c
while True: while True:
self.data.position+=1
#11.3 #11.3
if self.data.currentByte == quoteChar: c = data.next()
self.data.position += 1 if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
#11.4 #11.4
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
#11.5 #11.5
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
elif self.data.currentByte in (">", '<'): elif c in (">", "<"):
return "".join(attrName), "" return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
elif c is None:
return None
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
while True: while True:
self.data.position +=1 c = data.next()
if self.data.currentByte in ( if c in spacesAngleBrackets:
list(spaceCharacters) + [">", '<']):
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
class ContentAttrParser(object): class ContentAttrParser(object):
@ -588,7 +759,7 @@ class ContentAttrParser(object):
#Unquoted value #Unquoted value
oldPosition = self.data.position oldPosition = self.data.position
try: try:
self.data.findNext(spaceCharacters) self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position] return self.data[oldPosition:self.data.position]
except StopIteration: except StopIteration:
#Return the whole remaining value #Return the whole remaining value
@ -596,7 +767,12 @@ class ContentAttrParser(object):
except StopIteration: except StopIteration:
return None return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding""" def codecName(encoding):
return (encoding is not None and type(encoding) == types.StringType and """Return the python codec name corresponding to an encoding or None if the
encoding.lower().strip() in encodings) string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
else:
return None
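Assumed behaviour of the new codecName() helper: ASCII punctuation is stripped, the label is lowercased, and the result is looked up in the module's encodings map, so differently punctuated spellings collapse onto one codec name.

    from html5lib.inputstream import codecName

    codecName("UTF-8")      # a recognised label comes back as its canonical codec name
    codecName("utf8")       # same lookup key once punctuation is stripped and case folded
    codecName("bogus!!")    # unknown labels return None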

View File

@ -1,147 +0,0 @@
"""
Warning: this module is experimental and subject to change and even removal
at any time.
For background/rationale, see:
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
* http://tinyurl.com/ylfj8k (and follow-ups)
References:
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements, contentModelFlags
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] in ("StartTag", "EmptyTag"):
token["data"] = dict(token["data"][::-1])
# For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
**kwargs):
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
encoding, lowercaseElementName=False,
lowercaseAttrName=False)
class XHTMLParser(XMLParser):
""" liberal XMTHML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
token = XMLParser.normalizeToken(self, token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token
class XhmlRootPhase(html5parser.RootElementPhase):
def insertHtmlElement(self):
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
self.tree.openElements.append(element)
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """
def __getattr__(self, name):
self.tree.openElements.append(self.tree.document)
self.parser.phase = XmlElementPhase(self.parser, self.tree)
return getattr(self.parser.phase, name)
class XmlElementPhase(html5parser.Phase):
""" Generic handling for all XML elements """
def __init__(self, *args, **kwargs):
html5parser.Phase.__init__(self, *args, **kwargs)
self.startTagHandler = html5parser.utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = html5parser.utils.MethodDispatcher([])
self.endTagHandler.default = self.endTagOther
def startTagOther(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
while self.tree.openElements.pop() != node:
pass
break
else:
self.parser.parseError()
def processCharacters(self, data):
self.tree.insertText(data)

View File

@ -1,6 +1,8 @@
import re import re
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object): class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
'arabic-form', 'ascent', 'attributeName', 'attributeType', 'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'font-family', 'font-size', 'font-stretch', 'font-style', 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
@ -82,6 +84,13 @@ class HTMLSanitizerMixin(object):
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base'] 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color', acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color', 'border-bottom-color', 'border-collapse', 'border-color',
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def sanitize_token(self, token): def sanitize_token(self, token):
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]: if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements: if token["name"] in self.allowed_elements:
if token.has_key("data"): if token.has_key("data"):
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes]) attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri: for attr in self.attr_val_is_uri:
if not attrs.has_key(attr): continue if not attrs.has_key(attr):
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() continue
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols): val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr] del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'): if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style']) attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()] token["data"] = [[name,val] for name,val in attrs.items()]
return token return token
else: else:
if token["type"] == "EndTag": if token["type"] == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"] token["data"] = "</%s>" % token["name"]
elif token["data"]: elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs) token["data"] = "<%s%s>" % (token["name"],attrs)
else: else:
token["data"] = "<%s>" % token["name"] token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag": if token["type"] == tokenTypes["EmptyTag"]:
token["data"]=token["data"][:-1] + "/>" token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters" token["type"] = tokenTypes["Characters"]
del token["name"] del token["name"]
return token return token
elif token["type"] == "Comment": elif token["type"] == tokenTypes["Comment"]:
pass pass
else: else:
return token return token
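A hedged usage sketch for the mixin: it is normally combined with the tokenizer (the HTMLSanitizer class further down) and handed to the parser, so disallowed markup comes back escaped as character tokens rather than being parsed as live elements:

    import html5lib
    from html5lib import sanitizer

    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = p.parseFragment('<p onclick="evil()">hi</p><script>x()</script>')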
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
# gauntlet # gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = [] clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue if not value: continue
if prop.lower() in self.allowed_css_properties: if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']: elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split(): for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \ if not keyword in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
return ' '.join(clean) return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False): lowercaseElementName=False, lowercaseAttrName=False):
#Change case matching defaults as we only output lowercase html anyway #Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal... #This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName) lowercaseElementName, lowercaseAttrName)
def __iter__(self): def __iter__(self):

View File

@ -1,3 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)
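Assumed usage of the new serialize() convenience wrapper: parse into the default simpletree and render the walked tree back out as HTML.

    import html5lib
    from html5lib.serializer import serialize

    doc = html5lib.HTMLParser().parse("<p>Hello <b>world")
    print serialize(doc, tree="simpletree", format="html")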

View File

@ -147,7 +147,7 @@ class HTMLSerializer(object):
quote_attr = True quote_attr = True
else: else:
quote_attr = reduce(lambda x,y: x or (y in v), quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + "<>\"'", False) spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;") v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;") if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding: if encoding:

File diff suppressed because it is too large

View File

@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
"simpletree" - a built-in DOM-ish tree type with support for some "simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms. more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation "dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implementation).
"etree" - A generic builder for tree implementations exposing an "etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree). ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed) "beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" tree type only). A module implementation - (Currently applies to the "etree" and "dom" tree types). A
implementing the tree type e.g. xml.etree.ElementTree or module implementing the tree type e.g.
lxml.etree.""" xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeBuilderCache: if treeType not in treeBuilderCache:
if treeType in ("dom", "simpletree"): if treeType == "dom":
mod = __import__(treeType, globals()) import dom
treeBuilderCache[treeType] = mod.TreeBuilder # XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup": elif treeType == "beautifulsoup":
import soup import soup
treeBuilderCache[treeType] = soup.TreeBuilder treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree": elif treeType == "etree":
import etree import etree
# XXX: NEVER cache here, caching is done in the etree submodule # XXX: NEVER cache here, caching is done in the etree submodule
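Assumed usage of the reworked getTreeBuilder(): asking for "dom" without an implementation keeps the old minidom behaviour.

    import html5lib
    from html5lib.treebuilders import getTreeBuilder

    TreeBuilder = getTreeBuilder("dom")            # minidom-backed by default
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<title>test</title><p>body")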

View File

@ -1,3 +1,4 @@
import warnings
from html5lib.constants import scopingElements, tableInsertModeElements from html5lib.constants import scopingElements, tableInsertModeElements
try: try:
frozenset frozenset
@ -11,9 +12,6 @@ except NameError:
# from "leaking" into tables, buttons, object elements, and marquees. # from "leaking" into tables, buttons, object elements, and marquees.
Marker = None Marker = None
#XXX - TODO; make the default interface more ElementTree-like
# rather than DOM-like
class Node(object): class Node(object):
def __init__(self, name): def __init__(self, name):
"""Node representing an item in the tree. """Node representing an item in the tree.
@ -43,7 +41,7 @@ class Node(object):
return "<%s>"%(self.name) return "<%s>"%(self.name)
def __repr__(self): def __repr__(self):
return "<%s %s>" % (self.__class__, self.name) return "<%s>" % (self.name)
def appendChild(self, node): def appendChild(self, node):
"""Insert node as a child of the current node """Insert node as a child of the current node
@ -112,7 +110,12 @@ class TreeBuilder(object):
#Fragment class #Fragment class
fragmentClass = None fragmentClass = None
def __init__(self): def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset() self.reset()
def reset(self): def reset(self):
@ -140,7 +143,8 @@ class TreeBuilder(object):
return True return True
elif node.name == "table": elif node.name == "table":
return False return False
elif not tableVariant and node.name in scopingElements: elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False return False
elif node.name == "html": elif node.name == "html":
return False return False
@ -179,7 +183,10 @@ class TreeBuilder(object):
clone = self.activeFormattingElements[i].cloneNode() clone = self.activeFormattingElements[i].cloneNode()
# Step 9 # Step 9
element = self.insertElement(clone.name, clone.attributes) element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10 # Step 10
self.activeFormattingElements[i] = element self.activeFormattingElements[i] = element
@ -207,21 +214,30 @@ class TreeBuilder(object):
return item return item
return False return False
def insertDoctype(self, name, publicId, systemId): def insertRoot(self, token):
doctype = self.doctypeClass(name) element = self.createElement(token)
doctype.publicId = publicId self.openElements.append(element)
doctype.systemId = systemId self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype) self.document.appendChild(doctype)
def insertComment(self, data, parent=None): def insertComment(self, token, parent=None):
if parent is None: if parent is None:
parent = self.openElements[-1] parent = self.openElements[-1]
parent.appendChild(self.commentClass(data)) parent.appendChild(self.commentClass(token["data"]))
def createElement(self, name, attributes): def createElement(self, token):
"""Create an element but don't insert it anywhere""" """Create an element but don't insert it anywhere"""
element = self.elementClass(name) name = token["name"]
element.attributes = attributes namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element return element
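To make the new calling convention concrete, a sketch of the token dict the builder methods now expect (the simpletree builder is used here purely for illustration):

    from html5lib.treebuilders import getTreeBuilder

    builder = getTreeBuilder("simpletree")(namespaceHTMLElements=True)
    token = {"type": "StartTag",
             "name": "div",
             "namespace": "http://www.w3.org/1999/xhtml",
             "data": {"class": "example"}}
    element = builder.createElement(token)   # create a node without inserting it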
def _getInsertFromTable(self): def _getInsertFromTable(self):
@ -238,19 +254,20 @@ class TreeBuilder(object):
insertFromTable = property(_getInsertFromTable, _setInsertFromTable) insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, name, attributes): def insertElementNormal(self, token):
element = self.elementClass(name) name = token["name"]
element.attributes = attributes namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element) self.openElements[-1].appendChild(element)
self.openElements.append(element) self.openElements.append(element)
return element return element
def insertElementTable(self, name, attributes): def insertElementTable(self, token):
"""Create an element and insert it into the tree""" """Create an element and insert it into the tree"""
element = self.elementClass(name) element = self.createElement(token)
element.attributes = attributes
if self.openElements[-1].name not in tableInsertModeElements: if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(name, attributes) return self.insertElementNormal(token)
else: else:
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
#special magic element rearranging #special magic element rearranging
@ -267,32 +284,32 @@ class TreeBuilder(object):
if parent is None: if parent is None:
parent = self.openElements[-1] parent = self.openElements[-1]
if (not(self.insertFromTable) or (self.insertFromTable and if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name not in self.openElements[-1].name
tableInsertModeElements)): not in tableInsertModeElements)):
parent.insertText(data) parent.insertText(data)
else: else:
#We should be in the InTable mode. This means we want to do # We should be in the InTable mode. This means we want to do
#special magic element rearranging # special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition() parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore) parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self): def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before """Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node""" (or None) when inserting a misnested table node"""
#The foster parent element is the one which comes before the most # The foster parent element is the one which comes before the most
#recently opened table element # recently opened table element
#XXX - this is really inelegant # XXX - this is really inelegant
lastTable=None lastTable=None
fosterParent = None fosterParent = None
insertBefore = None insertBefore = None
for elm in self.openElements[::-1]: for elm in self.openElements[::-1]:
if elm.name == u"table": if elm.name == "table":
lastTable = elm lastTable = elm
break break
if lastTable: if lastTable:
#XXX - we should really check that this parent is actually a # XXX - we should really check that this parent is actually a
#node here # node here
if lastTable.parent: if lastTable.parent:
fosterParent = lastTable.parent fosterParent = lastTable.parent
insertBefore = lastTable insertBefore = lastTable

View File

@ -1,203 +1,292 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
class AttrList: import _base
def __init__(self, element): from html5lib import constants, ihatexml
self.element = element from html5lib.constants import namespaces
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
def items(self):
return self.element.attributes.items()
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
class NodeBuilder(_base.Node): moduleCache = {}
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
self.element = element
def appendChild(self, node): def getDomModule(DomImplementation):
node.parent = self name = "_" + DomImplementation.__name__+"builder"
self.element.appendChild(node.element) if name in moduleCache:
return moduleCache[name]
def insertText(self, data, insertBefore=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
return self
def insertDoctype(self, name, publicId, systemId):
domimpl = minidom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if element.hasAttributes():
for name, value in element.attributes.items():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else: else:
attributes = dict(node.attributes.itemsNS()) mod = new.module(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
# gather namespace declarations def getDomBuilder(DomImplementation):
prefixes = [] Dom = DomImplementation
for attrname in node.attributes.keys(): infoset_filter = ihatexml.InfosetFilter()
attr = node.getAttributeNode(attrname) class AttrList:
if (attr.namespaceURI == XMLNS_NAMESPACE or def __init__(self, element):
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): self.element = element
prefix = (attr.localName != 'xmlns' and attr.localName or None) def __iter__(self):
handler.startPrefixMapping(prefix, attr.nodeValue) return self.element.attributes.items().__iter__()
prefixes.append(prefix) def __setitem__(self, name, value):
nsmap = nsmap.copy() self.element.setAttribute(infoset_filter.coerceAttribute(name),
nsmap[prefix] = attr.nodeValue infoset_filter.coerceCharacters(value))
del attributes[(attr.namespaceURI, attr.localName)] def items(self):
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
self.element.attributes.items()]
def keys(self):
return [infoset_filter.fromXmlName(item) for item in
self.element.attributes.keys()]
def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
# apply namespace declarations def __contains__(self, name):
for attrname in node.attributes.keys(): if isinstance(name, tuple):
attr = node.getAttributeNode(attrname) raise NotImplementedError
if attr.namespaceURI == None and ':' in attr.nodeName: else:
prefix = attr.nodeName.split(':')[0] return self.element.hasAttribute(infoset_filter.toXmlName(name))
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)] class NodeBuilder(_base.Node):
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue def __init__(self, element):
_base.Node.__init__(self, element.localName)
self.element = element
# SAX events namespace = property(lambda self:hasattr(self.element, "namespaceURI")
ns = node.namespaceURI or nsmap.get(None,None) and self.element.namespaceURI or None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: def appendChild(self, node):
handler.characters(node.nodeValue) node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" +
infoset_filter.coerceAttribute(
name[1]))
else:
qualifiedName = infoset_filter.coerceAttribute(
name[1])
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
infoset_filter.coerceAttribute(name), value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
elif node.nodeType == Node.DOCUMENT_NODE: def getNameTuple(self):
handler.startDocument() if self.namespace == None:
for child in node.childNodes: dom2sax(child, handler, nsmap) return namespaces["html"], self.name
handler.endDocument() else:
return self.namespace, self.name
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: nameTuple = property(getNameTuple)
for child in node.childNodes: dom2sax(child, handler, nsmap)
else: class TreeBuilder(_base.TreeBuilder):
# ATTRIBUTE_NODE def documentClass(self):
# ENTITY_NODE self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
# PROCESSING_INSTRUCTION_NODE return self
# COMMENT_NODE
# DOCUMENT_TYPE_NODE def insertDoctype(self, token):
# NOTATION_NODE name = token["name"]
pass publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=infoset_filter.coerceCharacters(data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None,
constants.namespaces["html"])):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
i = 0
attr = element.attributes.item(i)
while attr:
name = infoset_filter.fromXmlName(attr.localName)
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], name)
i += 1
attr = element.attributes.item(i)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value
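As a rough usage sketch (not part of this commit): the minidom-backed TreeBuilder above can be handed to html5lib's parser, and the resulting document replayed as SAX events via dom2sax. The TagCounter handler below is hypothetical, purely for illustration; html5parser, dom.TreeBuilder and dom2sax are used as defined elsewhere in this diff.
from xml.sax.handler import ContentHandler
from html5lib import html5parser
from html5lib.treebuilders import dom

class TagCounter(ContentHandler):   # hypothetical handler, for illustration only
    def __init__(self):
        self.count = 0
    def startElementNS(self, name, qname, attrs):
        self.count += 1            # one SAX start event per element node

parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
document = parser.parse("<p>Hello <b>world</b></p>")

handler = TagCounter()
dom.dom2sax(document, handler)
print handler.count                # counts html, head, body, p, b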

View File

@ -1,5 +1,12 @@
import _base
import new import new
import re
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {} moduleCache = {}
@ -17,20 +24,43 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
def getETreeBuilder(ElementTreeImplementation, fullTree=False): def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
class Element(_base.Node): class Element(_base.Node):
def __init__(self, name): def __init__(self, name, namespace=None):
self._element = ElementTree.Element(name) self._name = name
self.name = name self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None self.parent = None
self._childNodes = [] self._childNodes = []
self._flags = [] self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name): def _setName(self, name):
self._element.tag = name self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self): def _getName(self):
return self._element.tag return self._name
name = property(_getName, _setName) name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self): def _getAttributes(self):
return self._element.attrib return self._element.attrib
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
for key in self._element.attrib.keys(): for key in self._element.attrib.keys():
del self._element.attrib[key] del self._element.attrib[key]
for key, value in attributes.iteritems(): for key, value in attributes.iteritems():
self._element.set(key, value) if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self): def _getChildNodes(self):
return self._childNodes return self._childNodes
def _setChildNodes(self, value): def _setChildNodes(self, value):
del self._element[:] del self._element[:]
self._childNodes = [] self._childNodes = []
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
data = property(_getData, _setData) data = property(_getData, _setData)
class DocumentType(Element): class DocumentType(Element):
def __init__(self, name): def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>") Element.__init__(self, "<!DOCTYPE>")
self._element.text = name self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self): def _getPublicId(self):
return self._element.get(u"publicId", None) return self._element.get(u"publicId", "")
def _setPublicId(self, value): def _setPublicId(self, value):
if value is not None: if value is not None:
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
publicId = property(_getPublicId, _setPublicId) publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self): def _getSystemId(self):
return self._element.get(u"systemId", None) return self._element.get(u"systemId", "")
def _setSystemId(self, value): def _setSystemId(self, value):
if value is not None: if value is not None:
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if not(hasattr(element, "tag")): if not(hasattr(element, "tag")):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text)) if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>": elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document") rv.append("#document")
if element.text: if element.text:
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
elif type(element.tag) == type(ElementTree.Comment): elif type(element.tag) == type(ElementTree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else: else:
rv.append("|%s<%s>"%(' '*indent, element.tag)) nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
if prefix != "html":
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"): if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems(): for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text: if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@ -201,12 +257,19 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element): def serializeElement(element):
if type(element) == type(ElementTree.ElementTree): if type(element) == type(ElementTree.ElementTree):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
rv.append("<!DOCTYPE %s>"%(element.text,)) if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>": elif element.tag == "<DOCUMENT_ROOT>":
if element.text: if element.text:
rv.append(element.text) rv.append(element.text)
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else: else:
#This is assumed to be an ordinary element #This is assumed to be an ordinary element
if not element.attrib: if not element.attrib:
rv.append("<%s>"%(element.tag,)) rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else: else:
attr = " ".join(["%s=\"%s\""%(name, value) attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()]) for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr)) rv.append("<%s %s>"%(element.tag, attr))
if element.text: if element.text:

View File

@ -0,0 +1,331 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
#Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent+2)
next_element = next_element.getnext()
elif isinstance(element, basestring):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
if prefix != "html":
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
nsmatch = etree_builders.tag_regexp.match(name)
if nsmatch:
ns = nsmatch.group(1)
name = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
prefix,
filter.fromXmlName(name),
value))
else:
rv.append('|%s%s="%s"' % (' '*(indent+2),
filter.fromXmlName(name),
value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, namespaceHTMLElements, fullTree = False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
self._element = element
dict.__init__(self, value)
for key, value in self.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = filter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = filter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = filter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = builder.Comment
#self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlBMPRegexp.search(name):
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
def insertRoot(self, token):
"""Create the document root"""
#Because of the way libxml2 works, it doesn't seem to be possible to
#alter information like the doctype after the tree has been parsed.
#Therefore we need to use the built-in parser to create our initial
#tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name:
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
#TODO - this needs to work when elements are not put into the default ns
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
try:
root = etree.fromstring(docStr)
except etree.XMLSyntaxError:
print docStr
raise
#Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
#Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
#Add the root element to the internal child/open data structures
namespace = token.get("namespace", None)
root_element = self.elementClass(token["name"], namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
#Reset to the default insert comment function
self.insertComment = super(TreeBuilder, self).insertComment
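For illustration (the doctype values below are hypothetical), this re-runs the docStr construction from insertRoot above to show the string that gets handed to lxml's own parser before normal tree building continues.
name = "html"
publicId = "-//W3C//DTD XHTML 1.0 Strict//EN"
systemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"

docStr = "<!DOCTYPE %s" % name
if publicId is not None or systemId is not None:
    docStr += ' PUBLIC "%s" "%s"' % (publicId or "", systemId or "")
docStr += ">"
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"

print docStr   # prints (wrapped here for readability):
# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
#   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
#   <html xmlns='http://www.w3.org/1999/xhtml'></html>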

View File

@ -1,5 +1,5 @@
import _base import _base
from html5lib.constants import voidElements from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing # Really crappy basic implementation of a DOM-core like thing
@ -63,6 +63,8 @@ class Node(_base.Node):
def cloneNode(self): def cloneNode(self):
newNode = type(self)(self.name) newNode = type(self)(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'): if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems(): for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value newNode.attributes[attr] = value
@ -73,6 +75,14 @@ class Node(_base.Node):
"""Return true if the node has children or text""" """Return true if the node has children or text"""
return bool(self.childNodes) return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node): class Document(Node):
type = 1 type = 1
def __init__(self): def __init__(self):
@ -81,6 +91,9 @@ class Document(Node):
def __unicode__(self): def __unicode__(self):
return "#document" return "#document"
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"): def toxml(self, encoding="utf=8"):
result = "" result = ""
for child in self.childNodes: for child in self.childNodes:
@ -106,13 +119,21 @@ class DocumentFragment(Document):
class DocumentType(Node): class DocumentType(Node):
type = 3 type = 3
def __init__(self, name): def __init__(self, name, publicId, systemId):
Node.__init__(self, name) Node.__init__(self, name)
self.publicId = u"" self.publicId = publicId
self.systemId = u"" self.systemId = systemId
def __unicode__(self): def __unicode__(self):
return u"<!DOCTYPE %s>" % self.name if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__ toxml = __unicode__
@ -135,12 +156,16 @@ class TextNode(Node):
class Element(Node): class Element(Node):
type = 5 type = 5
def __init__(self, name): def __init__(self, name, namespace=None):
Node.__init__(self, name) Node.__init__(self, name)
self.namespace = namespace
self.attributes = {} self.attributes = {}
def __unicode__(self): def __unicode__(self):
return u"<%s>" % self.name if self.namespace in (None, namespaces["html"]):
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self): def toxml(self):
result = '<' + self.name result = '<' + self.name
@ -174,6 +199,8 @@ class Element(Node):
indent += 2 indent += 2
if self.attributes: if self.attributes:
for name, value in self.attributes.iteritems(): for name, value in self.attributes.iteritems():
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value) tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes: for child in self.childNodes:
tree += child.printTree(indent) tree += child.printTree(indent)

View File

@ -1,6 +1,9 @@
import warnings
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object): class AttrList(object):
def __init__(self, element): def __init__(self, element):
@ -22,22 +25,39 @@ class AttrList(object):
class Element(_base.Node): class Element(_base.Node):
def __init__(self, element, soup): def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name) _base.Node.__init__(self, element.name)
self.element = element self.element = element
self.soup=soup self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node): def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString): and self.element.contents[-1].__class__ == NavigableString):
newNode = TextNode(NavigableString( # Concatenate new text onto old text node
self.element.contents[-1]+node.element), self.soup) # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
self.element.contents[-1].extract() newStr = NavigableString(self.element.contents[-1]+node.element)
self.appendChild(newNode)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else: else:
self.element.insert(len(self.element.contents), node.element) self.element.insert(len(self.element.contents), node.element)
node.parent = self node.parent = self
def getAttributes(self): def getAttributes(self):
return AttrList(self.element) return AttrList(self.element)
@ -56,18 +76,25 @@ class Element(_base.Node):
self.appendChild(text) self.appendChild(text)
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
index = self.element.contents.index(refNode.element) index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString): and self.element.contents[index-1].__class__ == NavigableString):
newNode = TextNode(NavigableString( # (See comments in appendChild)
self.element.contents[index-1]+node.element), self.soup) newStr = NavigableString(self.element.contents[index-1]+node.element)
self.element.contents[index-1].extract() oldNode = self.element.contents[index-1]
self.insertBefore(newNode, refNode) del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else: else:
self.element.insert(index, node.element) self.element.insert(index, node.element)
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract() node.element.extract()
node.parent = None node.parent = None
@ -76,12 +103,12 @@ class Element(_base.Node):
child = self.element.contents[0] child = self.element.contents[0]
child.extract() child.extract()
if isinstance(child, Tag): if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup)) newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else: else:
newParent.appendChild(TextNode(child, self.soup)) newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self): def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup) node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes: for key,value in self.attributes:
node.attributes[key] = value node.attributes[key] = value
return node return node
@ -89,11 +116,19 @@ class Element(_base.Node):
def hasContent(self): def hasContent(self):
return self.element.contents return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element): class TextNode(Element):
def __init__(self, element, soup): def __init__(self, element, soup):
_base.Node.__init__(self, None) _base.Node.__init__(self, None)
self.element = element self.element = element
self.soup=soup self.soup = soup
def cloneNode(self): def cloneNode(self):
raise NotImplementedError raise NotImplementedError
@ -101,13 +136,25 @@ class TextNode(Element):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.soup = BeautifulSoup("") self.soup = BeautifulSoup("")
return Element(self.soup, self.soup) return Element(self.soup, self.soup, None)
def insertDoctype(self, name, publicId, systemId): def insertDoctype(self, token):
self.soup.insert(0, Declaration(name)) name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration(name))
def elementClass(self, name): def elementClass(self, name, namespace):
return Element(Tag(self.soup, name), self.soup) if namespace not in (None, namespaces["html"]):
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data): def commentClass(self, data):
return TextNode(Comment(data), self.soup) return TextNode(Comment(data), self.soup)
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
def fragmentClass(self): def fragmentClass(self):
self.soup = BeautifulSoup("") self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]" self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup) return Element(self.soup, self.soup, None)
def appendChild(self, node): def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element) self.soup.insert(len(self.soup.contents), node.element)
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element return _base.TreeBuilder.getFragment(self).element
def testSerializer(element): def testSerializer(element):
import re
rv = [] rv = []
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if isinstance(element, Declaration): if isinstance(element, Declaration):
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string)) doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup): elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]": if element.name == "[document_fragment]":
rv.append("#document-fragment") rv.append("#document-fragment")

View File

@ -21,18 +21,24 @@ class TreeWalker(object):
attrs = attrs.items() attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs] return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False): def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), \ yield {"type": "EmptyTag", "name": unicode(name),
"data": self.normalizeAttrs(attrs)} "namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren: if hasChildren:
yield self.error(_("Void element has children")) yield self.error(_("Void element has children"))
def startTag(self, name, attrs): def startTag(self, namespace, name, attrs):
return {"type": "StartTag", "name": unicode(name), \ return {"type": "StartTag",
"data": self.normalizeAttrs(attrs)} "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, name): def endTag(self, namespace, name):
return {"type": "EndTag", "name": unicode(name), "data": []} return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": []}
def text(self, data): def text(self, data):
data = unicode(data) data = unicode(data)
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node): def walkChildren(self, node):
raise NodeImplementedError raise NodeImplementedError
def element(self, node, name, attrs, hasChildren): def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, attrs, hasChildren): for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token yield token
else: else:
yield self.startTag(name, attrs) yield self.startTag(name, attrs)
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode) details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:] type, details = details[0], details[1:]
hasChildren = False hasChildren = False
endTag = None
if type == DOCTYPE: if type == DOCTYPE:
yield self.doctype(*details) yield self.doctype(*details)
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
yield token yield token
elif type == ELEMENT: elif type == ELEMENT:
name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, attributes, hasChildren): for token in self.emptyTag(namespace, name, attributes, hasChildren):
yield token yield token
hasChildren = False hasChildren = False
else: else:
yield self.startTag(name, attributes) endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT: elif type == COMMENT:
yield self.comment(details[0]) yield self.comment(details[0])
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode) details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:] type, details = details[0], details[1:]
if type == ELEMENT: if type == ELEMENT:
name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name not in voidElements: if name not in voidElements:
yield self.endTag(name) yield self.endTag(namespace, name)
nextSibling = self.getNextSibling(currentNode) nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None: if nextSibling is not None:
currentNode = nextSibling currentNode = nextSibling
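A brief sketch (not part of the commit) of the token shapes a tree walker emits after this change. Tree and walker names are the ones used elsewhere in this diff; the exact attribute ordering and namespace value depend on parser defaults.
import html5lib
from html5lib import treewalkers

parser = html5lib.HTMLParser()                  # default simpletree tree builder
doc = parser.parse("<p class='x'>hi<br></p>")
walker = treewalkers.getTreeWalker("simpletree")
for token in walker(doc):
    print token
# StartTag/EndTag/EmptyTag tokens now carry a "namespace" entry, e.g.
# {'type': 'StartTag', 'name': u'p',
#  'namespace': u'http://www.w3.org/1999/xhtml',
#  'data': [(u'class', u'x')]}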

View File

@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE: elif node.nodeType == Node.ELEMENT_NODE:
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes return (_base.ELEMENT, node.namespaceURI, node.nodeName,
node.attributes.items(), node.hasChildNodes)
elif node.nodeType == Node.COMMENT_NODE: elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue return _base.COMMENT, node.nodeValue

View File

@ -3,10 +3,13 @@ _ = gettext.gettext
import new import new
import copy import copy
import re
import _base import _base
from html5lib.constants import voidElements from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {} moduleCache = {}
def getETreeModule(ElementTreeImplementation): def getETreeModule(ElementTreeImplementation):
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
to avoid using recursion, returns "nodes" as tuples with the following to avoid using recursion, returns "nodes" as tuples with the following
content: content:
1. An Element node serving as *context* (it cannot be called the parent
node due to the particular ``tail`` text nodes.
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
3. A list used as a stack of all ancestor *context nodes*. It is a
pair tuple whose first item is an Element and second item is a child
index.
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
""" """
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node elt, key, parents, flag = node
if key in ("text", "tail"): if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, key) return _base.TEXT, getattr(elt, flag)
else: else:
node = elt[int(key)] node = elt
if not(hasattr(node, "tag")): if not(hasattr(node, "tag")):
node = node.getroot() node = node.getroot()
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>": elif node.tag == "<!DOCTYPE>":
return _base.DOCTYPE, node.text return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment): elif type(node.tag) == type(ElementTree.Comment):
return _base.COMMENT, node.text return _base.COMMENT, node.text
else: else:
#This is assumed to be an ordinary element #This is assumed to be an ordinary element
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, tag,
node.attrib.items(), len(node) or node.text)
def getFirstChild(self, node): def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Element if isinstance(node, tuple):
elt, key, parents = node element, key, parents, flag = node
assert key not in ("text", "tail"), "Text nodes have no children"
parents.append((elt, int(key)))
node = elt[int(key)]
else: else:
parents = [] element, key, parents, flag = node, None, [], None
assert len(node) or node.text, "Node has no children" if flag in ("text", "tail"):
if node.text: return None
return (node, "text", parents)
else: else:
return (node, 0, parents) if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node): def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node) if isinstance(node, tuple):
element, key, parents, flag = node
elt, key, parents = node
if key == "text":
key = -1
elif key == "tail":
elt, key = parents.pop()
else:
# Look for "tail" of the "revisited" node
child = elt[key]
if child.tail:
parents.append((elt, key))
return (child, "tail", parents)
# case where key were "text" or "tail" or elt[key] had a tail
key += 1
if len(elt) > key:
return (elt, key, parents)
else: else:
return None return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node): def getParentNode(self, node):
assert isinstance(node, tuple) if isinstance(node, tuple):
elt, key, parents = node element, key, parents, flag = node
if parents:
elt, key = parents.pop()
return elt, key, parents
else: else:
# HACK: We could return ``elt`` but None will stop the algorithm the same way
return None return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals() return locals()
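As a rough usage sketch (not part of the commit), assuming getTreeWalker accepts the ElementTree implementation module for the "etree" tree type, mirroring getTreeBuilder:
import xml.etree.ElementTree as ElementTree
import html5lib
from html5lib import treewalkers

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree", ElementTree))
doc = parser.parse("<p>one<br>two</p>")
walker = treewalkers.getTreeWalker("etree", ElementTree)
for token in walker(doc):
    # prints the token type plus either the tag name or the character data
    print token["type"], token.get("name") or repr(token.get("data"))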

View File

@ -1,4 +1,4 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \ from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener from genshi.output import NamespaceFlattener
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
depth = 0 depth = 0
ignore_until = None ignore_until = None
previous = None previous = None
for event in NamespaceFlattener(prefixes={ for event in self.tree:
'http://www.w3.org/1999/xhtml': ''
})(self.tree):
if previous is not None: if previous is not None:
if previous[0] == START: if previous[0] == START:
depth += 1 depth += 1
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
kind, data, pos = event kind, data, pos = event
if kind == START: if kind == START:
tag, attrib = data tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements: if tag in voidElements:
for token in self.emptyTag(tag, list(attrib), \ for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END or next[1] != tag): not next or next[0] != END
or next[1] != tag):
yield token yield token
else: else:
yield self.startTag(tag, list(attrib)) yield self.startTag(namespace, name, list(attrib))
elif kind == END: elif kind == END:
if data not in voidElements: name = data.localname
yield self.endTag(data) namespace = data.namespace
if (namespace, name) not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT: elif kind == COMMENT:
yield self.comment(data) yield self.comment(data)
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE: elif kind == DOCTYPE:
yield self.doctype(*data) yield self.doctype(*data)
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \ elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI): START_CDATA, END_CDATA, PI):
pass pass

View File

@ -0,0 +1,175 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name,
et.docinfo.public_id,
et.docinfo.system_url))
root = et.getroot()
node = root
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = self.obj.text
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail
else:
self.tail = None
self.isstring = isinstance(obj, basestring)
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __nonzero__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key)
elif isinstance(node, Root):
return (_base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
[(self.filter.fromXmlName(name), value) for
name,value in node.attrib.iteritems()],
len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return node.tail and (node, "tail") or node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
return node
# else: fallback to "normal" processing
return node.getparent()

View File

@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
type, node = event type, node = event
if type == START_ELEMENT: if type == START_ELEMENT:
name = node.nodeName name = node.nodeName
namespace = node.namespaceURI
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, \ for token in self.emptyTag(namespace,
node.attributes.items(), not next or next[1] is not node): name,
node.attributes.items(),
not next or next[1] is not node):
yield token yield token
else: else:
yield self.startTag(name, node.attributes.items()) yield self.startTag(namespace, name, node.attributes.items())
elif type == END_ELEMENT: elif type == END_ELEMENT:
name = node.nodeName name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements: if name not in voidElements:
yield self.endTag(name) yield self.endTag(namespace, name)
elif type == COMMENT: elif type == COMMENT:
yield self.comment(node.nodeValue) yield self.comment(node.nodeValue)

View File

@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.value return _base.TEXT, node.value
elif node.type == 5: # Element elif node.type == 5: # Element
return _base.ELEMENT, node.name, \ return (_base.ELEMENT, node.namespace, node.name,
node.attributes.items(), node.hasContent() node.attributes.items(), node.hasContent())
elif node.type == 6: # CommentNode elif node.type == 6: # CommentNode
return _base.COMMENT, node.data return _base.COMMENT, node.data

View File

@ -1,3 +1,4 @@
import re
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base import _base
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType elif isinstance(node, Declaration): # DocumentType
#Slice needed to remove markup added during unicode conversion string = unicode(node.string)
return _base.DOCTYPE, unicode(node.string)[2:-1] #Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the separate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment): elif isinstance(node, Comment):
return _base.COMMENT, unicode(node.string)[4:-3] string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode elif isinstance(node, unicode): # TextNode
return _base.TEXT, node return _base.TEXT, node
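To make the regexp's behaviour concrete, here is a small standalone check (not part of the commit); the DOCTYPE string is a made-up example and the pattern is restated so the snippet is self-contained.
import re
doctype_regexp = re.compile(
    r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
m = doctype_regexp.match(
    'html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"')
print m.group('name')       # html
print m.group('publicId')   # -//W3C//DTD HTML 4.01//EN
print m.group('systemId1')  # http://www.w3.org/TR/html4/strict.dtd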

View File

@ -34,3 +34,123 @@ class MethodDispatcher(dict):
def __getitem__(self, key): def __getitem__(self, key):
return dict.get(self, key, self.default) return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
def __init__(self, iterable=(), maxsize=-1):
if not hasattr(self, 'data'):
self.left = self.right = 0
self.data = {}
self.maxsize = maxsize
self.extend(iterable)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self):
self.data.clear()
self.left = self.right = 0
def extend(self, iterable):
for elem in iterable:
self.append(elem)
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1):
if self:
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def __getitem__(self, i):
if i < 0:
i += len(self)
try:
return self.data[i + self.left]
except KeyError:
raise IndexError
def __setitem__(self, i, value):
if i < 0:
i += len(self)
try:
self.data[i + self.left] = value
except KeyError:
raise IndexError
def __delitem__(self, i):
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
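A quick sanity-check sketch of the fallback deque above (assumes the class is in scope; collections.deque behaves the same way for these calls):
d = deque("abc", maxsize=4)
d.append("d")                # now a, b, c, d (at the size limit)
d.appendleft("z")            # exceeds maxsize, so the right end ("d") is dropped
print list(d)                # ['z', 'a', 'b', 'c']
print d.popleft(), d.pop()   # z c
d.rotate(1)                  # move the rightmost item to the front
print list(d)                # ['b', 'a']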

View File

@ -1,6 +1,6 @@
<!-- <!--
Description: illegal control character Description: illegal control character
Expect: content[0].value == u'Page 1\ufffdPage 2' Expect: content[0].value == u'Page 1 Page 2'
--> -->
<feed xmns="http://www.w3.org/2005/Atom"> <feed xmns="http://www.w3.org/2005/Atom">