diff --git a/planet/reconstitute.py b/planet/reconstitute.py index 8607f95..f3f1a22 100644 --- a/planet/reconstitute.py +++ b/planet/reconstitute.py @@ -16,7 +16,7 @@ Todo: import re, time, sgmllib from xml.sax.saxutils import escape from xml.dom import minidom, Node -from html5lib import liberalxmlparser +from html5lib import html5parser from html5lib.treebuilders import dom import planet, config @@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo): bozo=1 if detail.type.find('xhtml')<0 or bozo: - parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder) + parser = html5parser.HTMLParser(tree=dom.TreeBuilder) html = parser.parse(xdiv % detail.value, encoding="utf-8") for body in html.documentElement.childNodes: if body.nodeType != Node.ELEMENT_NODE: continue diff --git a/planet/scrub.py b/planet/scrub.py index 9d48753..6d98a98 100644 --- a/planet/scrub.py +++ b/planet/scrub.py @@ -128,5 +128,11 @@ def scrub(feed_uri, data): node['value'] = feedparser._resolveRelativeURIs( node.value, node.base, 'utf-8', node.type) - node['value'] = feedparser._sanitizeHTML( - node.value, 'utf-8', node.type) + # Run this through HTML5's serializer + from html5lib import html5parser, sanitizer, treewalkers, serializer + p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer) + doc = p.parseFragment(node.value, encoding='utf-8') + walker = treewalkers.getTreeWalker('simpletree') + xhtml = serializer.XHTMLSerializer() + tree = xhtml.serialize(walker(doc), encoding='utf-8') + node['value'] = ''.join([n for n in tree]) diff --git a/planet/vendor/html5lib/__init__.py b/planet/vendor/html5lib/__init__.py index 4dbcb69..7a20994 100644 --- a/planet/vendor/html5lib/__init__.py +++ b/planet/vendor/html5lib/__init__.py @@ -11,5 +11,6 @@ f = open("my_document.html") p = html5lib.HTMLParser() tree = p.parse(f) """ -from html5parser import HTMLParser -from liberalxmlparser import XMLParser, XHTMLParser +from html5parser import HTMLParser, parse +from treebuilders import 
getTreeBuilder +from serializer import serialize diff --git a/planet/vendor/html5lib/constants.py b/planet/vendor/html5lib/constants.py index 459098f..c9f5883 100644 --- a/planet/vendor/html5lib/constants.py +++ b/planet/vendor/html5lib/constants.py @@ -1,4 +1,5 @@ -import string +import string, gettext +_ = gettext.gettext try: frozenset @@ -9,6 +10,260 @@ except NameError: EOF = None +E = { + "null-character": + _(u"Null character in input stream, replaced with U+FFFD."), + "invalid-character": + _(u"Invalid codepoint in stream."), + "incorrectly-placed-solidus": + _(u"Solidus (/) incorrectly placed in tag."), + "incorrect-cr-newline-entity": + _(u"Incorrect CR newline entity, replaced with LF."), + "illegal-windows-1252-entity": + _(u"Entity used with illegal number (windows-1252 reference)."), + "cant-convert-numeric-entity": + _(u"Numeric entity couldn't be converted to character " + u"(codepoint U+%(charAsInt)08x)."), + "illegal-codepoint-for-numeric-entity": + _(u"Numeric entity represents an illegal codepoint: " + u"U+%(charAsInt)08x."), + "numeric-entity-without-semicolon": + _(u"Numeric entity didn't end with ';'."), + "expected-numeric-entity-but-got-eof": + _(u"Numeric entity expected. Got end of file instead."), + "expected-numeric-entity": + _(u"Numeric entity expected but none found."), + "named-entity-without-semicolon": + _(u"Named entity didn't end with ';'."), + "expected-named-entity": + _(u"Named entity expected. Got none."), + "attributes-in-end-tag": + _(u"End tag contains unexpected attributes."), + "expected-tag-name-but-got-right-bracket": + _(u"Expected tag name. Got '>' instead."), + "expected-tag-name-but-got-question-mark": + _(u"Expected tag name. Got '?' instead. (HTML doesn't " + u"support processing instructions.)"), + "expected-tag-name": + _(u"Expected tag name. Got something else instead"), + "expected-closing-tag-but-got-right-bracket": + _(u"Expected closing tag. Got '>' instead. 
Ignoring '>'."), + "expected-closing-tag-but-got-eof": + _(u"Expected closing tag. Unexpected end of file."), + "expected-closing-tag-but-got-char": + _(u"Expected closing tag. Unexpected character '%(data)s' found."), + "eof-in-tag-name": + _(u"Unexpected end of file in the tag name."), + "expected-attribute-name-but-got-eof": + _(u"Unexpected end of file. Expected attribute name instead."), + "eof-in-attribute-name": + _(u"Unexpected end of file in attribute name."), + "invalid-character-in-attribute-name": + _(u"Invalid chracter in attribute name"), + "duplicate-attribute": + _(u"Dropped duplicate attribute on tag."), + "expected-end-of-tag-name-but-got-eof": + _(u"Unexpected end of file. Expected = or end of tag."), + "expected-attribute-value-but-got-eof": + _(u"Unexpected end of file. Expected attribute value."), + "expected-attribute-value-but-got-right-bracket": + _(u"Expected attribute value. Got '>' instead."), + "eof-in-attribute-value-double-quote": + _(u"Unexpected end of file in attribute value (\")."), + "eof-in-attribute-value-single-quote": + _(u"Unexpected end of file in attribute value (')."), + "eof-in-attribute-value-no-quotes": + _(u"Unexpected end of file in attribute value."), + "unexpected-EOF-after-solidus-in-tag": + _(u"Unexpected end of file in tag. Expected >"), + "unexpected-character-after-soldius-in-tag": + _(u"Unexpected character after / in tag. Expected >"), + "expected-dashes-or-doctype": + _(u"Expected '--' or 'DOCTYPE'. 
Not found."), + "incorrect-comment": + _(u"Incorrect comment."), + "eof-in-comment": + _(u"Unexpected end of file in comment."), + "eof-in-comment-end-dash": + _(u"Unexpected end of file in comment (-)"), + "unexpected-dash-after-double-dash-in-comment": + _(u"Unexpected '-' after '--' found in comment."), + "eof-in-comment-double-dash": + _(u"Unexpected end of file in comment (--)."), + "unexpected-char-in-comment": + _(u"Unexpected character in comment found."), + "need-space-after-doctype": + _(u"No space after literal string 'DOCTYPE'."), + "expected-doctype-name-but-got-right-bracket": + _(u"Unexpected > character. Expected DOCTYPE name."), + "expected-doctype-name-but-got-eof": + _(u"Unexpected end of file. Expected DOCTYPE name."), + "eof-in-doctype-name": + _(u"Unexpected end of file in DOCTYPE name."), + "eof-in-doctype": + _(u"Unexpected end of file in DOCTYPE."), + "expected-space-or-right-bracket-in-doctype": + _(u"Expected space or '>'. Got '%(data)s'"), + "unexpected-end-of-doctype": + _(u"Unexpected end of DOCTYPE."), + "unexpected-char-in-doctype": + _(u"Unexpected character in DOCTYPE."), + "eof-in-innerhtml": + _(u"XXX innerHTML EOF"), + "unexpected-doctype": + _(u"Unexpected DOCTYPE. Ignored."), + "non-html-root": + _(u"html needs to be the first start tag."), + "expected-doctype-but-got-eof": + _(u"Unexpected End of file. Expected DOCTYPE."), + "unknown-doctype": + _(u"Erroneous DOCTYPE."), + "expected-doctype-but-got-chars": + _(u"Unexpected non-space characters. Expected DOCTYPE."), + "expected-doctype-but-got-start-tag": + _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."), + "expected-doctype-but-got-end-tag": + _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."), + "end-tag-after-implied-root": + _(u"Unexpected end tag (%(name)s) after the (implied) root element."), + "expected-named-closing-tag-but-got-eof": + _(u"Unexpected end of file. 
Expected end tag (%(name)s)."), + "two-heads-are-not-better-than-one": + _(u"Unexpected start tag head in existing head. Ignored."), + "unexpected-end-tag": + _(u"Unexpected end tag (%(name)s). Ignored."), + "unexpected-start-tag-out-of-my-head": + _(u"Unexpected start tag (%(name)s) that can be in head. Moved."), + "unexpected-start-tag": + _(u"Unexpected start tag (%(name)s)."), + "missing-end-tag": + _(u"Missing end tag (%(name)s)."), + "missing-end-tags": + _(u"Missing end tags (%(name)s)."), + "unexpected-start-tag-implies-end-tag": + _(u"Unexpected start tag (%(startName)s) " + u"implies end tag (%(endName)s)."), + "unexpected-start-tag-treated-as": + _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "deprecated-tag": + _(u"Unexpected start tag %(name)s. Don't use it!"), + "unexpected-start-tag-ignored": + _(u"Unexpected start tag %(name)s. Ignored."), + "expected-one-end-tag-but-got-another": + _(u"Unexpected end tag (%(gotName)s). " + u"Missing end tag (%(expectedName)s)."), + "end-tag-too-early": + _(u"End tag (%(name)s) seen too early. Expected other end tag."), + "end-tag-too-early-named": + _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "end-tag-too-early-ignored": + _(u"End tag (%(name)s) seen too early. Ignored."), + "adoption-agency-1.1": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 1 of the adoption agency algorithm."), + "adoption-agency-1.2": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 2 of the adoption agency algorithm."), + "adoption-agency-1.3": + _(u"End tag (%(name)s) violates step 1, " + u"paragraph 3 of the adoption agency algorithm."), + "unexpected-end-tag-treated-as": + _(u"Unexpected end tag (%(originalName)s). 
Treated as %(newName)s."), + "no-end-tag": + _(u"This element (%(name)s) has no end tag."), + "unexpected-implied-end-tag-in-table": + _(u"Unexpected implied end tag (%(name)s) in the table phase."), + "unexpected-implied-end-tag-in-table-body": + _(u"Unexpected implied end tag (%(name)s) in the table body phase."), + "unexpected-char-implies-table-voodoo": + _(u"Unexpected non-space characters in " + u"table context caused voodoo mode."), + "unexpected-hidden-input-in-table": + _(u"Unexpected input with type hidden in table context."), + "unexpected-start-tag-implies-table-voodoo": + _(u"Unexpected start tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-end-tag-implies-table-voodoo": + _(u"Unexpected end tag (%(name)s) in " + u"table context caused voodoo mode."), + "unexpected-cell-in-table-body": + _(u"Unexpected table cell start tag (%(name)s) " + u"in the table body phase."), + "unexpected-cell-end-tag": + _(u"Got table cell end tag (%(name)s) " + u"while required end tags are missing."), + "unexpected-end-tag-in-table-body": + _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "unexpected-implied-end-tag-in-table-row": + _(u"Unexpected implied end tag (%(name)s) in the table row phase."), + "unexpected-end-tag-in-table-row": + _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "unexpected-select-in-select": + _(u"Unexpected select start tag in the select phase " + u"treated as select end tag."), + "unexpected-input-in-select": + _(u"Unexpected input start tag in the select phase."), + "unexpected-start-tag-in-select": + _(u"Unexpected start tag token (%(name)s in the select phase. " + u"Ignored."), + "unexpected-end-tag-in-select": + _(u"Unexpected end tag (%(name)s) in the select phase. 
Ignored."), + "unexpected-table-element-start-tag-in-select-in-table": + _(u"Unexpected table element start tag (%(name)s) in the select in table phase."), + "unexpected-table-element-end-tag-in-select-in-table": + _(u"Unexpected table element end tag (%(name)s) in the select in table phase."), + "unexpected-char-after-body": + _(u"Unexpected non-space characters in the after body phase."), + "unexpected-start-tag-after-body": + _(u"Unexpected start tag token (%(name)s)" + u" in the after body phase."), + "unexpected-end-tag-after-body": + _(u"Unexpected end tag token (%(name)s)" + u" in the after body phase."), + "unexpected-char-in-frameset": + _(u"Unepxected characters in the frameset phase. Characters ignored."), + "unexpected-start-tag-in-frameset": + _(u"Unexpected start tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-frameset-in-frameset-innerhtml": + _(u"Unexpected end tag token (frameset) " + u"in the frameset phase (innerHTML)."), + "unexpected-end-tag-in-frameset": + _(u"Unexpected end tag token (%(name)s)" + u" in the frameset phase. Ignored."), + "unexpected-char-after-frameset": + _(u"Unexpected non-space characters in the " + u"after frameset phase. Ignored."), + "unexpected-start-tag-after-frameset": + _(u"Unexpected start tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-frameset": + _(u"Unexpected end tag (%(name)s)" + u" in the after frameset phase. Ignored."), + "unexpected-end-tag-after-body-innerhtml": + _(u"Unexpected end tag after body(innerHtml)"), + "expected-eof-but-got-char": + _(u"Unexpected non-space characters. Expected end of file."), + "expected-eof-but-got-start-tag": + _(u"Unexpected start tag (%(name)s)" + u". Expected end of file."), + "expected-eof-but-got-end-tag": + _(u"Unexpected end tag (%(name)s)" + u". Expected end of file."), + "eof-in-table": + _(u"Unexpected end of file. Expected table content."), + "eof-in-select": + _(u"Unexpected end of file. 
Expected select content."), + "eof-in-frameset": + _(u"Unexpected end of file. Expected frameset content."), + "non-void-element-with-trailing-solidus": + _(u"Trailing solidus not allowed on element %(name)s"), + "unexpected-html-element-in-foreign-content": + _(u"Element %(name)s not allowed in a non-html context"), + "XXX-undefined-error": + (u"Undefined error (this sucks and should be fixed)"), +} + contentModelFlags = { "PCDATA":0, "RCDATA":1, @@ -16,101 +271,126 @@ contentModelFlags = { "PLAINTEXT":3 } +namespaces = { + "html":"http://www.w3.org/1999/xhtml", + "mathml":"http://www.w3.org/1998/Math/MathML", + "svg":"http://www.w3.org/2000/svg", + "xlink":"http://www.w3.org/1999/xlink", + "xml":"http://www.w3.org/XML/1998/namespace", + "xmlns":"http://www.w3.org/2000/xmlns/" +} + scopingElements = frozenset(( - "button", - "caption", - "html", - "marquee", - "object", - "table", - "td", - "th" + (namespaces["html"], "applet"), + (namespaces["html"], "button"), + (namespaces["html"], "caption"), + (namespaces["html"], "html"), + (namespaces["html"], "marquee"), + (namespaces["html"], "object"), + (namespaces["html"], "table"), + (namespaces["html"], "td"), + (namespaces["html"], "th"), + (namespaces["svg"], "foreignObject") )) formattingElements = frozenset(( - "a", - "b", - "big", - "em", - "font", - "i", - "nobr", - "s", - "small", - "strike", - "strong", - "tt", - "u" + (namespaces["html"], "a"), + (namespaces["html"], "b"), + (namespaces["html"], "big"), + (namespaces["html"], "code"), + (namespaces["html"], "em"), + (namespaces["html"], "font"), + (namespaces["html"], "i"), + (namespaces["html"], "nobr"), + (namespaces["html"], "s"), + (namespaces["html"], "small"), + (namespaces["html"], "strike"), + (namespaces["html"], "strong"), + (namespaces["html"], "tt"), + (namespaces["html"], "u") )) specialElements = frozenset(( - "address", - "area", - "base", - "basefont", - "bgsound", - "blockquote", - "body", - "br", - "center", - "col", - "colgroup", - "dd", - 
"dir", - "div", - "dl", - "dt", - "embed", - "fieldset", - "form", - "frame", - "frameset", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "hr", - "iframe", - "image", - "img", - "input", - "isindex", - "li", - "link", - "listing", - "menu", - "meta", - "noembed", - "noframes", - "noscript", - "ol", - "optgroup", - "option", - "p", - "param", - "plaintext", - "pre", - "script", - "select", - "spacer", - "style", - "tbody", - "textarea", - "tfoot", - "thead", - "title", - "tr", - "ul", - "wbr" + (namespaces["html"], "address"), + (namespaces["html"], "area"), + (namespaces["html"], "article"), + (namespaces["html"], "aside"), + (namespaces["html"], "base"), + (namespaces["html"], "basefont"), + (namespaces["html"], "bgsound"), + (namespaces["html"], "blockquote"), + (namespaces["html"], "body"), + (namespaces["html"], "br"), + (namespaces["html"], "center"), + (namespaces["html"], "col"), + (namespaces["html"], "colgroup"), + (namespaces["html"], "command"), + (namespaces["html"], "datagrid"), + (namespaces["html"], "dd"), + (namespaces["html"], "details"), + (namespaces["html"], "dialog"), + (namespaces["html"], "dir"), + (namespaces["html"], "div"), + (namespaces["html"], "dl"), + (namespaces["html"], "dt"), + (namespaces["html"], "embed"), + (namespaces["html"], "event-source"), + (namespaces["html"], "fieldset"), + (namespaces["html"], "figure"), + (namespaces["html"], "footer"), + (namespaces["html"], "form"), + (namespaces["html"], "frame"), + (namespaces["html"], "frameset"), + (namespaces["html"], "h1"), + (namespaces["html"], "h2"), + (namespaces["html"], "h3"), + (namespaces["html"], "h4"), + (namespaces["html"], "h5"), + (namespaces["html"], "h6"), + (namespaces["html"], "head"), + (namespaces["html"], "header"), + (namespaces["html"], "hr"), + (namespaces["html"], "iframe"), + # Note that image is commented out in the spec as "this isn't an + # element that can end up on the stack, so it doesn't matter," + (namespaces["html"], "image"), + 
(namespaces["html"], "img"), + (namespaces["html"], "input"), + (namespaces["html"], "isindex"), + (namespaces["html"], "li"), + (namespaces["html"], "link"), + (namespaces["html"], "listing"), + (namespaces["html"], "menu"), + (namespaces["html"], "meta"), + (namespaces["html"], "nav"), + (namespaces["html"], "noembed"), + (namespaces["html"], "noframes"), + (namespaces["html"], "noscript"), + (namespaces["html"], "ol"), + (namespaces["html"], "optgroup"), + (namespaces["html"], "option"), + (namespaces["html"], "p"), + (namespaces["html"], "param"), + (namespaces["html"], "plaintext"), + (namespaces["html"], "pre"), + (namespaces["html"], "script"), + (namespaces["html"], "section"), + (namespaces["html"], "select"), + (namespaces["html"], "spacer"), + (namespaces["html"], "style"), + (namespaces["html"], "tbody"), + (namespaces["html"], "textarea"), + (namespaces["html"], "tfoot"), + (namespaces["html"], "thead"), + (namespaces["html"], "title"), + (namespaces["html"], "tr"), + (namespaces["html"], "ul"), + (namespaces["html"], "wbr") )) spaceCharacters = frozenset(( u"\t", u"\n", - u"\u000B", u"\u000C", u" ", u"\r" @@ -143,9 +423,10 @@ headingElements = ( "h6" ) -# XXX What about event-source and command? 
voidElements = frozenset(( "base", + "command", + "event-source", "link", "meta", "hr", @@ -155,7 +436,8 @@ voidElements = frozenset(( "param", "area", "col", - "input" + "input", + "source" )) cdataElements = frozenset(('title', 'textarea')) @@ -440,7 +722,7 @@ entities = { "kappa;": u"\u03BA", "lArr;": u"\u21D0", "lambda;": u"\u03BB", - "lang;": u"\u3008", + "lang;": u"\u27E8", "laquo;": u"\u00AB", "laquo": u"\u00AB", "larr;": u"\u2190", @@ -520,7 +802,7 @@ entities = { "quot": u"\u0022", "rArr;": u"\u21D2", "radic;": u"\u221A", - "rang;": u"\u3009", + "rang;": u"\u27E9", "raquo;": u"\u00BB", "raquo": u"\u00BB", "rarr;": u"\u2192", @@ -596,221 +878,255 @@ entities = { "zwnj;": u"\u200C" } -encodings = frozenset(( - "ansi_x3.4-1968", - "iso-ir-6", - "ansi_x3.4-1986", - "iso_646.irv:1991", - "ascii", - "iso646-us", - "us-ascii", - "us", - "ibm367", - "cp367", - "csascii", - "ks_c_5601-1987", - "korean", - "iso-2022-kr", - "csiso2022kr", - "euc-kr", - "iso-2022-jp", - "csiso2022jp", - "iso-2022-jp-2", - "iso-ir-58", - "chinese", - "csiso58gb231280", - "iso_8859-1:1987", - "iso-ir-100", - "iso_8859-1", - "iso-8859-1", - "latin1", - "l1", - "ibm819", - "cp819", - "csisolatin1", - "iso_8859-2:1987", - "iso-ir-101", - "iso_8859-2", - "iso-8859-2", - "latin2", - "l2", - "csisolatin2", - "iso_8859-3:1988", - "iso-ir-109", - "iso_8859-3", - "iso-8859-3", - "latin3", - "l3", - "csisolatin3", - "iso_8859-4:1988", - "iso-ir-110", - "iso_8859-4", - "iso-8859-4", - "latin4", - "l4", - "csisolatin4", - "iso_8859-6:1987", - "iso-ir-127", - "iso_8859-6", - "iso-8859-6", - "ecma-114", - "asmo-708", - "arabic", - "csisolatinarabic", - "iso_8859-7:1987", - "iso-ir-126", - "iso_8859-7", - "iso-8859-7", - "elot_928", - "ecma-118", - "greek", - "greek8", - "csisolatingreek", - "iso_8859-8:1988", - "iso-ir-138", - "iso_8859-8", - "iso-8859-8", - "hebrew", - "csisolatinhebrew", - "iso_8859-5:1988", - "iso-ir-144", - "iso_8859-5", - "iso-8859-5", - "cyrillic", - "csisolatincyrillic", - 
"iso_8859-9:1989", - "iso-ir-148", - "iso_8859-9", - "iso-8859-9", - "latin5", - "l5", - "csisolatin5", - "iso-8859-10", - "iso-ir-157", - "l6", - "iso_8859-10:1992", - "csisolatin6", - "latin6", - "hp-roman8", - "roman8", - "r8", - "ibm037", - "cp037", - "csibm037", - "ibm424", - "cp424", - "csibm424", - "ibm437", - "cp437", - "437", - "cspc8codepage437", - "ibm500", - "cp500", - "csibm500", - "ibm775", - "cp775", - "cspc775baltic", - "ibm850", - "cp850", - "850", - "cspc850multilingual", - "ibm852", - "cp852", - "852", - "cspcp852", - "ibm855", - "cp855", - "855", - "csibm855", - "ibm857", - "cp857", - "857", - "csibm857", - "ibm860", - "cp860", - "860", - "csibm860", - "ibm861", - "cp861", - "861", - "cp-is", - "csibm861", - "ibm862", - "cp862", - "862", - "cspc862latinhebrew", - "ibm863", - "cp863", - "863", - "csibm863", - "ibm864", - "cp864", - "csibm864", - "ibm865", - "cp865", - "865", - "csibm865", - "ibm866", - "cp866", - "866", - "csibm866", - "ibm869", - "cp869", - "869", - "cp-gr", - "csibm869", - "ibm1026", - "cp1026", - "csibm1026", - "koi8-r", - "cskoi8r", - "koi8-u", - "big5-hkscs", - "ptcp154", - "csptcp154", - "pt154", - "cp154", - "utf-7", - "utf-16be", - "utf-16le", - "utf-16", - "utf-8", - "iso-8859-13", - "iso-8859-14", - "iso-ir-199", - "iso_8859-14:1998", - "iso_8859-14", - "latin8", - "iso-celtic", - "l8", - "iso-8859-15", - "iso_8859-15", - "iso-8859-16", - "iso-ir-226", - "iso_8859-16:2001", - "iso_8859-16", - "latin10", - "l10", - "gbk", - "cp936", - "ms936", - "gb18030", - "shift_jis", - "ms_kanji", - "csshiftjis", - "euc-jp", - "gb2312", - "big5", - "csbig5", - "windows-1250", - "windows-1251", - "windows-1252", - "windows-1253", - "windows-1254", - "windows-1255", - "windows-1256", - "windows-1257", - "windows-1258", - "tis-620", - "hz-gb-2312", - )) \ No newline at end of file +encodings = { + '437': 'cp437', + '850': 'cp850', + '852': 'cp852', + '855': 'cp855', + '857': 'cp857', + '860': 'cp860', + '861': 'cp861', + '862': 'cp862', 
+ '863': 'cp863', + '865': 'cp865', + '866': 'cp866', + '869': 'cp869', + 'ansix341968': 'ascii', + 'ansix341986': 'ascii', + 'arabic': 'iso8859-6', + 'ascii': 'ascii', + 'asmo708': 'iso8859-6', + 'big5': 'big5', + 'big5hkscs': 'big5hkscs', + 'chinese': 'gbk', + 'cp037': 'cp037', + 'cp1026': 'cp1026', + 'cp154': 'ptcp154', + 'cp367': 'ascii', + 'cp424': 'cp424', + 'cp437': 'cp437', + 'cp500': 'cp500', + 'cp775': 'cp775', + 'cp819': 'windows-1252', + 'cp850': 'cp850', + 'cp852': 'cp852', + 'cp855': 'cp855', + 'cp857': 'cp857', + 'cp860': 'cp860', + 'cp861': 'cp861', + 'cp862': 'cp862', + 'cp863': 'cp863', + 'cp864': 'cp864', + 'cp865': 'cp865', + 'cp866': 'cp866', + 'cp869': 'cp869', + 'cp936': 'gbk', + 'cpgr': 'cp869', + 'cpis': 'cp861', + 'csascii': 'ascii', + 'csbig5': 'big5', + 'cseuckr': 'cp949', + 'cseucpkdfmtjapanese': 'euc_jp', + 'csgb2312': 'gbk', + 'cshproman8': 'hp-roman8', + 'csibm037': 'cp037', + 'csibm1026': 'cp1026', + 'csibm424': 'cp424', + 'csibm500': 'cp500', + 'csibm855': 'cp855', + 'csibm857': 'cp857', + 'csibm860': 'cp860', + 'csibm861': 'cp861', + 'csibm863': 'cp863', + 'csibm864': 'cp864', + 'csibm865': 'cp865', + 'csibm866': 'cp866', + 'csibm869': 'cp869', + 'csiso2022jp': 'iso2022_jp', + 'csiso2022jp2': 'iso2022_jp_2', + 'csiso2022kr': 'iso2022_kr', + 'csiso58gb231280': 'gbk', + 'csisolatin1': 'windows-1252', + 'csisolatin2': 'iso8859-2', + 'csisolatin3': 'iso8859-3', + 'csisolatin4': 'iso8859-4', + 'csisolatin5': 'windows-1254', + 'csisolatin6': 'iso8859-10', + 'csisolatinarabic': 'iso8859-6', + 'csisolatincyrillic': 'iso8859-5', + 'csisolatingreek': 'iso8859-7', + 'csisolatinhebrew': 'iso8859-8', + 'cskoi8r': 'koi8-r', + 'csksc56011987': 'cp949', + 'cspc775baltic': 'cp775', + 'cspc850multilingual': 'cp850', + 'cspc862latinhebrew': 'cp862', + 'cspc8codepage437': 'cp437', + 'cspcp852': 'cp852', + 'csptcp154': 'ptcp154', + 'csshiftjis': 'shift_jis', + 'csunicode11utf7': 'utf-7', + 'cyrillic': 'iso8859-5', + 'cyrillicasian': 'ptcp154', + 
'ebcdiccpbe': 'cp500', + 'ebcdiccpca': 'cp037', + 'ebcdiccpch': 'cp500', + 'ebcdiccphe': 'cp424', + 'ebcdiccpnl': 'cp037', + 'ebcdiccpus': 'cp037', + 'ebcdiccpwt': 'cp037', + 'ecma114': 'iso8859-6', + 'ecma118': 'iso8859-7', + 'elot928': 'iso8859-7', + 'eucjp': 'euc_jp', + 'euckr': 'cp949', + 'extendedunixcodepackedformatforjapanese': 'euc_jp', + 'gb18030': 'gb18030', + 'gb2312': 'gbk', + 'gb231280': 'gbk', + 'gbk': 'gbk', + 'greek': 'iso8859-7', + 'greek8': 'iso8859-7', + 'hebrew': 'iso8859-8', + 'hproman8': 'hp-roman8', + 'hzgb2312': 'hz', + 'ibm037': 'cp037', + 'ibm1026': 'cp1026', + 'ibm367': 'ascii', + 'ibm424': 'cp424', + 'ibm437': 'cp437', + 'ibm500': 'cp500', + 'ibm775': 'cp775', + 'ibm819': 'windows-1252', + 'ibm850': 'cp850', + 'ibm852': 'cp852', + 'ibm855': 'cp855', + 'ibm857': 'cp857', + 'ibm860': 'cp860', + 'ibm861': 'cp861', + 'ibm862': 'cp862', + 'ibm863': 'cp863', + 'ibm864': 'cp864', + 'ibm865': 'cp865', + 'ibm866': 'cp866', + 'ibm869': 'cp869', + 'iso2022jp': 'iso2022_jp', + 'iso2022jp2': 'iso2022_jp_2', + 'iso2022kr': 'iso2022_kr', + 'iso646irv1991': 'ascii', + 'iso646us': 'ascii', + 'iso88591': 'windows-1252', + 'iso885910': 'iso8859-10', + 'iso8859101992': 'iso8859-10', + 'iso885911987': 'windows-1252', + 'iso885913': 'iso8859-13', + 'iso885914': 'iso8859-14', + 'iso8859141998': 'iso8859-14', + 'iso885915': 'iso8859-15', + 'iso885916': 'iso8859-16', + 'iso8859162001': 'iso8859-16', + 'iso88592': 'iso8859-2', + 'iso885921987': 'iso8859-2', + 'iso88593': 'iso8859-3', + 'iso885931988': 'iso8859-3', + 'iso88594': 'iso8859-4', + 'iso885941988': 'iso8859-4', + 'iso88595': 'iso8859-5', + 'iso885951988': 'iso8859-5', + 'iso88596': 'iso8859-6', + 'iso885961987': 'iso8859-6', + 'iso88597': 'iso8859-7', + 'iso885971987': 'iso8859-7', + 'iso88598': 'iso8859-8', + 'iso885981988': 'iso8859-8', + 'iso88599': 'windows-1254', + 'iso885991989': 'windows-1254', + 'isoceltic': 'iso8859-14', + 'isoir100': 'windows-1252', + 'isoir101': 'iso8859-2', + 'isoir109': 
'iso8859-3', + 'isoir110': 'iso8859-4', + 'isoir126': 'iso8859-7', + 'isoir127': 'iso8859-6', + 'isoir138': 'iso8859-8', + 'isoir144': 'iso8859-5', + 'isoir148': 'windows-1254', + 'isoir149': 'cp949', + 'isoir157': 'iso8859-10', + 'isoir199': 'iso8859-14', + 'isoir226': 'iso8859-16', + 'isoir58': 'gbk', + 'isoir6': 'ascii', + 'koi8r': 'koi8-r', + 'koi8u': 'koi8-u', + 'korean': 'cp949', + 'ksc5601': 'cp949', + 'ksc56011987': 'cp949', + 'ksc56011989': 'cp949', + 'l1': 'windows-1252', + 'l10': 'iso8859-16', + 'l2': 'iso8859-2', + 'l3': 'iso8859-3', + 'l4': 'iso8859-4', + 'l5': 'windows-1254', + 'l6': 'iso8859-10', + 'l8': 'iso8859-14', + 'latin1': 'windows-1252', + 'latin10': 'iso8859-16', + 'latin2': 'iso8859-2', + 'latin3': 'iso8859-3', + 'latin4': 'iso8859-4', + 'latin5': 'windows-1254', + 'latin6': 'iso8859-10', + 'latin8': 'iso8859-14', + 'latin9': 'iso8859-15', + 'ms936': 'gbk', + 'mskanji': 'shift_jis', + 'pt154': 'ptcp154', + 'ptcp154': 'ptcp154', + 'r8': 'hp-roman8', + 'roman8': 'hp-roman8', + 'shiftjis': 'shift_jis', + 'tis620': 'cp874', + 'unicode11utf7': 'utf-7', + 'us': 'ascii', + 'usascii': 'ascii', + 'utf16': 'utf-16', + 'utf16be': 'utf-16-be', + 'utf16le': 'utf-16-le', + 'utf8': 'utf-8', + 'windows1250': 'cp1250', + 'windows1251': 'cp1251', + 'windows1252': 'cp1252', + 'windows1253': 'cp1253', + 'windows1254': 'cp1254', + 'windows1255': 'cp1255', + 'windows1256': 'cp1256', + 'windows1257': 'cp1257', + 'windows1258': 'cp1258', + 'windows936': 'gbk', + 'x-x-big5': 'big5'} + +tokenTypes = { + "Doctype":0, + "Characters":1, + "SpaceCharacters":2, + "StartTag":3, + "EndTag":4, + "EmptyTag":5, + "Comment":6, + "ParseError":7 +} + +tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"])) + + +prefixes = dict([(v,k) for k,v in namespaces.iteritems()]) +prefixes["http://www.w3.org/1998/Math/MathML"] = "math" + +class DataLossWarning(UserWarning): + pass + +class ReparseException(Exception): + pass diff --git 
a/planet/vendor/html5lib/filters/formfiller.py b/planet/vendor/html5lib/filters/formfiller.py new file mode 100644 index 0000000..9400171 --- /dev/null +++ b/planet/vendor/html5lib/filters/formfiller.py @@ -0,0 +1,127 @@ +# +# The goal is to finally have a form filler where you pass data for +# each form, using the algorithm for "Seeding a form with initial values" +# See http://www.whatwg.org/specs/web-forms/current-work/#seeding +# + +import _base + +from html5lib.constants import spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class SimpleFilter(_base.Filter): + def __init__(self, source, fieldStorage): + _base.Filter.__init__(self, source) + self.fieldStorage = fieldStorage + + def __iter__(self): + field_indices = {} + state = None + field_name = None + for token in _base.Filter.__iter__(self): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + name = token["name"].lower() + if name == "input": + field_name = None + field_type = None + input_value_index = -1 + input_checked_index = -1 + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == u"name": + field_name = v.strip(spaceCharacters) + elif n == u"type": + field_type = v.strip(spaceCharacters) + elif n == u"checked": + input_checked_index = i + elif n == u"value": + input_value_index = i + + value_list = self.fieldStorage.getlist(field_name) + field_index = field_indices.setdefault(field_name, 0) + if field_index < len(value_list): + value = value_list[field_index] + else: + value = "" + + if field_type in (u"checkbox", u"radio"): + if value_list: + if token["data"][input_value_index][1] == value: + if input_checked_index < 0: + token["data"].append((u"checked", u"")) + field_indices[field_name] = field_index + 1 + elif input_checked_index >= 0: + del token["data"][input_checked_index] + + elif field_type not in (u"button", u"submit", u"reset"): + if input_value_index >= 0: + token["data"][input_value_index] = (u"value", value) + else: + 
token["data"].append((u"value", value)) + field_indices[field_name] = field_index + 1 + + field_type = None + field_name = None + + elif name == "textarea": + field_type = "textarea" + field_name = dict((token["data"])[::-1])["name"] + + elif name == "select": + field_type = "select" + attributes = dict(token["data"][::-1]) + field_name = attributes.get("name") + is_select_multiple = "multiple" in attributes + is_selected_option_found = False + + elif field_type == "select" and field_name and name == "option": + option_selected_index = -1 + option_value = None + for i,(n,v) in enumerate(token["data"]): + n = n.lower() + if n == "selected": + option_selected_index = i + elif n == "value": + option_value = v.strip(spaceCharacters) + if option_value is None: + raise NotImplementedError("s without a value= attribute") + else: + value_list = self.fieldStorage.getlist(field_name) + if value_list: + field_index = field_indices.setdefault(field_name, 0) + if field_index < len(value_list): + value = value_list[field_index] + else: + value = "" + if (is_select_multiple or not is_selected_option_found) and option_value == value: + if option_selected_index < 0: + token["data"].append((u"selected", u"")) + field_indices[field_name] = field_index + 1 + is_selected_option_found = True + elif option_selected_index >= 0: + del token["data"][option_selected_index] + + elif field_type is not None and field_name and type == "EndTag": + name = token["name"].lower() + if name == field_type: + if name == "textarea": + value_list = self.fieldStorage.getlist(field_name) + if value_list: + field_index = field_indices.setdefault(field_name, 0) + if field_index < len(value_list): + value = value_list[field_index] + else: + value = "" + yield {"type": "Characters", "data": value} + field_indices[field_name] = field_index + 1 + + field_name = None + + elif name == "option" and field_type == "select": + pass # TODO: part of "option without value= attribute" processing + + elif field_type == 
"textarea": + continue # ignore token + + yield token diff --git a/planet/vendor/html5lib/filters/optionaltags.py b/planet/vendor/html5lib/filters/optionaltags.py index 73da96c..a77aa72 100644 --- a/planet/vendor/html5lib/filters/optionaltags.py +++ b/planet/vendor/html5lib/filters/optionaltags.py @@ -14,7 +14,8 @@ class Filter(_base.Filter): for previous, token, next in self.slider(): type = token["type"] if type == "StartTag": - if token["data"] or not self.is_optional_start(token["name"], previous, next): + if (token["data"] or + not self.is_optional_start(token["name"], previous, next)): yield token elif type == "EndTag": if not self.is_optional_end(token["name"], next): @@ -31,7 +32,11 @@ class Filter(_base.Filter): elif tagname == 'head': # A head element's start tag may be omitted if the first thing # inside the head element is an element. - return type == "StartTag" + # XXX: we also omit the start tag if the head element is empty + if type in ("StartTag", "EmptyTag"): + return True + elif type == "EndTag": + return next["name"] == "head" elif tagname == 'body': # A body element's start tag may be omitted if the first thing # inside the body element is not a space character or a comment, @@ -52,7 +57,7 @@ class Filter(_base.Filter): # inside the colgroup element is a col element, and if the element # is not immediately preceeded by another colgroup element whose # end tag has been omitted. - if type == "StartTag": + if type in ("StartTag", "EmptyTag"): # XXX: we do not look at the preceding event, so instead we never # omit the colgroup element's end tag when it is immediately # followed by another colgroup element. See is_optional_end. @@ -81,16 +86,13 @@ class Filter(_base.Filter): # An html element's end tag may be omitted if the html element # is not immediately followed by a space character or a comment. 
return type not in ("Comment", "SpaceCharacters") - elif tagname in ('li', 'optgroup', 'option', 'tr'): + elif tagname in ('li', 'optgroup', 'tr'): # A li element's end tag may be omitted if the li element is # immediately followed by another li element or if there is # no more content in the parent element. # An optgroup element's end tag may be omitted if the optgroup # element is immediately followed by another optgroup element, # or if there is no more content in the parent element. - # An option element's end tag may be omitted if the option - # element is immediately followed by another option element, - # or if there is no more content in the parent element. # A tr element's end tag may be omitted if the tr element is # immediately followed by another tr element, or if there is # no more content in the parent element. @@ -112,14 +114,39 @@ class Filter(_base.Filter): return False elif tagname == 'p': # A p element's end tag may be omitted if the p element is - # immediately followed by an address, blockquote, dl, fieldset, - # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table, - # or ul element, or if there is no more content in the parent + # immediately followed by an address, article, aside, + # blockquote, datagrid, dialog, dir, div, dl, fieldset, + # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu, + # nav, ol, p, pre, section, table, or ul, element, or if + # there is no more content in the parent element. 
+ if type in ("StartTag", "EmptyTag"): + return next["name"] in ('address', 'article', 'aside', + 'blockquote', 'datagrid', 'dialog', + 'dir', 'div', 'dl', 'fieldset', 'footer', + 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'header', 'hr', 'menu', 'nav', 'ol', + 'p', 'pre', 'section', 'table', 'ul') + else: + return type == "EndTag" or type is None + elif tagname == 'option': + # An option element's end tag may be omitted if the option + # element is immediately followed by another option element, + # or if it is immediately followed by an optgroup + # element, or if there is no more content in the parent # element. if type == "StartTag": - return next["name"] in ('address', 'blockquote', \ - 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \ - 'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul') + return next["name"] in ('option', 'optgroup') + else: + return type == "EndTag" or type is None + elif tagname in ('rt', 'rp'): + # An rt element's end tag may be omitted if the rt element is + # immediately followed by an rt or rp element, or if there is + # no more content in the parent element. + # An rp element's end tag may be omitted if the rp element is + # immediately followed by an rt or rp element, or if there is + # no more content in the parent element. 
+ if type == "StartTag": + return next["name"] in ('rt', 'rp') else: return type == "EndTag" or type is None elif tagname == 'colgroup': diff --git a/planet/vendor/html5lib/filters/sanitizer.py b/planet/vendor/html5lib/filters/sanitizer.py new file mode 100644 index 0000000..0023527 --- /dev/null +++ b/planet/vendor/html5lib/filters/sanitizer.py @@ -0,0 +1,8 @@ +import _base +from html5lib.sanitizer import HTMLSanitizerMixin + +class Filter(_base.Filter, HTMLSanitizerMixin): + def __iter__(self): + for token in _base.Filter.__iter__(self): + token = self.sanitize_token(token) + if token: yield token diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py index 1c0fd3e..a8e5a1f 100644 --- a/planet/vendor/html5lib/html5parser.py +++ b/planet/vendor/html5lib/html5parser.py @@ -1,19 +1,12 @@ -# Differences from the current specification are as follows: -# * Phases and insertion modes are one concept in parser.py. -# * EOF handling is slightly different to make sure , and -# always exist. - - try: frozenset except NameError: # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset -import gettext -_ = gettext.gettext import sys +import inputstream import tokenizer import treebuilders @@ -25,64 +18,93 @@ from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower from constants import scopingElements, formattingElements, specialElements from constants import headingElements, tableInsertModeElements from constants import cdataElements, rcdataElements, voidElements +from constants import tokenTypes, ReparseException, namespaces + +def parse(doc, treebuilder="simpletree", encoding=None, + namespaceHTMLElements=True): + tb = treebuilders.getTreeBuilder(treebuilder) + p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) + return p.parse(doc, encoding=encoding) class HTMLParser(object): """HTML parser. 
Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, strict = False, tree=simpletree.TreeBuilder, - tokenizer=tokenizer.HTMLTokenizer): + def __init__(self, tree = simpletree.TreeBuilder, + tokenizer = tokenizer.HTMLTokenizer, strict = False, + namespaceHTMLElements = True): """ strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be returned. Built in treebuilders can be accessed through html5lib.treebuilders.getTreeBuilder(treeType) + + tokenizer - a class that provides a stream of tokens to the treebuilder. + This may be replaced for e.g. a sanitizer which converts some tags to + text """ # Raise an exception on the first error encountered self.strict = strict - self.tree = tree() + self.tree = tree(namespaceHTMLElements) self.tokenizer_class = tokenizer self.errors = [] - # "quirks" / "almost-standards" / "standards" - self.quirksMode = "standards" - self.phases = { "initial": InitialPhase(self, self.tree), - "rootElement": RootElementPhase(self, self.tree), + "beforeHtml": BeforeHtmlPhase(self, self.tree), "beforeHead": BeforeHeadPhase(self, self.tree), "inHead": InHeadPhase(self, self.tree), # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree), "afterHead": AfterHeadPhase(self, self.tree), "inBody": InBodyPhase(self, self.tree), + "inCDataRCData": InCDataRCDataPhase(self, self.tree), "inTable": InTablePhase(self, self.tree), + "inTableText": InTableTextPhase(self, self.tree), "inCaption": InCaptionPhase(self, self.tree), "inColumnGroup": InColumnGroupPhase(self, self.tree), "inTableBody": InTableBodyPhase(self, self.tree), "inRow": InRowPhase(self, self.tree), "inCell": InCellPhase(self, self.tree), "inSelect": InSelectPhase(self, self.tree), + "inSelectInTable": InSelectInTablePhase(self, self.tree), + "inForeignContent": InForeignContentPhase(self, self.tree), "afterBody": AfterBodyPhase(self, self.tree), "inFrameset": 
InFramesetPhase(self, self.tree), "afterFrameset": AfterFramesetPhase(self, self.tree), - "trailingEnd": TrailingEndPhase(self, self.tree) + "afterAfterBody": AfterAfterBodyPhase(self, self.tree), + "afterAfterFrameset": AfterAfterFramesetPhase(self, self.tree), + # XXX after after frameset } def _parse(self, stream, innerHTML=False, container="div", - encoding=None, **kwargs): - + encoding=None, parseMeta=True, useChardet=True, **kwargs): + + self.innerHTMLMode = innerHTML + self.container = container + self.tokenizer = self.tokenizer_class(stream, encoding=encoding, + parseMeta=parseMeta, + useChardet=useChardet, **kwargs) + self.reset() + + while True: + try: + self.mainLoop() + break + except ReparseException, e: + self.reset() + + def reset(self): self.tree.reset() self.firstStartTag = False self.errors = [] + # "quirks" / "limited quirks" / "no quirks" + self.compatMode = "no quirks" - self.tokenizer = self.tokenizer_class(stream, encoding=encoding, - parseMeta=not innerHTML, **kwargs) - - if innerHTML: - self.innerHTML = container.lower() + if self.innerHTMLMode: + self.innerHTML = self.container.lower() if self.innerHTML in cdataElements: self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] @@ -94,38 +116,73 @@ class HTMLParser(object): # contentModelFlag already is PCDATA #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"] pass - self.phase = self.phases["rootElement"] + self.phase = self.phases["beforeHtml"] self.phase.insertHtmlElement() self.resetInsertionMode() else: self.innerHTML = False self.phase = self.phases["initial"] - # We only seem to have InBodyPhase testcases where the following is - # relevant ... 
need others too self.lastPhase = None + self.secondaryPhase = None - # XXX This is temporary for the moment so there isn't any other - # changes needed for the parser to work with the iterable tokenizer - for token in self.tokenizer: - token = self.normalizeToken(token) + self.beforeRCDataPhase = None + + self.framesetOK = True + + def mainLoop(self): + (CharactersToken, + SpaceCharactersToken, + StartTagToken, + EndTagToken, + CommentToken, + DoctypeToken) = (tokenTypes["Characters"], + tokenTypes["SpaceCharacters"], + tokenTypes["StartTag"], + tokenTypes["EndTag"], + tokenTypes["Comment"], + tokenTypes["Doctype"]) + + CharactersToken = tokenTypes["Characters"] + SpaceCharactersToken = tokenTypes["SpaceCharacters"] + StartTagToken = tokenTypes["StartTag"] + EndTagToken = tokenTypes["EndTag"] + CommentToken = tokenTypes["Comment"] + DoctypeToken = tokenTypes["Doctype"] + + + for token in self.normalizedTokens(): + #print self.phase.__class__.__name__ + #print token type = token["type"] - method = getattr(self.phase, "process%s" % type, None) - if type in ("Characters", "SpaceCharacters", "Comment"): - method(token["data"]) - elif type == "StartTag": - method(token["name"], token["data"]) - elif type == "EndTag": - method(token["name"]) - elif type == "Doctype": - method(token["name"], token["publicId"], token["systemId"], token["correct"]) + if type == CharactersToken: + self.phase.processCharacters(token) + elif type == SpaceCharactersToken: + self.phase.processSpaceCharacters(token) + elif type == StartTagToken: + self.selfClosingAcknowledged = False + self.phase.processStartTag(token) + if (token["selfClosing"] + and not self.selfClosingAcknowledged): + self.parseError("non-void-element-with-trailing-solidus", + {"name":token["name"]}) + elif type == EndTagToken: + self.phase.processEndTag(token) + elif type == CommentToken: + self.phase.processComment(token) + elif type == DoctypeToken: + self.phase.processDoctype(token) else: - self.parseError(token["data"]) + 
self.parseError(token["data"], token.get("datavars", {})) # When the loop finishes it's EOF self.phase.processEOF() - def parse(self, stream, encoding=None): + def normalizedTokens(self): + for token in self.tokenizer: + yield self.normalizeToken(token) + + def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): """Parse a HTML document into a well-formed tree stream - a filelike object or string containing the HTML to be parsed @@ -135,10 +192,12 @@ class HTMLParser(object): regardless of any BOM or later declaration (such as in a meta element) """ - self._parse(stream, innerHTML=False, encoding=encoding) + self._parse(stream, innerHTML=False, encoding=encoding, + parseMeta=parseMeta, useChardet=useChardet) return self.tree.getDocument() - def parseFragment(self, stream, container="div", encoding=None): + def parseFragment(self, stream, container="div", encoding=None, + parseMeta=False, useChardet=True): """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property @@ -154,31 +213,119 @@ class HTMLParser(object): self._parse(stream, True, container=container, encoding=encoding) return self.tree.getFragment() - def parseError(self, data="XXX ERROR MESSAGE NEEDED"): - # XXX The idea is to make data mandatory. - self.errors.append((self.tokenizer.stream.position(), data)) + def parseError(self, errorcode="XXX-undefined-error", datavars={}): + # XXX The idea is to make errorcode mandatory. + self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ - if token["type"] == "EmptyTag": - # When a solidus (/) is encountered within a tag name what happens - # depends on whether the current tag name matches that of a void - # element. If it matches a void element atheists did the wrong - # thing and if it doesn't it's wrong for everyone. 
- - if token["name"] not in voidElements: - self.parseError(_(u"Solidus (/) incorrectly placed in tag.")) - - token["type"] = "StartTag" - - if token["type"] == "StartTag": + if token["type"] == tokenTypes["StartTag"]: token["data"] = dict(token["data"][::-1]) return token + def adjustMathMLAttributes(self, token): + replacements = {"definitionurl":"definitionURL"} + for k,v in replacements.iteritems(): + if k in token["data"]: + token["data"][v] = token["data"][k] + del token["data"][k] + + def adjustSVGAttributes(self, token): + replacements = { + "attributename" : "attributeName", + "attributetype" : "attributeType", + "basefrequency" : "baseFrequency", + "baseprofile" : "baseProfile", + "calcmode" : "calcMode", + "clippathunits" : "clipPathUnits", + "contentscripttype" : "contentScriptType", + "contentstyletype" : "contentStyleType", + "diffuseconstant" : "diffuseConstant", + "edgemode" : "edgeMode", + "externalresourcesrequired" : "externalResourcesRequired", + "filterres" : "filterRes", + "filterunits" : "filterUnits", + "glyphref" : "glyphRef", + "gradienttransform" : "gradientTransform", + "gradientunits" : "gradientUnits", + "kernelmatrix" : "kernelMatrix", + "kernelunitlength" : "kernelUnitLength", + "keypoints" : "keyPoints", + "keysplines" : "keySplines", + "keytimes" : "keyTimes", + "lengthadjust" : "lengthAdjust", + "limitingconeangle" : "limitingConeAngle", + "markerheight" : "markerHeight", + "markerunits" : "markerUnits", + "markerwidth" : "markerWidth", + "maskcontentunits" : "maskContentUnits", + "maskunits" : "maskUnits", + "numoctaves" : "numOctaves", + "pathlength" : "pathLength", + "patterncontentunits" : "patternContentUnits", + "patterntransform" : "patternTransform", + "patternunits" : "patternUnits", + "pointsatx" : "pointsAtX", + "pointsaty" : "pointsAtY", + "pointsatz" : "pointsAtZ", + "preservealpha" : "preserveAlpha", + "preserveaspectratio" : "preserveAspectRatio", + "primitiveunits" : "primitiveUnits", + "refx" : "refX", + "refy" : 
"refY", + "repeatcount" : "repeatCount", + "repeatdur" : "repeatDur", + "requiredextensions" : "requiredExtensions", + "requiredfeatures" : "requiredFeatures", + "specularconstant" : "specularConstant", + "specularexponent" : "specularExponent", + "spreadmethod" : "spreadMethod", + "startoffset" : "startOffset", + "stddeviation" : "stdDeviation", + "stitchtiles" : "stitchTiles", + "surfacescale" : "surfaceScale", + "systemlanguage" : "systemLanguage", + "tablevalues" : "tableValues", + "targetx" : "targetX", + "targety" : "targetY", + "textlength" : "textLength", + "viewbox" : "viewBox", + "viewtarget" : "viewTarget", + "xchannelselector" : "xChannelSelector", + "ychannelselector" : "yChannelSelector", + "zoomandpan" : "zoomAndPan" + } + for originalName in token["data"].keys(): + if originalName in replacements: + svgName = replacements[originalName] + token["data"][svgName] = token["data"][originalName] + del token["data"][originalName] + + def adjustForeignAttributes(self, token): + replacements = { + "xlink:actuate":("xlink", "actuate", namespaces["xlink"]), + "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]), + "xlink:href":("xlink", "href", namespaces["xlink"]), + "xlink:role":("xlink", "role", namespaces["xlink"]), + "xlink:show":("xlink", "show", namespaces["xlink"]), + "xlink:title":("xlink", "title", namespaces["xlink"]), + "xlink:type":("xlink", "type", namespaces["xlink"]), + "xml:base":("xml", "base", namespaces["xml"]), + "xml:lang":("xml", "lang", namespaces["xml"]), + "xml:space":("xml", "space", namespaces["xml"]), + "xmlns":(None, "xmlns", namespaces["xmlns"]), + "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"]) + } + + for originalName in token["data"].iterkeys(): + if originalName in replacements: + foreignName = replacements[originalName] + token["data"][foreignName] = token["data"][originalName] + del token["data"][originalName] def resetInsertionMode(self): # The name of this method is mostly historical. 
(It's also used in the @@ -215,6 +362,10 @@ class HTMLParser(object): if nodeName in newModes: self.phase = self.phases[newModes[nodeName]] break + elif node.namespace in (namespaces["mathml"], namespaces["svg"]): + self.phase = self.phases["inForeignContent"] + self.secondaryPhase = self.phases["inBody"] + break elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] @@ -225,6 +376,19 @@ class HTMLParser(object): self.phase = self.phases["inBody"] break + def parseRCDataCData(self, token, contentType): + """Generic (R)CDATA Parsing algorithm + contentType - RCDATA or CDATA + """ + assert contentType in ("CDATA", "RCDATA") + + element = self.tree.insertElement(token) + self.tokenizer.contentModelFlag = contentModelFlags[contentType] + + self.originalPhase = self.phase + + self.phase = self.phases["inCDataRCData"] + class Phase(object): """Base class for helper object that implements each phase of processing """ @@ -244,48 +408,37 @@ class Phase(object): self.tree = tree def processEOF(self): - self.tree.generateImpliedEndTags() - if len(self.tree.openElements) > 2: - self.parser.parseError(_(u"Unexpected end of file. " - u"Missing closing tags.")) - elif len(self.tree.openElements) == 2 and\ - self.tree.openElements[1].name != "body": - # This happens for framesets or something? - self.parser.parseError(_(u"Unexpected end of file. Expected end " - u"tag (%s) first.") % (self.tree.openElements[1].name,)) - elif self.parser.innerHTML and len(self.tree.openElements) > 1 : - # XXX This is not what the specification says. Not sure what to do - # here. - self.parser.parseError(_(u"XXX innerHTML EOF")) - # Betting ends. + raise NotImplementedError - def processComment(self, data): + def processComment(self, token): # For most phases the following is correct. Where it's not it will be # overridden. 
- self.tree.insertComment(data, self.tree.openElements[-1]) + self.tree.insertComment(token, self.tree.openElements[-1]) - def processDoctype(self, name, publicId, systemId, correct): - self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored.")) + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") - def processSpaceCharacters(self, data): - self.tree.insertText(data) + def processCharacters(self, token): + self.tree.insertText(token["data"]) - def processStartTag(self, name, attributes): - self.startTagHandler[name](name, attributes) + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) - def startTagHtml(self, name, attributes): - if self.parser.firstStartTag == False and name == "html": - self.parser.parseError(_(u"html needs to be the first start tag.")) + def processStartTag(self, token): + self.startTagHandler[token["name"]](token) + + def startTagHtml(self, token): + if self.parser.firstStartTag == False and token["name"] == "html": + self.parser.parseError("non-html-root") # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). - for attr, value in attributes.iteritems(): + for attr, value in token["data"].iteritems(): if attr not in self.tree.openElements[0].attributes: self.tree.openElements[0].attributes[attr] = value self.parser.firstStartTag = False - def processEndTag(self, name): - self.endTagHandler[name](name) - + def processEndTag(self, token): + self.endTagHandler[token["name"]](token) class InitialPhase(Phase): # This phase deals with error handling as well which is currently not @@ -293,136 +446,153 @@ class InitialPhase(Phase): # "quirks mode". It is expected that a future version of HTML5 will defin # this. def processEOF(self): - self.parser.parseError(_(u"Unexpected End of file. 
Expected DOCTYPE.")) - self.parser.phase = self.parser.phases["rootElement"] + self.parser.parseError("expected-doctype-but-got-eof") + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] self.parser.phase.processEOF() - def processComment(self, data): - self.tree.insertComment(data, self.tree.document) + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) - def processDoctype(self, name, publicId, systemId, correct): - nameLower = name.translate(asciiUpper2Lower) - if nameLower != "html" or publicId != None or\ - systemId != None: - self.parser.parseError(_(u"Erroneous DOCTYPE.")) - # XXX need to update DOCTYPE tokens - self.tree.insertDoctype(name, publicId, systemId) + def processDoctype(self, token): + + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId != None or + systemId != None): + self.parser.parseError("unknown-doctype") - if publicId == None: - publicId = "" + if publicId is None: + publicId = "" + if systemId is None: + systemId = "" + + self.tree.insertDoctype(token) + if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) + publicId = publicId.translate(asciiUpper2Lower) - if nameLower != "html": - # XXX quirks mode - pass - else: - if publicId in\ - ("+//silmaril//dtd html pro v0r11 19970101//en", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en", - "-//as//dtd html 3.0 aswedit + extensions//en", - "-//ietf//dtd html 2.0 level 1//en", - "-//ietf//dtd html 2.0 level 2//en", - "-//ietf//dtd html 2.0 strict level 1//en", - "-//ietf//dtd html 2.0 strict level 2//en", - "-//ietf//dtd html 2.0 strict//en", - "-//ietf//dtd html 2.0//en", - "-//ietf//dtd html 2.1e//en", - "-//ietf//dtd html 3.0//en", - "-//ietf//dtd html 3.0//en//", - "-//ietf//dtd html 3.2 final//en", - "-//ietf//dtd html 3.2//en", - "-//ietf//dtd html 3//en", - "-//ietf//dtd html level 0//en", - 
"-//ietf//dtd html level 0//en//2.0", - "-//ietf//dtd html level 1//en", - "-//ietf//dtd html level 1//en//2.0", - "-//ietf//dtd html level 2//en", - "-//ietf//dtd html level 2//en//2.0", - "-//ietf//dtd html level 3//en", - "-//ietf//dtd html level 3//en//3.0", - "-//ietf//dtd html strict level 0//en", - "-//ietf//dtd html strict level 0//en//2.0", - "-//ietf//dtd html strict level 1//en", - "-//ietf//dtd html strict level 1//en//2.0", - "-//ietf//dtd html strict level 2//en", - "-//ietf//dtd html strict level 2//en//2.0", - "-//ietf//dtd html strict level 3//en", - "-//ietf//dtd html strict level 3//en//3.0", - "-//ietf//dtd html strict//en", - "-//ietf//dtd html strict//en//2.0", - "-//ietf//dtd html strict//en//3.0", - "-//ietf//dtd html//en", - "-//ietf//dtd html//en//2.0", - "-//ietf//dtd html//en//3.0", - "-//metrius//dtd metrius presentational//en", - "-//microsoft//dtd internet explorer 2.0 html strict//en", - "-//microsoft//dtd internet explorer 2.0 html//en", - "-//microsoft//dtd internet explorer 2.0 tables//en", - "-//microsoft//dtd internet explorer 3.0 html strict//en", - "-//microsoft//dtd internet explorer 3.0 html//en", - "-//microsoft//dtd internet explorer 3.0 tables//en", - "-//netscape comm. corp.//dtd html//en", - "-//netscape comm. 
corp.//dtd strict html//en", - "-//o'reilly and associates//dtd html 2.0//en", - "-//o'reilly and associates//dtd html extended 1.0//en", - "-//spyglass//dtd html 2.0 extended//en", - "-//sq//dtd html 2.0 hotmetal + extensions//en", - "-//sun microsystems corp.//dtd hotjava html//en", - "-//sun microsystems corp.//dtd hotjava strict html//en", - "-//w3c//dtd html 3 1995-03-24//en", - "-//w3c//dtd html 3.2 draft//en", - "-//w3c//dtd html 3.2 final//en", - "-//w3c//dtd html 3.2//en", - "-//w3c//dtd html 3.2s draft//en", - "-//w3c//dtd html 4.0 frameset//en", - "-//w3c//dtd html 4.0 transitional//en", - "-//w3c//dtd html experimental 19960712//en", - "-//w3c//dtd html experimental 970421//en", - "-//w3c//dtd w3 html//en", - "-//w3o//dtd w3 html 3.0//en", - "-//w3o//dtd w3 html 3.0//en//", - "-//w3o//dtd w3 html strict 3.0//en//", - "-//webtechs//dtd mozilla html 2.0//en", - "-//webtechs//dtd mozilla html//en", - "-/w3c/dtd html 4.0 transitional/en", - "html")\ - or (publicId in\ - ("-//w3c//dtd html 4.01 frameset//EN", - "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\ - or (systemId != None and\ - systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): - #XXX quirks mode - pass + if (not correct or token["name"] != "html" + or publicId in + ("+//silmaril//dtd html pro v0r11 19970101//en", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en", + "-//as//dtd html 3.0 aswedit + extensions//en", + "-//ietf//dtd html 2.0 level 1//en", + "-//ietf//dtd html 2.0 level 2//en", + "-//ietf//dtd html 2.0 strict level 1//en", + "-//ietf//dtd html 2.0 strict level 2//en", + "-//ietf//dtd html 2.0 strict//en", + "-//ietf//dtd html 2.0//en", + "-//ietf//dtd html 2.1e//en", + "-//ietf//dtd html 3.0//en", + "-//ietf//dtd html 3.0//en//", + "-//ietf//dtd html 3.2 final//en", + "-//ietf//dtd html 3.2//en", + "-//ietf//dtd html 3//en", + "-//ietf//dtd html level 0//en", + "-//ietf//dtd html level 0//en//2.0", + "-//ietf//dtd html level 1//en", + 
"-//ietf//dtd html level 1//en//2.0", + "-//ietf//dtd html level 2//en", + "-//ietf//dtd html level 2//en//2.0", + "-//ietf//dtd html level 3//en", + "-//ietf//dtd html level 3//en//3.0", + "-//ietf//dtd html strict level 0//en", + "-//ietf//dtd html strict level 0//en//2.0", + "-//ietf//dtd html strict level 1//en", + "-//ietf//dtd html strict level 1//en//2.0", + "-//ietf//dtd html strict level 2//en", + "-//ietf//dtd html strict level 2//en//2.0", + "-//ietf//dtd html strict level 3//en", + "-//ietf//dtd html strict level 3//en//3.0", + "-//ietf//dtd html strict//en", + "-//ietf//dtd html strict//en//2.0", + "-//ietf//dtd html strict//en//3.0", + "-//ietf//dtd html//en", + "-//ietf//dtd html//en//2.0", + "-//ietf//dtd html//en//3.0", + "-//metrius//dtd metrius presentational//en", + "-//microsoft//dtd internet explorer 2.0 html strict//en", + "-//microsoft//dtd internet explorer 2.0 html//en", + "-//microsoft//dtd internet explorer 2.0 tables//en", + "-//microsoft//dtd internet explorer 3.0 html strict//en", + "-//microsoft//dtd internet explorer 3.0 html//en", + "-//microsoft//dtd internet explorer 3.0 tables//en", + "-//netscape comm. corp.//dtd html//en", + "-//netscape comm. 
corp.//dtd strict html//en", + "-//o'reilly and associates//dtd html 2.0//en", + "-//o'reilly and associates//dtd html extended 1.0//en", + "-//o'reilly and associates//dtd html extended relaxed 1.0//en", + "-//spyglass//dtd html 2.0 extended//en", + "-//sq//dtd html 2.0 hotmetal + extensions//en", + "-//sun microsystems corp.//dtd hotjava html//en", + "-//sun microsystems corp.//dtd hotjava strict html//en", + "-//w3c//dtd html 3 1995-03-24//en", + "-//w3c//dtd html 3.2 draft//en", + "-//w3c//dtd html 3.2 final//en", + "-//w3c//dtd html 3.2//en", + "-//w3c//dtd html 3.2s draft//en", + "-//w3c//dtd html 4.0 frameset//en", + "-//w3c//dtd html 4.0 transitional//en", + "-//w3c//dtd html experimental 19960712//en", + "-//w3c//dtd html experimental 970421//en", + "-//w3c//dtd w3 html//en", + "-//w3o//dtd w3 html 3.0//en", + "-//w3o//dtd w3 html 3.0//en//", + "-//w3o//dtd w3 html strict 3.0//en//", + "-//webtechs//dtd mozilla html 2.0//en", + "-//webtechs//dtd mozilla html//en", + "-/w3c/dtd html 4.0 transitional/en", + "html") + or (publicId in + ("-//w3c//dtd html 4.01 frameset//EN", + "-//w3c//dtd html 4.01 transitional//EN") and + systemId == None) + or (systemId != None and + systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")): + self.parser.compatMode = "quirks" + elif (publicId in + ("-//w3c//dtd xhtml 1.0 frameset//EN", + "-//w3c//dtd xhtml 1.0 transitional//EN") + or (publicId in + ("-//w3c//dtd html 4.01 frameset//EN", + "-//w3c//dtd html 4.01 transitional//EN") and + systemId == None)): + self.parser.compatMode = "limited quirks" - self.parser.phase = self.parser.phases["rootElement"] + self.parser.phase = self.parser.phases["beforeHtml"] - def processSpaceCharacters(self, data): + def processSpaceCharacters(self, token): pass - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters. 
" - u"Expected DOCTYPE.")) - self.parser.phase = self.parser.phases["rootElement"] - self.parser.phase.processCharacters(data) + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + self.parser.phase.processCharacters(token) - def processStartTag(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,)) - self.parser.phase = self.parser.phases["rootElement"] - self.parser.phase.processStartTag(name, attributes) + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + self.parser.phase.processStartTag(token) - def processEndTag(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,)) - self.parser.phase = self.parser.phases["rootElement"] - self.parser.phase.processEndTag(name) + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + self.parser.phase.processEndTag(token) -class RootElementPhase(Phase): +class BeforeHtmlPhase(Phase): # helper methods def insertHtmlElement(self): - element = self.tree.createElement("html", {}) - self.tree.openElements.append(element) - self.tree.document.appendChild(element) + self.tree.insertRoot(impliedTagToken("html", "StartTag")) self.parser.phase = self.parser.phases["beforeHead"] # other @@ -430,25 +600,25 @@ class RootElementPhase(Phase): self.insertHtmlElement() self.parser.phase.processEOF() - def processComment(self, data): - self.tree.insertComment(data, self.tree.document) + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) - def 
processSpaceCharacters(self, data): + def processSpaceCharacters(self, token): pass - def processCharacters(self, data): + def processCharacters(self, token): self.insertHtmlElement() - self.parser.phase.processCharacters(data) + self.parser.phase.processCharacters(token) - def processStartTag(self, name, attributes): - if name == "html": + def processStartTag(self, token): + if token["name"] == "html": self.parser.firstStartTag = True self.insertHtmlElement() - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def processEndTag(self, name): + def processEndTag(self, token): self.insertHtmlElement() - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) class BeforeHeadPhase(Phase): @@ -462,33 +632,37 @@ class BeforeHeadPhase(Phase): self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ - (("html", "head", "body", "br", "p"), self.endTagImplyHead) + (("head", "br"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther def processEOF(self): - self.startTagHead("head", {}) + self.startTagHead(impliedTagToken("head", "StartTag")) self.parser.phase.processEOF() - def processCharacters(self, data): - self.startTagHead("head", {}) - self.parser.phase.processCharacters(data) + def processSpaceCharacters(self, token): + pass - def startTagHead(self, name, attributes): - self.tree.insertElement(name, attributes) + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processCharacters(token) + + def startTagHead(self, token): + self.tree.insertElement(token) self.tree.headPointer = self.tree.openElements[-1] self.parser.phase = self.parser.phases["inHead"] - def startTagOther(self, name, attributes): - self.startTagHead("head", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + 
self.parser.phase.processStartTag(token) - def endTagImplyHead(self, name): - self.startTagHead("head", {}) - self.parser.phase.processEndTag(name) + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + self.parser.phase.processEndTag(token) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,)) + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) class InHeadPhase(Phase): def __init__(self, parser, tree): @@ -497,19 +671,18 @@ class InHeadPhase(Phase): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), - ("style", self.startTagStyle), - ("noscript", self.startTagNoScript), + (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), ("script", self.startTagScript), - (("base", "link", "meta"), self.startTagBaseLinkMeta), + (("base", "link", "command", "eventsource"), + self.startTagBaseLinkCommandEventsource), + ("meta", self.startTagMeta), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self. endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), - (("html", "body", "br", "p"), self.endTagImplyAfterHead), - (("title", "style", "script", "noscript"), - self.endTagTitleStyleScriptNoScript) + (("br", "html", "body"), self.endTagHtmlBodyBr) ]) self.endTagHandler.default = self.endTagOther @@ -519,104 +692,75 @@ class InHeadPhase(Phase): self.tree.headPointer.appendChild(element) else: assert self.parser.innerHTML - self.tree.openElements[-1].appendChild(element) + self.tree.openElementsw[-1].appendChild(element) # the real thing - def processEOF(self): - if self.tree.openElements[-1].name in ("title", "style", "script"): - self.parser.parseError(_(u"Unexpected end of file. 
" - u"Expected end tag (%s).") % (self.tree.openElements[-1].name,)) - self.tree.openElements.pop() + def processEOF (self): self.anythingElse() self.parser.phase.processEOF() - def processCharacters(self, data): - if self.tree.openElements[-1].name in\ - ("title", "style", "script", "noscript"): - self.tree.insertText(data) - else: - self.anythingElse() - self.parser.phase.processCharacters(data) - - def startTagHead(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored")) - - def startTagTitle(self, name, attributes): - element = self.tree.createElement(name, attributes) - self.appendToHead(element) - self.tree.openElements.append(element) - self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"] - - def startTagStyle(self, name, attributes): - element = self.tree.createElement(name, attributes) - if self.tree.headPointer is not None and\ - self.parser.phase == self.parser.phases["inHead"]: - self.appendToHead(element) - else: - self.tree.openElements[-1].appendChild(element) - self.tree.openElements.append(element) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] - - def startTagNoScript(self, name, attributes): - # XXX Need to decide whether to implement the scripting disabled case. 
- element = self.tree.createElement(name, attributes) - if self.tree.headPointer is not None and\ - self.parser.phase == self.parser.phases["inHead"]: - self.appendToHead(element) - else: - self.tree.openElements[-1].appendChild(element) - self.tree.openElements.append(element) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] - - def startTagScript(self, name, attributes): - #XXX Inner HTML case may be wrong - element = self.tree.createElement(name, attributes) - element._flags.append("parser-inserted") - if (self.tree.headPointer is not None and - self.parser.phase == self.parser.phases["inHead"]): - self.appendToHead(element) - else: - self.tree.openElements[-1].appendChild(element) - self.tree.openElements.append(element) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] - - def startTagBaseLinkMeta(self, name, attributes): - element = self.tree.createElement(name, attributes) - if (self.tree.headPointer is not None and - self.parser.phase == self.parser.phases["inHead"]): - self.appendToHead(element) - else: - self.tree.openElements[-1].appendChild(element) - - def startTagOther(self, name, attributes): + def processCharacters(self, token): self.anythingElse() - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processCharacters(token) - def endTagHead(self, name): - if self.tree.openElements[-1].name == "head": - self.tree.openElements.pop() - else: - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % u'head') + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") + + def startTagBaseLinkCommandEventsource(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif "content" in attributes: + data = inputstream.EncodingBytes( + attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0])) + parser = inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataCData(token, "RCDATA") + + def startTagNoScriptNoFramesStyle(self, token): + #Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataCData(token, "CDATA") + + def startTagScript(self, token): + #I think this is equivalent to the CDATA stuff since we don't execute script + #self.tree.insertElement(token) + self.parser.parseRCDataCData(token, "CDATA") + + def startTagOther(self, token): + self.anythingElse() + self.parser.phase.processStartTag(token) + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s"%node.name self.parser.phase = self.parser.phases["afterHead"] - def endTagImplyAfterHead(self, name): + def endTagHtmlBodyBr(self, token): self.anythingElse() - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagTitleStyleScriptNoScript(self, name): - if self.tree.openElements[-1].name == name: - 
self.tree.openElements.pop() - else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): - if self.tree.openElements[-1].name == "head": - self.endTagHead("head") - else: - self.parser.phase = self.parser.phases["afterHead"] + self.endTagHead(impliedTagToken("head")) + # XXX If we implement a parser for which scripting is disabled we need to # implement this phase. @@ -631,43 +775,61 @@ class AfterHeadPhase(Phase): ("html", self.startTagHtml), ("body", self.startTagBody), ("frameset", self.startTagFrameset), - (("base", "link", "meta", "script", "style", "title"), - self.startTagFromHead) + (("base", "link", "meta", "noframes", "script", "style", "title"), + self.startTagFromHead), + ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), + self.endTagHtmlBodyBr)]) + self.endTagHandler.default = self.endTagOther def processEOF(self): self.anythingElse() self.parser.phase.processEOF() - def processCharacters(self, data): + def processCharacters(self, token): self.anythingElse() - self.parser.phase.processCharacters(data) + self.parser.phase.processCharacters(token) - def startTagBody(self, name, attributes): - self.tree.insertElement(name, attributes) + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inBody"] - def startTagFrameset(self, name, attributes): - self.tree.insertElement(name, attributes) + def startTagFrameset(self, token): + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inFrameset"] - def startTagFromHead(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) that 
can be in head. Moved.") % (name,)) - self.parser.phase = self.parser.phases["inHead"] - self.parser.phase.processStartTag(name, attributes) + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break - def startTagOther(self, name, attributes): - self.anythingElse() - self.parser.phase.processStartTag(name, attributes) + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name":token["name"]}) - def processEndTag(self, name): + def startTagOther(self, token): self.anythingElse() - self.parser.phase.processEndTag(name) + self.parser.phase.processStartTag(token) + + def endTagHtmlBodyBr(self, token): + #This is not currently in the spec + self.anythingElse() + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name":token["name"]}) def anythingElse(self): - self.tree.insertElement("body", {}) + self.tree.insertElement(impliedTagToken("body", "StartTag")) self.parser.phase = self.parser.phases["inBody"] + self.parser.framesetOK = True class InBodyPhase(Phase): @@ -681,137 +843,158 @@ class InBodyPhase(Phase): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), - (("base", "link", "meta", "script", "style"), + (("base", "link", "meta", "script", "style", "title"), self.startTagProcessInHead), - ("title", self.startTagTitle), ("body", self.startTagBody), - (("address", "blockquote", "center", "dir", "div", "dl", - "fieldset", "listing", "menu", "ol", "p", "pre", "ul"), + ("frameset", self.startTagFrameset), + (("address", "article", "aside", "blockquote", "center", "datagrid", + "details", "dialog", "dir", "div", "dl", "fieldset", "figure", + "footer", "h1", 
"h2", "h3", "h4", "h5", "h6", "header", "listing", + "menu", "nav", "ol", "p", "pre", "section", "ul"), self.startTagCloseP), ("form", self.startTagForm), (("li", "dd", "dt"), self.startTagListItem), ("plaintext",self.startTagPlaintext), (headingElements, self.startTagHeading), ("a", self.startTagA), - (("b", "big", "em", "font", "i", "s", "small", "strike", "strong", - "tt", "u"),self.startTagFormatting), + (("b", "big", "code", "em", "font", "i", "s", "small", "strike", + "strong", "tt", "u"),self.startTagFormatting), ("nobr", self.startTagNobr), ("button", self.startTagButton), - (("marquee", "object"), self.startTagMarqueeObject), + (("applet", "marquee", "object"), self.startTagAppletMarqueeObject), ("xmp", self.startTagXmp), ("table", self.startTagTable), - (("area", "basefont", "bgsound", "br", "embed", "img", "param", - "spacer", "wbr"), self.startTagVoidFormatting), + (("area", "basefont", "bgsound", "br", "embed", "img", "input", + "keygen", "param", "spacer", "wbr"), self.startTagVoidFormatting), ("hr", self.startTagHr), ("image", self.startTagImage), - ("input", self.startTagInput), ("isindex", self.startTagIsIndex), ("textarea", self.startTagTextarea), - (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata), + ("iframe", self.startTagIFrame), + (("noembed", "noframes", "noscript"), self.startTagCdata), ("select", self.startTagSelect), - (("caption", "col", "colgroup", "frame", "frameset", "head", - "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", + (("rp", "rt"), self.startTagRpRt), + (("option", "optgroup"), self.startTagOpt), + (("math"), self.startTagMath), + (("svg"), self.startTagSvg), + (("caption", "col", "colgroup", "frame", "head", + "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagMisplaced), - (("event-source", "section", "nav", "article", "aside", "header", - "footer", "datagrid", "command"), self.startTagNew) + (("event-source", "command"), self.startTagNew) ]) self.startTagHandler.default = 
self.startTagOther self.endTagHandler = utils.MethodDispatcher([ - ("p",self.endTagP), ("body",self.endTagBody), ("html",self.endTagHtml), - (("address", "blockquote", "center", "div", "dl", "fieldset", - "listing", "menu", "ol", "pre", "ul"), self.endTagBlock), + (("address", "article", "aside", "blockquote", "center", "datagrid", + "details", "dialog", "dir", "div", "dl", "fieldset", "figure", + "footer", "header", "listing", "menu", "nav", "ol", "pre", "section", + "ul"), self.endTagBlock), ("form", self.endTagForm), + ("p",self.endTagP), (("dd", "dt", "li"), self.endTagListItem), (headingElements, self.endTagHeading), - (("a", "b", "big", "em", "font", "i", "nobr", "s", "small", + (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"), self.endTagFormatting), - (("marquee", "object", "button"), self.endTagButtonMarqueeObject), - (("head", "frameset", "select", "optgroup", "option", "table", - "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr", - "td", "th"), self.endTagMisplaced), + (("applet", "button", "marquee", "object"), self.endTagAppletButtonMarqueeObject), ("br", self.endTagBr), - (("area", "basefont", "bgsound", "embed", "hr", "image", - "img", "input", "isindex", "param", "spacer", "wbr", "frame"), - self.endTagNone), - (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"), - self.endTagCdataTextAreaXmp), - (("event-source", "section", "nav", "article", "aside", "header", - "footer", "datagrid", "command"), self.endTagNew) ]) self.endTagHandler.default = self.endTagOther # helper - def addFormattingElement(self, name, attributes): - self.tree.insertElement(name, attributes) + def addFormattingElement(self, token): + self.tree.insertElement(token) self.tree.activeFormattingElements.append( self.tree.openElements[-1]) # the real deal - def processSpaceCharactersDropNewline(self, data): - # Sometimes (start of and blocks) we want to drop - # leading newlines + def processEOF(self): + 
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + #Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of , , and blocks) we + # want to drop leading newlines + data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "textarea") and - not self.tree.openElements[-1].hasContent()): + self.tree.openElements[-1].name in ("pre", "listing", "textarea") + and not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() self.tree.insertText(data) - def processCharacters(self, data): + def processCharacters(self, token): # XXX The specification says to do this for every character at the # moment, but apparently that doesn't match the real world so we don't # do it for space characters. self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) + self.framesetOK = False #This matches the current spec but may not match the real world - def processSpaceCharacters(self, data): + def processSpaceCharacters(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) - def startTagProcessInHead(self, name, attributes): - self.parser.phases["inHead"].processStartTag(name, attributes) + def startTagProcessInHead(self, token): + self.parser.phases["inHead"].processStartTag(token) - def startTagTitle(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. 
Moved.") % (name,)) - self.parser.phases["inHead"].processStartTag(name, attributes) - - def startTagBody(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (body).")) + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: - for attr, value in attributes.iteritems(): + for attr, value in token["data"].iteritems(): if attr not in self.tree.openElements[1].attributes: self.tree.openElements[1].attributes[attr] = value - def startTagCloseP(self, name, attributes): + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagCloseP(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) - if name == "pre": + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + if token["name"] in ("pre", "listing"): + self.parser.framesetOK = False self.processSpaceCharacters = self.processSpaceCharactersDropNewline - def startTagForm(self, name, attributes): + def startTagForm(self, token): if self.tree.formPointer: - self.parser.parseError("Unexpected start tag (form). 
Ignored.") + self.parser.parseError(u"unexpected-start-tag", {"name": "form"}) else: if self.tree.elementInScope("p"): self.endTagP("p") - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.formPointer = self.tree.openElements[-1] - def startTagListItem(self, name, attributes): + def startTagListItem(self, token): + self.parser.framesetOK = False if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")} - stopName = stopNames[name] + stopName = stopNames[token["name"]] # AT Use reversed in Python 2.4... for i, node in enumerate(self.tree.openElements[::-1]): if node.name in stopName: @@ -820,251 +1003,340 @@ class InBodyPhase(Phase): poppedNodes.append(self.tree.openElements.pop()) if i >= 1: self.parser.parseError( - (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)")) - % u", ".join([item.name for item in poppedNodes[:-1]])) + i == 1 and "missing-end-tag" or "missing-end-tags", + {"name": u", ".join([item.name + for item + in poppedNodes[:-1]])}) break # Phrasing elements are all non special, non scoping, non # formatting elements - if (node.name in (specialElements | scopingElements) - and node.name not in ("address", "div")): + if (node.nameTuple in + (specialElements | scopingElements) + and node.name not in ("address", "div")): break # Always insert an element. 
- self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagPlaintext(self, name, attributes): + def startTagPlaintext(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"] - def startTagHeading(self, name, attributes): + def startTagHeading(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() # Uncomment the following for IE7 behavior: # #for item in headingElements: # if self.tree.elementInScope(item): - # self.parser.parseError(_(u"Unexpected start tag (" + name +\ - # ").")) + # self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) # item = self.tree.openElements.pop() # while item.name not in headingElements: # item = self.tree.openElements.pop() # break - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagA(self, name, attributes): + def startTagA(self, token): afeAElement = self.tree.elementInActiveFormattingElements("a") if afeAElement: - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'a', u'a')) - self.endTagFormatting("a") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) if afeAElement in self.tree.openElements: self.tree.openElements.remove(afeAElement) if afeAElement in self.tree.activeFormattingElements: self.tree.activeFormattingElements.remove(afeAElement) self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagFormatting(self, name, 
attributes): + def startTagFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagNobr(self, name, attributes): + def startTagNobr(self, token): self.tree.reconstructActiveFormattingElements() if self.tree.elementInScope("nobr"): - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'nobr', u'nobr')) - self.processEndTag("nobr") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) # XXX Need tests that trigger the following self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagButton(self, name, attributes): + def startTagButton(self, token): if self.tree.elementInScope("button"): - self.parser.parseError(_(u"Unexpected start tag (%s) implied " - u"end tag (%s).") % (u'button', u'button')) - self.processEndTag("button") - self.parser.phase.processStartTag(name, attributes) + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + self.parser.phase.processStartTag(token) else: self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagMarqueeObject(self, name, attributes): + def startTagAppletMarqueeObject(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagXmp(self, name, attributes): + def startTagXmp(self, token): self.tree.reconstructActiveFormattingElements() - 
self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") + self.parser.framesetOK = False - def startTagTable(self, name, attributes): - if self.tree.elementInScope("p"): - self.processEndTag("p") - self.tree.insertElement(name, attributes) + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False self.parser.phase = self.parser.phases["inTable"] - def startTagVoidFormatting(self, name, attributes): + def startTagVoidFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagHr(self, name, attributes): + def startTagHr(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagImage(self, name, attributes): + def startTagImage(self, token): # No really... - self.parser.parseError(_(u"Unexpected start tag (image). 
Treated " - u"as img.")) - self.processStartTag("img", attributes) + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) - def startTagInput(self, name, attributes): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - if self.tree.formPointer: - # XXX Not exactly sure what to do here - self.tree.openElements[-1].form = self.tree.formPointer - self.tree.openElements.pop() - - def startTagIsIndex(self, name, attributes): - self.parser.parseError("Unexpected start tag isindex. Don't use it!") + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) if self.tree.formPointer: return - self.processStartTag("form", {}) - self.processStartTag("hr", {}) - self.processStartTag("p", {}) - self.processStartTag("label", {}) + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Insert your search keywords here: " self.processCharacters( - "This is a searchable index. 
Insert your search keywords here: ") + {"type":tokenTypes["Characters"], "data":prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] attributes["name"] = "isindex" - attrs = [[key,value] for key,value in attributes.iteritems()] - self.processStartTag("input", dict(attrs)) - self.processEndTag("label") - self.processEndTag("p") - self.processStartTag("hr", {}) - self.processEndTag("form") + self.processStartTag(impliedTagToken("input", "StartTag", + attributes = attributes, + selfClosing = + token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) - def startTagTextarea(self, name, attributes): + def startTagTextarea(self, token): # XXX Form element pointer checking here as well... - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"] self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False - def startTagCdata(self, name, attributes): + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagCdata(token) + + def startTagCdata(self, token): """iframe, noembed noframes, noscript(if scripting enabled)""" - self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") - def startTagSelect(self, name, attributes): + def startTagOpt(self, token): + if self.tree.elementInScope("option"): + self.parser.phase.processEndTag(impliedTagToken("option")) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - self.parser.phase = self.parser.phases["inSelect"] + self.parser.tree.insertElement(token) - def startTagMisplaced(self, name, attributes): + def 
startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] + + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + while self.tree.openElements[-1].name != "ruby": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if 
token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMisplaced(self, token): """ Elements that should be children of other elements that have a different insertion mode; here they are ignored "caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "noscript" """ - self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - def startTagNew(self, name, attributes): + def startTagNew(self, token): """New HTML5 elements, "event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command" """ - sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) - self.startTagOther(name, attributes) + #2007-08-30 - MAP - commenting out this write to sys.stderr because + # it's really annoying me when I run the validator tests + #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) + self.startTagOther(token) #raise NotImplementedError - def startTagOther(self, name, attributes): + def startTagOther(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def endTagP(self, name): + def endTagP(self, token): if self.tree.elementInScope("p"): self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": - self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',)) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) if self.tree.elementInScope("p"): while self.tree.elementInScope("p"): self.tree.openElements.pop() else: - self.startTagCloseP("p", {}) - self.endTagP("p") + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.endTagP(impliedTagToken("p")) - def endTagBody(self, name): + def endTagBody(self, token): # XXX Need to take open tags into account here. 
We shouldn't imply # but we should not throw a parse error either. Specification is # likely to be updated. - if self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): # innerHTML case self.parser.parseError() return - if self.tree.openElements[-1].name != "body": - self.parser.parseError(_(u"Unexpected end tag (%s). Missing " - u"end tag (%s).") % (u'body', self.tree.openElements[-1].name)) + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "p", + "tbody", "td", "tfoot", + "th", "thead", "tr")): + #Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"expectedName": "body", "gotName": node.name}) + break self.parser.phase = self.parser.phases["afterBody"] - def endTagHtml(self, name): - self.endTagBody(name) + def endTagHtml(self, token): + self.endTagBody(impliedTagToken("body")) if not self.parser.innerHTML: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagBlock(self, name): + def endTagBlock(self, token): #Put us back in the right whitespace handling mode - if name == "pre": + if token["name"] == "pre": self.processSpaceCharacters = self.processSpaceCharactersNonPre - inScope = self.tree.elementInScope(name) + inScope = self.tree.elementInScope(token["name"]) if inScope: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) if inScope: node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagForm(self, name): - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (form) seen too early. Ignored.")) - else: - self.tree.openElements.pop() + def endTagForm(self, token): + node = self.tree.formPointer self.tree.formPointer = None + if node is None or not self.tree.elementInScope(token["name"]): + self.parser.parseError("unexpected-end-tag", + {"name":"form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) - def endTagListItem(self, name): + def endTagListItem(self, token): # AT Could merge this with the Block case - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags(name) + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags(token["name"]) - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagHeading(self, name): + def endTagHeading(self, token): for item in headingElements: if self.tree.elementInScope(item): self.tree.generateImpliedEndTags() break - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). " - u"Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) for item in headingElements: if self.tree.elementInScope(item): @@ -1073,38 +1345,37 @@ class InBodyPhase(Phase): item = self.tree.openElements.pop() break - def endTagFormatting(self, name): - """The much-feared adoption agency algorithm - """ + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # XXX Better parseError messages appreciated. 
+ name = token["name"] while True: # Step 1 paragraph 1 - afeElement = self.tree.elementInActiveFormattingElements(name) + afeElement = self.tree.elementInActiveFormattingElements( + token["name"]) if not afeElement or (afeElement in self.tree.openElements and not self.tree.elementInScope(afeElement.name)): - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.1", {"name": token["name"]}) return # Step 1 paragraph 2 elif afeElement not in self.tree.openElements: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) self.tree.activeFormattingElements.remove(afeElement) return # Step 1 paragraph 3 if afeElement != self.tree.openElements[-1]: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) # Step 2 # Start of the adoption agency algorithm proper afeIndex = self.tree.openElements.index(afeElement) furthestBlock = None for element in self.tree.openElements[afeIndex:]: - if element.name in specialElements | scopingElements: + if (element.nameTuple in + specialElements | scopingElements): furthestBlock = element break @@ -1118,17 +1389,17 @@ class InBodyPhase(Phase): commonAncestor = self.tree.openElements[afeIndex-1] # Step 5 - if furthestBlock.parent: - furthestBlock.parent.removeChild(furthestBlock) + #if furthestBlock.parent: + # furthestBlock.parent.removeChild(furthestBlock) - # Step 6 + # Step 5 # The bookmark is supposed to help us identify where to reinsert # nodes in step 12. We have to ensure that we reinsert nodes after # the node before the active formatting element. 
Note the bookmark # can move in step 7.4 bookmark = self.tree.activeFormattingElements.index(afeElement) - # Step 7 + # Step 6 lastNode = node = furthestBlock while True: # AT replace this with a function and recursion? @@ -1140,26 +1411,24 @@ class InBodyPhase(Phase): node = self.tree.openElements[ self.tree.openElements.index(node)-1] self.tree.openElements.remove(tmpNode) - # Step 7.3 + # Step 6.3 if node == afeElement: break - # Step 7.4 + # Step 6.4 if lastNode == furthestBlock: - # XXX should this be index(node) or index(node)+1 - # Anne: I think +1 is ok. Given x = [2,3,4,5] - # x.index(3) gives 1 and then x[1 +1] gives 4... - bookmark = self.tree.activeFormattingElements.\ - index(node) + 1 - # Step 7.5 - cite = node.parent - if node.hasContent(): - clone = node.cloneNode() - # Replace node with clone - self.tree.activeFormattingElements[ - self.tree.activeFormattingElements.index(node)] = clone - self.tree.openElements[ - self.tree.openElements.index(node)] = clone - node = clone + bookmark = (self.tree.activeFormattingElements.index(node) + + 1) + # Step 6.5 + #cite = node.parent + #if node.hasContent(): + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + # Step 7.6 # Remove lastNode from its parents, if any if lastNode.parent: @@ -1167,87 +1436,101 @@ class InBodyPhase(Phase): node.appendChild(lastNode) # Step 7.7 lastNode = node - # End of inner loop + # End of inner loop - # Step 8 + # Step 7 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster parent the + # lastNode if lastNode.parent: lastNode.parent.removeChild(lastNode) commonAncestor.appendChild(lastNode) - # Step 9 + # Step 8 clone = afeElement.cloneNode() - # Step 10 + # Step 9 furthestBlock.reparentChildren(clone) - # Step 11 + # Step 10 
furthestBlock.appendChild(clone) - # Step 12 + # Step 11 self.tree.activeFormattingElements.remove(afeElement) self.tree.activeFormattingElements.insert(bookmark, clone) - # Step 13 + # Step 12 self.tree.openElements.remove(afeElement) self.tree.openElements.insert( self.tree.openElements.index(furthestBlock) + 1, clone) - def endTagButtonMarqueeObject(self, name): - if self.tree.elementInScope(name): + def endTagAppletButtonMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): element = self.tree.openElements.pop() - while element.name != name: + while element.name != token["name"]: element = self.tree.openElements.pop() self.tree.clearActiveFormattingElements() - def endTagMisplaced(self, name): - # This handles elements with end tags in other insertion modes. - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagBr(self, name): - self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element.")) + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, {}) + self.tree.insertElement(impliedTagToken("br", "StartTag")) self.tree.openElements.pop() - def endTagNone(self, name): - # This handles elements with no end tag. 
- self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,)) - - def endTagCdataTextAreaXmp(self, name): - if self.tree.openElements[-1].name == name: - self.tree.openElements.pop() - else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagNew(self, name): - """New HTML5 elements, "event-source", "section", "nav", - "article", "aside", "header", "footer", "datagrid", "command" - """ - sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name) - self.endTagOther(name) - #raise NotImplementedError - - def endTagOther(self, name): - # XXX This logic should be moved into the treebuilder - # AT should use reversed instead of [::-1] when Python 2.4 == True. + def endTagOther(self, token): for node in self.tree.openElements[::-1]: - if node.name == name: + if node.name == token["name"]: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while self.tree.openElements.pop() != node: pass break else: - if node.name in specialElements | scopingElements: - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + if (node.nameTuple in + specialElements | scopingElements): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break +class InCDataRCDataPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([ + ("script", self.endTagScript)]) + self.endTagHandler.default = self.endTagOther + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + self.tree.openElements[-1].name) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + self.parser.phase.processEOF() + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in (R)CDATA mode"%name + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + #The rest of this method is all stuff that only happens if + #document.write works + + def endTagOther(self, token): + node = self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): @@ -1259,7 +1542,9 @@ class InTablePhase(Phase): ("col", self.startTagCol), (("tbody", "tfoot", "thead"), self.startTagRowGroup), (("td", "th", "tr"), self.startTagImplyTbody), - ("table", self.startTagTable) + ("table", self.startTagTable), + (("style", "script"), self.startTagStyleScript), + ("input", self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1274,66 +1559,101 @@ class InTablePhase(Phase): def clearStackToTableContext(self): # "clear the stack back to a table context" while self.tree.openElements[-1].name not in ("table", "html"): - 
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() # When the current node is it's an innerHTML case + def getCurrentTable(self): + i = -1 + while -i <= len(self.tree.openElements) and self.tree.openElements[i].name != "table": + i -= 1 + if -i > len(self.tree.openElements): + return self.tree.openElements[0] + else: + return self.tree.openElements[i] + # processing methods - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters in " - u"table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + #Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.characterTokens.append(token) + + def processCharacters(self, token): + #If we get here there must be at least one non-whitespace character + # Do the table magic! 
self.tree.insertFromTable = True - # Process the character in the "in body" mode - self.parser.phases["inBody"].processCharacters(data) + self.parser.phases["inBody"].processCharacters(token) self.tree.insertFromTable = False - def startTagCaption(self, name, attributes): + def startTagCaption(self, token): self.clearStackToTableContext() self.tree.activeFormattingElements.append(Marker) - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCaption"] - def startTagColgroup(self, name, attributes): + def startTagColgroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inColumnGroup"] - def startTagCol(self, name, attributes): - self.startTagColgroup("colgroup", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagRowGroup(self, name, attributes): + def startTagRowGroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inTableBody"] - def startTagImplyTbody(self, name, attributes): - self.startTagRowGroup("tbody", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTable(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (table) in table " - u"phase. 
Implies end tag (table).")) - self.parser.phase.processEndTag("table") + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def startTagStyleScript(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + self.parser.parseError("unexpected-hidden-input-in-table") + self.tree.insertElement(token) + # XXX associate with form + self.tree.openElements.pop() + else: + self.startTagOther(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! self.tree.insertFromTable = True - # Process the start tag in the "in body" mode - self.parser.phases["inBody"].processStartTag(name, attributes) + self.parser.phases["inBody"].processStartTag(token) self.tree.insertFromTable = False - def endTagTable(self, name): + def endTagTable(self, token): if self.tree.elementInScope("table", True): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": - self.parser.parseError(_(u"Unexpected end tag (table). 
" - u"Expected end tag (%s).") % (self.tree.openElements[-1].name,)) + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1343,18 +1663,61 @@ class InTablePhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! 
self.tree.insertFromTable = True - # Process the end tag in the "in body" mode - self.parser.phases["inBody"].processEndTag(name) + self.parser.phases["inBody"].processEndTag(token) self.tree.insertFromTable = False +class InTableTextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any([item not in spaceCharacters for item in data]): + token = {"type":tokenTypes["Characters"], "data":data} + self.originalPhase.processCharacters(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processComment(token) + + def processEOF(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEOF(token) + + def processCharacters(self, token): + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + #pretty sure we should never reach here + self.characterTokens.append(token) +# assert False + + def processStartTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processStartTag(token) + + def processEndTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEndTag(token) + class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption @@ -1379,27 +1742,31 @@ class InCaptionPhase(Phase): def ignoreEndTagCaption(self): return not self.tree.elementInScope("caption", True) - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() - def startTagTableElement(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) + + def 
startTagTableElement(self, token): self.parser.parseError() #XXX Have to duplicate logic here to find out if the tag is ignored ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) - def endTagCaption(self, name): + def endTagCaption(self, token): if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": - self.parser.parseError(_(u"Unexpected end tag (caption). " - u"Missing end tags.")) + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1410,18 +1777,18 @@ class InCaptionPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): self.parser.parseError() ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) class InColumnGroupPhase(Phase): @@ -1445,23 +1812,33 @@ class InColumnGroupPhase(Phase): def ignoreEndTagColgroup(self): return self.tree.openElements[-1].name == "html" - def processCharacters(self, data): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup("colgroup") - if not ignoreEndTag: - self.parser.phase.processCharacters(data) + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processEOF() - def startTagCol(self, name ,attributes): - self.tree.insertElement(name, attributes) + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + self.parser.phase.processCharacters(token) + + def startTagCol(self, token): + self.tree.insertElement(token) self.tree.openElements.pop() - def startTagOther(self, name, attributes): + def startTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def endTagColgroup(self, name): + def endTagColgroup(self, token): if self.ignoreEndTagColgroup(): # innerHTML case assert self.parser.innerHTML @@ -1470,15 +1847,14 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] - def endTagCol(self, name): - self.parser.parseError(_(u"Unexpected end tag (col). 
" - u"col has no end tag.")) + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) - def endTagOther(self, name): + def endTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) class InTableBodyPhase(Phase): @@ -1489,7 +1865,8 @@ class InTableBodyPhase(Phase): ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther) + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther @@ -1505,62 +1882,76 @@ class InTableBodyPhase(Phase): def clearStackToTableBodyContext(self): while self.tree.openElements[-1].name not in ("tbody", "tfoot", "thead", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML # the rest - def processCharacters(self,data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTr(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inTable"].processCharacters(token) + + def startTagTr(self, token): self.clearStackToTableBodyContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inRow"] - def startTagTableCell(self, name, attributes): - self.parser.parseError(_(u"Unexpected table cell 
start tag (%s) in the table body phase.") % (name,)) - self.startTagTr("tr", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): # XXX AT Any ideas on how to share this with endTagTable? if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processStartTag(name, attributes) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.clearStackToTableBodyContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] else: - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. 
Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagTable(self, name): + def endTagTable(self, token): if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processEndTag(name) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InRowPhase(Phase): @@ -1587,33 +1978,40 @@ class InRowPhase(Phase): # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): while self.tree.openElements[-1].name not in ("tr", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") % (self.tree.openElements[-1].name,)) + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() def ignoreEndTagTr(self): return not self.tree.elementInScope("tr", tableVariant=True) # the rest - def processCharacters(self, data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTableCell(self, name, attributes): + def processCharacters(self, token): + 
self.parser.phases["inTable"].processCharacters(token) + + def startTagTableCell(self, token): self.clearStackToTableRowContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCell"] self.tree.activeFormattingElements.append(Marker) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTr(self, name): + def endTagTr(self, token): if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() @@ -1623,27 +2021,28 @@ class InRowPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.endTagTr("tr") - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell @@ -1666,60 +2065,63 @@ class InCellPhase(Phase): # helper def closeCell(self): if self.tree.elementInScope("td", True): - self.endTagTableCell("td") + self.endTagTableCell(impliedTagToken("td")) elif self.tree.elementInScope("th", True): - self.endTagTableCell("th") + self.endTagTableCell(impliedTagToken("th")) # the rest - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) - def startTagTableOther(self, name, attributes): - if self.tree.elementInScope("td", True) or \ - self.tree.elementInScope("th", True): + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", True) or + self.tree.elementInScope("th", True)): self.closeCell() - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.startTagHandler.default =\ self.parser.phases["inBody"].processStartTag - def endTagTableCell(self, name): - if self.tree.elementInScope(name, True): - self.tree.generateImpliedEndTags(name) - if self.tree.openElements[-1].name != name: - self.parser.parseError("Got table cell end tag (" + name +\ - ") while required end tags are missing.") + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], True): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", + {"name": token["name"]}) while True: node = self.tree.openElements.pop() - if node.name == name: + if node.name == token["name"]: break else: self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inRow"] else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagImply(self, name): - if self.tree.elementInScope(name, True): + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], True): self.closeCell() - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # sometimes innerHTML case self.parser.parseError() - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.endTagHandler.default = self.parser.phases["inBody"].processEndTag @@ -1733,7 +2135,8 @@ class InSelectPhase(Phase): ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), - ("select", self.startTagSelect) + ("select", self.startTagSelect), + (("input", "keygen", "textarea"), self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1747,52 +2150,63 @@ class InSelectPhase(Phase): self.endTagHandler.default = self.endTagOther # http://www.whatwg.org/specs/web-apps/current-work/#in-select - def processCharacters(self, data): - self.tree.insertText(data) + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML - def startTagOption(self, name, attributes): + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def startTagOption(self, token): # We need to imply if is the current node. if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagOptgroup(self, name, attributes): + def startTagOptgroup(self, token): if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() if self.tree.openElements[-1].name == "optgroup": self.tree.openElements.pop() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagSelect(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (select) in the " - u"select phase implies select start tag.")) + def startTagSelect(self, token): + self.parser.parseError("unexpected-select-in-select") self.endTagSelect("select") - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag token (%s)" - u" in the select phase. 
Ignored.") % (name,)) + def startTagInput(self, token): + self.parser.parseError("unexpected-input-in-select") + if self.tree.elementInScope("select", True): + self.endTagSelect("select") + self.parser.phase.processStartTag(token) - def endTagOption(self, name): + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-select", + {"name": token["name"]}) + + def endTagOption(self, token): if self.tree.openElements[-1].name == "option": self.tree.openElements.pop() else: - self.parser.parseError(_(u"Unexpected end tag (%s) in the " - u"select phase. Ignored.") % u'option') + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "option"}) - def endTagOptgroup(self, name): + def endTagOptgroup(self, token): # implicitly closes - if self.tree.openElements[-1].name == "option" and \ - self.tree.openElements[-2].name == "optgroup": + if (self.tree.openElements[-1].name == "option" and + self.tree.openElements[-2].name == "optgroup"): self.tree.openElements.pop() # It also closes if self.tree.openElements[-1].name == "optgroup": self.tree.openElements.pop() # But nothing else else: - self.parser.parseError(_(u"Unexpected end tag (%s) in the " - u"select phase. 
Ignored.") % u'optgroup') + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "optgroup"}) - def endTagSelect(self, name): + def endTagSelect(self, token): if self.tree.elementInScope("select", True): node = self.tree.openElements.pop() while node.name != "select": @@ -1802,60 +2216,221 @@ class InSelectPhase(Phase): # innerHTML case self.parser.parseError() - def endTagTableElements(self, name): - self.parser.parseError(_(u"Unexpected table end tag (%s)" - u" in the select phase.") % (name,)) - if self.tree.elementInScope(name, True): + def endTagTableElements(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) + if self.tree.elementInScope(token["name"], True): self.endTagSelect("select") - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag token (%s)" - u" in the select phase. Ignored.") % (name,)) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) +class InSelectInTablePhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.startTagTable) + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + self.endTagTable) + ]) + self.endTagHandler.default = self.endTagOther + + def processEOF(self): + self.parser.phases["inSelect"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inSelect"].processCharacters(token) + + def startTagTable(self, token): + self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) + self.endTagOther(impliedTagToken("select")) + 
self.parser.phase.processStartTag(token) + + def startTagOther(self, token): + self.parser.phases["inSelect"].processStartTag(token) + + def endTagTable(self, token): + self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) + if self.tree.elementInScope(token["name"], tableVariant=True): + self.endTagOther(impliedTagToken("select")) + self.parser.phase.processEndTag(token) + + def endTagOther(self, token): + self.parser.phases["inSelect"].processEndTag(token) + + +class InForeignContentPhase(Phase): + breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", + "center", "code", "dd", "div", "dl", "dt", + "em", "embed", "font", "h1", "h2", "h3", + "h4", "h5", "h6", "head", "hr", "i", "img", + "li", "listing", "menu", "meta", "nobr", + "ol", "p", "pre", "ruby", "s", "small", + "span", "strong", "strike", "sub", "sup", + "table", "tt", "u", "ul", "var"]) + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + def nonHTMLElementInScope(self): + for element in self.tree.openElements[::-1]: + if element.namespace == self.tree.defaultNamespace: + return self.tree.elementInScope(element) + assert False + for item in self.tree.openElements[::-1]: + if item.namespace == self.tree.defaultNamespace: + return True + elif item.nameTuple in scopingElements: + return False + return False + + def adjustSVGTagNames(self, token): + replacements = {"altglyph":"altGlyph", + "altglyphdef":"altGlyphDef", + "altglyphitem":"altGlyphItem", + "animatecolor":"animateColor", + "animatemotion":"animateMotion", + "animatetransform":"animateTransform", + "clippath":"clipPath", + "feblend":"feBlend", + "fecolormatrix":"feColorMatrix", + "fecomponenttransfer":"feComponentTransfer", + "fecomposite":"feComposite", + "feconvolvematrix":"feConvolveMatrix", + "fediffuselighting":"feDiffuseLighting", + "fedisplacementmap":"feDisplacementMap", + "fedistantlight":"feDistantLight", + "feflood":"feFlood", + "fefunca":"feFuncA", 
+ "fefuncb":"feFuncB", + "fefuncg":"feFuncG", + "fefuncr":"feFuncR", + "fegaussianblur":"feGaussianBlur", + "feimage":"feImage", + "femerge":"feMerge", + "femergenode":"feMergeNode", + "femorphology":"feMorphology", + "feoffset":"feOffset", + "fepointlight":"fePointLight", + "fespecularlighting":"feSpecularLighting", + "fespotlight":"feSpotLight", + "fetile":"feTile", + "feturbulence":"feTurbulence", + "foreignobject":"foreignObject", + "glyphref":"glyphRef", + "lineargradient":"linearGradient", + "radialgradient":"radialGradient", + "textpath":"textPath"} + + if token["name"] in replacements: + token["name"] = replacements[token["name"]] + + def processCharacters(self, token): + self.parser.framesetOK = False + Phase.processCharacters(self, token) + + def processEOF(self): + pass + + def processStartTag(self, token): + currentNode = self.tree.openElements[-1] + if (currentNode.namespace == self.tree.defaultNamespace or + (currentNode.namespace == namespaces["mathml"] and + token["name"] not in frozenset(["mglyph", "malignmark"]) and + currentNode.name in frozenset(["mi", "mo", "mn", + "ms", "mtext"])) or + (currentNode.namespace == namespaces["mathml"] and + currentNode.name == "annotation-xml" and + token["name"] == "svg") or + (currentNode.namespace == namespaces["svg"] and + currentNode.name in frozenset(["foreignObject", + "desc", "title"]) + )): + assert self.parser.secondaryPhase != self + self.parser.secondaryPhase.processStartTag(token) + if self.parser.phase == self and self.nonHTMLElementInScope(): + self.parser.phase = self.parser.secondaryPhase + elif token["name"] in self.breakoutElements: + self.parser.parseError("unexpected-html-element-in-foreign-content", + token["name"]) + while (self.tree.openElements[-1].namespace != + self.tree.defaultNamespace): + self.tree.openElements.pop() + self.parser.phase = self.parser.secondaryPhase + self.parser.phase.processStartTag(token) + else: + if currentNode.namespace == namespaces["mathml"]: + 
self.parser.adjustMathMLAttributes(token) + elif currentNode.namespace == namespaces["svg"]: + self.adjustSVGTagNames(token) + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = currentNode.namespace + self.tree.insertElement(token) + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def processEndTag(self, token): + self.adjustSVGTagNames(token) + self.parser.secondaryPhase.processEndTag(token) + if self.parser.phase == self and self.nonHTMLElementInScope(): + self.parser.phase = self.parser.secondaryPhase + class AfterBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - # XXX We should prolly add a handler for here as well... + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) self.endTagHandler.default = self.endTagOther - def processComment(self, data): + def processEOF(self): + #Stop parsing + pass + + def processComment(self, token): # This is needed because data is to be appended to the element # here and not to whatever is currently open. 
- self.tree.insertComment(data, self.tree.openElements[0]) + self.tree.insertComment(token, self.tree.openElements[0]) - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters in the " - u"after body phase.")) + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-body") self.parser.phase = self.parser.phases["inBody"] - self.parser.phase.processCharacters(data) + self.parser.phase.processCharacters(token) - def processStartTag(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag token (%s)" - u" in the after body phase.") % (name,)) + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-body", + {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) def endTagHtml(self,name): if self.parser.innerHTML: - self.parser.parseError() + self.parser.parseError("unexpected-end-tag-after-body-innerhtml") else: - # XXX: This may need to be done, not sure: - # Don't set lastPhase to the current phase but to the inBody phase - # instead. No need for extra parse errors if there's something - # after . - # Try "XX" for instance. 
- self.parser.lastPhase = self.parser.phase - self.parser.phase = self.parser.phases["trailingEnd"] + self.parser.phase = self.parser.phases["afterAfterBody"] - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag token (%s)" - u" in the after body phase.") % (name,)) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-body", + {"name": token["name"]}) self.parser.phase = self.parser.phases["inBody"] - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) class InFramesetPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset @@ -1876,29 +2451,33 @@ class InFramesetPhase(Phase): ]) self.endTagHandler.default = self.endTagOther - def processCharacters(self, data): - self.parser.parseError(_(u"Unepxected characters in " - u"the frameset phase. Characters ignored.")) + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-frameset") + else: + assert self.parser.innerHTML - def startTagFrameset(self, name, attributes): - self.tree.insertElement(name, attributes) + def processCharacters(self, token): + self.parser.parseError("unexpected-char-in-frameset") - def startTagFrame(self, name, attributes): - self.tree.insertElement(name, attributes) + def startTagFrameset(self, token): + self.tree.insertElement(token) + + def startTagFrame(self, token): + self.tree.insertElement(token) self.tree.openElements.pop() - def startTagNoframes(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagNoframes(self, token): + self.parser.phases["inBody"].processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag token (%s)" - u" in the frameset phase. 
Ignored") % (name,)) + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-frameset", + {"name": token["name"]}) - def endTagFrameset(self, name): + def endTagFrameset(self, token): if self.tree.openElements[-1].name == "html": # innerHTML case - self.parser.parseError(_(u"Unexpected end tag token (frameset)" - u"in the frameset phase (innerHTML).")) + self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") else: self.tree.openElements.pop() if (not self.parser.innerHTML and @@ -1907,12 +2486,12 @@ class InFramesetPhase(Phase): # "frameset" element (anymore) then switch. self.parser.phase = self.parser.phases["afterFrameset"] - def endTagNoframes(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagNoframes(self, token): + self.parser.phases["inBody"].processEndTag(token) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag token (%s)" - u" in the frameset phase. Ignored.") % (name,)) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-frameset", + {"name": token["name"]}) class AfterFramesetPhase(Phase): @@ -1931,54 +2510,114 @@ class AfterFramesetPhase(Phase): ]) self.endTagHandler.default = self.endTagOther - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters in the " - u"after frameset phase. Ignored.")) + def processEOF(self): + #Stop parsing + pass - def startTagNoframes(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-frameset") - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s)" - u" in the after frameset phase. 
Ignored.") % (name,)) + def startTagNoframes(self, token): + self.parser.phases["inHead"].processStartTag(token) - def endTagHtml(self, name): - self.parser.lastPhase = self.parser.phase - self.parser.phase = self.parser.phases["trailingEnd"] + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-frameset", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s)" - u" in the after frameset phase. Ignored.") % (name,)) + def endTagHtml(self, token): + self.parser.phase = self.parser.phases["afterAfterFrameset"] + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-frameset", + {"name": token["name"]}) -class TrailingEndPhase(Phase): +class AfterAfterBodyPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml) + ]) + self.startTagHandler.default = self.startTagOther + def processEOF(self): pass - def processComment(self, data): - self.tree.insertComment(data, self.tree.document) + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) - def processSpaceCharacters(self, data): - self.parser.lastPhase.processSpaceCharacters(data) + def processSpaceCharacters(self, token): + self.parser.phases["inBody"].processSpaceCharacters(token) - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters. " - u"Expected end of file.")) - self.parser.phase = self.parser.lastPhase - self.parser.phase.processCharacters(data) + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processCharacters(token) - def processStartTag(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s)" - u". 
Expected end of file.") % (name,)) - self.parser.phase = self.parser.lastPhase - self.parser.phase.processStartTag(name, attributes) + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) - def processEndTag(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s)" - u". Expected end of file.") % (name,)) - self.parser.phase = self.parser.lastPhase - self.parser.phase.processEndTag(name) + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processStartTag(token) + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processEndTag(token) + +class AfterAfterFramesetPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + ("noframes", self.startTagNoFrames) + ]) + self.startTagHandler.default = self.startTagOther + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processCharacters(token) + + def startTagHtml(self, token): + self.parser.phases["inBody"].processStartTag(token) + + def startTagNoFrames(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processStartTag(token) + + def processEndTag(self, 
token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + self.parser.phase.processEndTag(token) + +def impliedTagToken(name, type="EndTag", attributes = None, + selfClosing = False): + if attributes is None: + attributes = {} + return {"type":tokenTypes[type], "name":name, "data":attributes, + "selfClosing":selfClosing} class ParseError(Exception): """Error in parsed document""" diff --git a/planet/vendor/html5lib/ihatexml.py b/planet/vendor/html5lib/ihatexml.py new file mode 100644 index 0000000..0803474 --- /dev/null +++ b/planet/vendor/html5lib/ihatexml.py @@ -0,0 +1,170 @@ +import re + +baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | 
#x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]""" + +ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]""" + +combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] 
| [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A""" + +digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]""" + +extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]""" + +letter = " | ".join([baseChar, ideographic]) + +#Without the +name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter, + extender]) 
+nameFirst = " | ".join([letter, "_"]) + +reChar = re.compile(r"#x([\d|A-F]{4,4})") +reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]") + +def charStringToList(chars): + charRanges = [item.strip() for item in chars.split(" | ")] + rv = [] + for item in charRanges: + foundMatch = False + for regexp in (reChar, reCharRange): + match = regexp.match(item) + if match is not None: + rv.append([hexToInt(item) for item in match.groups()]) + if len(rv[-1]) == 1: + rv[-1] = rv[-1]*2 + foundMatch = True + break + if not foundMatch: + assert len(item) == 1 + + rv.append([ord(item)] * 2) + rv = normaliseCharList(rv) + return rv + +def normaliseCharList(charList): + charList = sorted(charList) + for item in charList: + assert item[1] >= item[0] + rv = [] + i = 0 + while i < len(charList): + j = 1 + rv.append(charList[i]) + while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1: + rv[-1][1] = charList[i+j][1] + j += 1 + i += j + return rv + +#We don't really support characters above the BMP :( +max_unicode = int("FFFF", 16) + +def missingRanges(charList): + rv = [] + if charList[0] != 0: + rv.append([0, charList[0][0] - 1]) + for i, item in enumerate(charList[:-1]): + rv.append([item[1]+1, charList[i+1][0] - 1]) + if charList[-1][1] != max_unicode: + rv.append([charList[-1][1] + 1, max_unicode]) + return rv + +def listToRegexpStr(charList): + rv = [] + for item in charList: + if item[0] == item[1]: + rv.append(intToUnicodeStr(item[0])) + else: + rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1])) + return "[%s]"%"|".join(rv) + +def hexToInt(hex_str): + return int(hex_str, 16) + +def intToUnicodeStr(intValue): + #There must be a better (non-evil) way to do this + return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0"))) + +def escapeRegexp(string): + specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}", + "[", "]", "|", "(", ")", "-") + for char in specialCharacters: + string = string.replace(char, r"\\" + char) + if 
char in string: + print string + + return string + +#output from the above +nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|
\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]') + +class InfosetFilter(object): + replacementRegexp = re.compile(r"U[\dA-F]{5,5}") + def __init__(self, replaceChars = None, + replaceRanges = None, + dropXmlnsLocalName = False, + dropXmlnsAttrNs = False, + preventDoubleDashComments = False, + preventDashAtCommentEnd = False, + replaceFormFeedCharacters = True): + if replaceRanges is not None or replaceChars is not None: + raise NotImplementedError + else: + self.replaceCharsRegexp = nonXmlBMPRegexp + + self.dropXmlnsLocalName = dropXmlnsLocalName + self.dropXmlnsAttrNs = dropXmlnsAttrNs + + self.preventDoubleDashComments = preventDoubleDashComments + self.preventDashAtCommentEnd = preventDashAtCommentEnd + + self.replaceFormFeedCharacters = replaceFormFeedCharacters + + self.replaceCache = {} + + def 
coerceAttribute(self, name, namespace=None): + if self.dropXmlnsLocalName and name.startswith("xmlns:"): + #Need a datalosswarning here + return None + elif (self.dropXmlnsAttrNs and + namespace == "http://www.w3.org/2000/xmlns/"): + return None + else: + return self.toXmlName(name) + + def coerceElement(self, name, namespace=None): + return self.toXmlName(name) + + def coerceComment(self, data): + if self.preventDoubleDashComments: + while "--" in data: + data = data.replace("--", "- -") + return data + + def coerceCharacters(self, data): + if self.replaceFormFeedCharacters: + data = data.replace("\x0C", " ") + #Other non-xml characters + return data + + def toXmlName(self, name): + replaceChars = set(self.replaceCharsRegexp.findall(name)) + for char in replaceChars: + if char in self.replaceCache: + replacement = self.replaceCache[char] + else: + replacement = self.escapeChar(char) + name = name.replace(char, replacement) + return name + + def fromXmlName(self, name): + for item in set(self.replacementRegexp.findall(name)): + name = name.replace(item, self.unescapeChar(item)) + return name + + def escapeChar(self, char): + replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0") + self.replaceCache[char] = replacement + return replacement + + def unescapeChar(self, charcode): + return unichr(int(charcode[1:], 16)) diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py index b38979d..bec848f 100644 --- a/planet/vendor/html5lib/inputstream.py +++ b/planet/vendor/html5lib/inputstream.py @@ -1,15 +1,109 @@ import codecs import re import types - -from gettext import gettext -_ = gettext +import sys from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase -from constants import encodings -from utils import MethodDispatcher +from constants import encodings, ReparseException -class HTMLInputStream(object): +#Non-unicode versions of constants for use in the pre-parser +spaceCharactersBytes = frozenset([str(item) for 
item in spaceCharacters]) +asciiLettersBytes = frozenset([str(item) for item in asciiLetters]) +asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase]) +spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"]) + +invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, + 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, + 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, + 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, + 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, + 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, + 0x10FFFE, 0x10FFFF]) + +ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]") + +# Cache for charsUntil() +charsUntilRegEx = {} + +class BufferedStream: + """Buffering for streams that do not have buffering of their own + + The buffer is implemented as a list of chunks on the assumption that + joining many strings will be slow since it is O(n**2) + """ + + def __init__(self, stream): + self.stream = stream + self.buffer = [] + self.position = [-1,0] #chunk number, offset + + def tell(self): + pos = 0 + for chunk in self.buffer[:self.position[0]]: + pos += len(chunk) + pos += self.position[1] + return pos + + def seek(self, pos): + assert pos < self._bufferedBytes() + offset = pos + i = 0 + while len(self.buffer[i]) < offset: + offset -= pos + i += 1 + self.position = [i, offset] + + def read(self, bytes): + if not self.buffer: + return self._readStream(bytes) + elif (self.position[0] == len(self.buffer) and + self.position[1] == 
len(self.buffer[-1])): + return self._readStream(bytes) + else: + return self._readFromBuffer(bytes) + + def _bufferedBytes(self): + return sum([len(item) for item in self.buffer]) + + def _readStream(self, bytes): + data = self.stream.read(bytes) + self.buffer.append(data) + self.position[0] += 1 + self.position[1] = len(data) + return data + + def _readFromBuffer(self, bytes): + remainingBytes = bytes + rv = [] + bufferIndex = self.position[0] + bufferOffset = self.position[1] + while bufferIndex < len(self.buffer) and remainingBytes != 0: + assert remainingBytes > 0 + bufferedData = self.buffer[bufferIndex] + + if remainingBytes <= len(bufferedData) - bufferOffset: + bytesToRead = remainingBytes + self.position = [bufferIndex, bufferOffset + bytesToRead] + else: + bytesToRead = len(bufferedData) - bufferOffset + self.position = [bufferIndex, len(bufferedData)] + bufferIndex += 1 + data = rv.append(bufferedData[bufferOffset: + bufferOffset + bytesToRead]) + remainingBytes -= bytesToRead + + bufferOffset = 0 + + if remainingBytes: + rv.append(self._readStream(remainingBytes)) + + return "".join(rv) + + + +class HTMLInputStream: """Provides a unicode stream of characters to the HTMLTokenizer. This class takes care of character encoding and removing or replacing @@ -17,11 +111,13 @@ class HTMLInputStream(object): """ + _defaultChunkSize = 10240 + def __init__(self, source, encoding=None, parseMeta=True, chardet=True): """Initialises the HTMLInputStream. HTMLInputStream(source, [encoding]) -> Normalized stream from source - for use by the HTML5Lib. + for use by html5lib. source can be either a file-object, local filename or a string. 
@@ -33,10 +129,17 @@ class HTMLInputStream(object): parseMeta - Look for a element containing encoding information """ + + #Craziness + if len(u"\U0010FFFF") == 1: + self.reportCharacterErrors = self.characterErrorsUCS4 + else: + self.reportCharacterErrors = self.characterErrorsUCS2 + # List of where new lines occur self.newLines = [0] - self.charEncoding = encoding + self.charEncoding = (codecName(encoding), "certain") # Raw Stream - for unicode objects this will encode to utf-8 and set # self.charEncoding as appropriate @@ -52,17 +155,25 @@ class HTMLInputStream(object): self.defaultEncoding = "windows-1252" #Detect encoding iff no explicit "transport level" encoding is supplied - if self.charEncoding is None or not isValidEncoding(self.charEncoding): + if (self.charEncoding[0] is None): self.charEncoding = self.detectEncoding(parseMeta, chardet) - self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, - 'replace') - self.queue = [] + self.reset() + + def reset(self): + self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, + 'replace') + + self.chunk = u"" + self.chunkSize = 0 + self.chunkOffset = 0 self.errors = [] - self.line = self.col = 0 - self.lineLengths = [] + # number of (complete) lines in previous chunks + self.prevNumLines = 0 + # number of columns in the last line of the previous chunk + self.prevNumCols = 0 #Flag to indicate we may have a CR LF broken across a data chunk self._lastChunkEndsWithCR = False @@ -80,22 +191,29 @@ class HTMLInputStream(object): # Otherwise treat source as a string and convert to a file object if isinstance(source, unicode): source = source.encode('utf-8') - self.charEncoding = "utf-8" + self.charEncoding = ("utf-8", "certain") import cStringIO stream = cStringIO.StringIO(str(source)) + + if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or + stream is sys.stdin): + stream = BufferedStream(stream) + return stream def detectEncoding(self, parseMeta=True, chardet=True): - #First 
look for a BOM #This will also read past the BOM if present encoding = self.detectBOM() + confidence = "certain" #If there is no BOM need to look for meta elements with encoding #information if encoding is None and parseMeta: encoding = self.detectEncodingMeta() + confidence = "tentative" #Guess with chardet, if avaliable if encoding is None and chardet: + confidence = "tentative" try: from chardet.universaldetector import UniversalDetector buffers = [] @@ -108,11 +226,12 @@ class HTMLInputStream(object): detector.feed(buffer) detector.close() encoding = detector.result['encoding'] - self.seek("".join(buffers), 0) + self.rawStream.seek(0) except ImportError: pass # If all else fails use the default encoding if encoding is None: + confidence="tentative" encoding = self.defaultEncoding #Substitute for equivalent encodings: @@ -121,8 +240,22 @@ class HTMLInputStream(object): if encoding.lower() in encodingSub: encoding = encodingSub[encoding.lower()] - return encoding + return encoding, confidence + def changeEncoding(self, newEncoding): + newEncoding = codecName(newEncoding) + if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"): + newEncoding = "utf-8" + if newEncoding is None: + return + elif newEncoding == self.charEncoding[0]: + self.charEncoding = (self.charEncoding[0], "certain") + else: + self.rawStream.seek(0) + self.reset() + self.charEncoding = (newEncoding, "certain") + raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding) + def detectBOM(self): """Attempts to detect at BOM at the start of the stream. 
If an encoding can be determined from the BOM return the name of the @@ -149,198 +282,219 @@ class HTMLInputStream(object): # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream - self.seek(string, encoding and seek or 0) + self.rawStream.seek(encoding and seek or 0) return encoding - def seek(self, buffer, n): - """Unget buffer[n:]""" - if hasattr(self.rawStream, 'unget'): - self.rawStream.unget(buffer[n:]) - return - - if hasattr(self.rawStream, 'seek'): - try: - self.rawStream.seek(n) - return - except IOError: - pass - - class BufferedStream: - def __init__(self, data, stream): - self.data = data - self.stream = stream - def read(self, chars=-1): - if chars == -1 or chars > len(self.data): - result = self.data - self.data = '' - if chars == -1: - return result + self.stream.read() - else: - return result + self.stream.read(chars-len(result)) - elif not self.data: - return self.stream.read(chars) - else: - result = self.data[:chars] - self.data = self.data[chars:] - return result - def unget(self, data): - if self.data: - self.data += data - else: - self.data = data - - self.rawStream = BufferedStream(buffer[n:], self.rawStream) - def detectEncodingMeta(self): """Report the encoding declared by the meta element """ buffer = self.rawStream.read(self.numBytesMeta) parser = EncodingParser(buffer) - self.seek(buffer, 0) - return parser.getEncoding() + self.rawStream.seek(0) + encoding = parser.getEncoding() + + if encoding in ("utf-16", "utf-16-be", "utf-16-le"): + encoding = "utf-8" + + return encoding + + def _position(self, offset): + chunk = self.chunk + nLines = chunk.count(u'\n', 0, offset) + positionLine = self.prevNumLines + nLines + lastLinePos = chunk.rfind(u'\n', 0, offset) + if lastLinePos == -1: + positionColumn = self.prevNumCols + offset + else: + positionColumn = offset - (lastLinePos + 1) + return (positionLine, positionColumn) def position(self): """Returns (line, col) of the current position in the 
stream.""" - line, col = self.line, self.col - return (line + 1, col) + line, col = self._position(self.chunkOffset) + return (line+1, col) def char(self): """ Read one character from the stream or queue if available. Return EOF when EOF is reached. """ - if not self.queue: - self.readChunk() - #If we still don't have a character we have reached EOF - if not self.queue: - return EOF - - char = self.queue.pop(0) - - # update position in stream - if char == '\n': - self.lineLengths.append(self.col) - self.line += 1 - self.col = 0 - else: - self.col += 1 + # Read a new chunk from the input stream if necessary + if self.chunkOffset >= self.chunkSize: + if not self.readChunk(): + return EOF + + chunkOffset = self.chunkOffset + char = self.chunk[chunkOffset] + self.chunkOffset = chunkOffset + 1 + return char - def readChunk(self, chunkSize=10240): + def readChunk(self, chunkSize=None): + if chunkSize is None: + chunkSize = self._defaultChunkSize + + self.prevNumLines, self.prevNumCols = self._position(self.chunkSize) + + self.chunk = u"" + self.chunkSize = 0 + self.chunkOffset = 0 + data = self.dataStream.read(chunkSize) + if not data: - return - #Replace null characters - for i in xrange(data.count(u"\u0000")): - self.errors.append(_('null character found in input stream, ' - 'replaced with U+FFFD')) + return False + + self.reportCharacterErrors(data) + data = data.replace(u"\u0000", u"\ufffd") #Check for CR LF broken across chunks - if (self._lastChunkEndsWithCR and data[0] == "\n"): + if (self._lastChunkEndsWithCR and data[0] == u"\n"): data = data[1:] - self._lastChunkEndsWithCR = data[-1] == "\r" - data = data.replace("\r\n", "\n") - data = data.replace("\r", "\n") - - data = unicode(data) - self.queue.extend([char for char in data]) + # Stop if the chunk is now empty + if not data: + return False + self._lastChunkEndsWithCR = data[-1] == u"\r" + data = data.replace(u"\r\n", u"\n") + data = data.replace(u"\r", u"\n") + + self.chunk = data + self.chunkSize = 
len(data) + + return True + + def characterErrorsUCS4(self, data): + for i in xrange(data.count(u"\u0000")): + self.errors.append("null-character") + for i in xrange(len(invalid_unicode_re.findall(data))): + self.errors.append("invalid-codepoint") + + def characterErrorsUCS2(self, data): + #Someone picked the wrong compile option + #You lose + for i in xrange(data.count(u"\u0000")): + self.errors.append("null-character") + skip = False + import sys + for match in invalid_unicode_re.finditer(data): + if skip: + continue + codepoint = ord(match.group()) + pos = match.start() + #Pretty sure there should be endianness issues here + if (codepoint >= 0xD800 and codepoint <= 0xDBFF and + pos < len(data) - 1 and + ord(data[pos + 1]) >= 0xDC00 and + ord(data[pos + 1]) <= 0xDFFF): + #We have a surrogate pair! + #From a perl manpage + char_val = (0x10000 + (codepoint - 0xD800) * 0x400 + + (ord(data[pos + 1]) - 0xDC00)) + if char_val in non_bmp_invalid_codepoints: + self.errors.append("invalid-codepoint") + skip = True + elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and + pos == len(data) - 1): + self.errors.append("invalid-codepoint") + else: + skip = False + self.errors.append("invalid-codepoint") + #This is still wrong if it is possible for a surrogate pair to break a + #chunk boundary def charsUntil(self, characters, opposite = False): """ Returns a string of characters from the stream up to but not - including any character in characters or EOF. characters can be - any container that supports the in method being called on it. + including any character in 'characters' or EOF. 'characters' must be + a container that supports the 'in' method and iteration over its + characters. 
""" - #This method is currently 40-50% of our total runtime and badly needs - #optimizing - #Possible improvements: - # - use regexp to find characters that match the required character set - # (with regexp cache since we do the same searches many many times) - # - improve EOF handling for fewer if statements + # Use a cache of regexps to find the required characters + try: + chars = charsUntilRegEx[(characters, opposite)] + except KeyError: + if __debug__: + for c in characters: + assert(ord(c) < 128) + regex = u"".join([u"\\x%02x" % ord(c) for c in characters]) + if not opposite: + regex = u"^%s" % regex + chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex) - if not self.queue: - self.readChunk() - #Break if we have reached EOF - if not self.queue or self.queue[0] == None: - return u"" - - i = 0 - while (self.queue[i] in characters) == opposite: - i += 1 - if i == len(self.queue): - self.readChunk() - #If the queue doesn't grow we have reached EOF - if i == len(self.queue) or self.queue[i] is EOF: - break - #XXX- wallpaper over bug in calculation below - #Otherwise change the stream position - if self.queue[i] == '\n': - self.lineLengths.append(self.col) - self.line += 1 - self.col = 0 + rv = [] + + while True: + # Find the longest matching prefix + m = chars.match(self.chunk, self.chunkOffset) + if m is None: + # If nothing matched, and it wasn't because we ran out of chunk, + # then stop + if self.chunkOffset != self.chunkSize: + break else: - self.col += 1 + end = m.end() + # If not the whole chunk matched, return everything + # up to the part that didn't match + if end != self.chunkSize: + rv.append(self.chunk[self.chunkOffset:end]) + self.chunkOffset = end + break + # If the whole remainder of the chunk matched, + # use it all and read the next chunk + rv.append(self.chunk[self.chunkOffset:]) + if not self.readChunk(): + # Reached EOF + break - rv = u"".join(self.queue[:i]) - self.queue = self.queue[i:] - - #Calculate where we now 
are in the stream - #One possible optimisation would be to store all read characters and - #Calculate this on an as-needed basis (perhaps flushing the read data - #every time we read a new chunk) rather than once per call here and - #in .char() - - #XXX Temporarily disable this because there is a bug - - #lines = rv.split("\n") - # - #if lines: - # #Add number of lines passed onto positon - # oldCol = self.col - # self.line += len(lines)-1 - # if len(lines) > 1: - # self.col = len(lines[-1]) - # else: - # self.col += len(lines[0]) - # - # if self.lineLengths and oldCol > 0: - # self.lineLengths[-1] += len(lines[0]) - # lines = lines[1:-1] - # else: - # lines = lines[:-1] - # - # for line in lines: - # self.lineLengths.append(len(line)) - # - - return rv + r = u"".join(rv) + return r - def unget(self, chars): - if chars: - self.queue = list(chars) + self.queue - #Alter the current line, col position - for c in chars[::-1]: - if c == '\n': - self.line -= 1 - self.col = self.lineLengths[self.line] - else: - self.col -= 1 + def unget(self, char): + # Only one character is allowed to be ungotten at once - it must + # be consumed again before any further call to unget + + if char is not None: + if self.chunkOffset == 0: + # unget is called quite rarely, so it's a good idea to do + # more work here if it saves a bit of work in the frequently + # called char and charsUntil. 
+ # So, just prepend the ungotten character onto the current + # chunk: + self.chunk = char + self.chunk + self.chunkSize += 1 + else: + self.chunkOffset -= 1 + assert self.chunk[self.chunkOffset] == char class EncodingBytes(str): - """String-like object with an assosiated position and various extra methods + """String-like object with an associated position and various extra methods If the position is ever greater than the string length then an exception is raised""" + def __new__(self, value): + return str.__new__(self, value) + def __init__(self, value): - str.__init__(self, value) self._position=-1 def __iter__(self): return self def next(self): - self._position += 1 - rv = self[self.position] - return rv + p = self._position = self._position + 1 + if p >= len(self): + raise StopIteration + elif p < 0: + raise TypeError + return self[p] + + def previous(self): + p = self._position + if p >= len(self): + raise StopIteration + elif p < 0: + raise TypeError + self._position = p = p - 1 + return self[p] def setPosition(self, position): if self._position >= len(self): @@ -362,20 +516,39 @@ class EncodingBytes(str): currentByte = property(getCurrentByte) - def skip(self, chars=spaceCharacters): + def skip(self, chars=spaceCharactersBytes): """Skip past a list of characters""" - while self.currentByte in chars: - self.position += 1 + p = self.position # use property for the error-checking + while p < len(self): + c = self[p] + if c not in chars: + self._position = p + return c + p += 1 + self._position = p + return None + + def skipUntil(self, chars): + p = self.position + while p < len(self): + c = self[p] + if c in chars: + self._position = p + return c + p += 1 + self._position = p + return None def matchBytes(self, bytes, lower=False): """Look for a sequence of bytes at the start of a string. If the bytes are found return True and advance the position to the byte after the match. 
Otherwise return False and leave the position alone""" - data = self[self.position:self.position+len(bytes)] + p = self.position + data = self[p:p+len(bytes)] if lower: data = data.lower() rv = data.startswith(bytes) - if rv == True: + if rv: self.position += len(bytes) return rv @@ -388,12 +561,6 @@ class EncodingBytes(str): return True else: raise StopIteration - - def findNext(self, byteList): - """Move the pointer so it points to the next byte in a set of possible - bytes""" - while (self.currentByte not in byteList): - self.position += 1 class EncodingParser(object): """Mini parser for detecting character encoding from meta elements""" @@ -423,8 +590,7 @@ class EncodingParser(object): break if not keepParsing: break - if self.encoding is not None: - self.encoding = self.encoding.strip() + return self.encoding def handleComment(self): @@ -432,7 +598,7 @@ class EncodingParser(object): return self.data.jumpTo("-->") def handleMeta(self): - if self.data.currentByte not in spaceCharacters: + if self.data.currentByte not in spaceCharactersBytes: #if we have "]) - if self.data.currentByte == "<": + c = data.skipUntil(spacesAngleBrackets) + if c == "<": #return to the first step in the overall "two step" algorithm #reprocessing the < byte - self.data.position -= 1 + data.previous() else: #Read all attributes attr = self.getAttribute() @@ -489,73 +658,75 @@ class EncodingParser(object): def getAttribute(self): """Return a name,value pair for the next attribute in the stream, if one is found, or None""" - self.data.skip(list(spaceCharacters)+["/"]) - if self.data.currentByte == "<": - self.data.position -= 1 + data = self.data + c = data.skip(spaceCharactersBytes | frozenset("/")) + if c == "<": + data.previous() return None - elif self.data.currentByte == ">": + elif c == ">" or c is None: return None attrName = [] attrValue = [] spaceFound = False #Step 5 attribute name while True: - if self.data.currentByte == "=" and attrName: + if c == "=" and attrName: break - 
elif self.data.currentByte in spaceCharacters: + elif c in spaceCharactersBytes: spaceFound=True break - elif self.data.currentByte in ("/", "<", ">"): + elif c in ("/", "<", ">"): return "".join(attrName), "" - elif self.data.currentByte in asciiUppercase: - attrName.extend(self.data.currentByte.lower()) + elif c in asciiUppercaseBytes: + attrName.append(c.lower()) else: - attrName.extend(self.data.currentByte) + attrName.append(c) #Step 6 - self.data.position += 1 + c = data.next() #Step 7 if spaceFound: - self.data.skip() + c = data.skip() #Step 8 - if self.data.currentByte != "=": - self.data.position -= 1 + if c != "=": + data.previous() return "".join(attrName), "" #XXX need to advance position in both spaces and value case #Step 9 - self.data.position += 1 + data.next() #Step 10 - self.data.skip() + c = data.skip() #Step 11 - if self.data.currentByte in ("'", '"'): + if c in ("'", '"'): #11.1 - quoteChar = self.data.currentByte + quoteChar = c while True: - self.data.position+=1 #11.3 - if self.data.currentByte == quoteChar: - self.data.position += 1 + c = data.next() + if c == quoteChar: + data.next() return "".join(attrName), "".join(attrValue) #11.4 - elif self.data.currentByte in asciiUppercase: - attrValue.extend(self.data.currentByte.lower()) + elif c in asciiUppercaseBytes: + attrValue.append(c.lower()) #11.5 else: - attrValue.extend(self.data.currentByte) - elif self.data.currentByte in (">", '<'): - return "".join(attrName), "" - elif self.data.currentByte in asciiUppercase: - attrValue.extend(self.data.currentByte.lower()) + attrValue.append(c) + elif c in (">", "<"): + return "".join(attrName), "" + elif c in asciiUppercaseBytes: + attrValue.append(c.lower()) + elif c is None: + return None else: - attrValue.extend(self.data.currentByte) + attrValue.append(c) while True: - self.data.position +=1 - if self.data.currentByte in ( - list(spaceCharacters) + [">", '<']): + c = data.next() + if c in spacesAngleBrackets: return "".join(attrName), 
"".join(attrValue) - elif self.data.currentByte in asciiUppercase: - attrValue.extend(self.data.currentByte.lower()) + elif c in asciiUppercaseBytes: + attrValue.append(c.lower()) else: - attrValue.extend(self.data.currentByte) + attrValue.append(c) class ContentAttrParser(object): @@ -588,7 +759,7 @@ class ContentAttrParser(object): #Unquoted value oldPosition = self.data.position try: - self.data.findNext(spaceCharacters) + self.data.skipUntil(spaceCharactersBytes) return self.data[oldPosition:self.data.position] except StopIteration: #Return the whole remaining value @@ -596,7 +767,12 @@ class ContentAttrParser(object): except StopIteration: return None -def isValidEncoding(encoding): - """Determine if a string is a supported encoding""" - return (encoding is not None and type(encoding) == types.StringType and - encoding.lower().strip() in encodings) + +def codecName(encoding): + """Return the python codec name corresponding to an encoding or None if the + string doesn't correspond to a valid encoding.""" + if (encoding is not None and type(encoding) in types.StringTypes): + canonicalName = ascii_punctuation_re.sub("", encoding).lower() + return encodings.get(canonicalName, None) + else: + return None diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py deleted file mode 100644 index 89e9f00..0000000 --- a/planet/vendor/html5lib/liberalxmlparser.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Warning: this module is experimental and subject to change and even removal -at any time. 
- -For background/rationale, see: - * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib - * http://tinyurl.com/ylfj8k (and follow-ups) - -References: - * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html - * http://wiki.whatwg.org/wiki/HtmlVsXhtml - -@@TODO: - * Selectively lowercase only XHTML, but not foreign markup -""" - -import html5parser -from constants import voidElements, contentModelFlags - -from xml.dom import XHTML_NAMESPACE -from xml.sax.saxutils import unescape - -class XMLParser(html5parser.HTMLParser): - """ liberal XML parser """ - - def __init__(self, *args, **kwargs): - html5parser.HTMLParser.__init__(self, *args, **kwargs) - - self.phases["initial"] = XmlRootPhase(self, self.tree) - - def normalizeToken(self, token): - - if token["type"] in ("StartTag", "EmptyTag"): - token["data"] = dict(token["data"][::-1]) - - # For EmptyTags, process both a Start and an End tag - if token["type"] == "EmptyTag": - save = self.tokenizer.contentModelFlag - self.phase.processStartTag(token["name"], token["data"]) - self.tokenizer.contentModelFlag = save - token["data"] = {} - token["type"] = "EndTag" - - elif token["type"] == "Characters": - # un-escape rcdataElements (e.g. 
style, script) - if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]: - token["data"] = unescape(token["data"]) - - elif token["type"] == "Comment": - # Rescue CDATA from the comments - if (token["data"].startswith("[CDATA[") and - token["data"].endswith("]]")): - token["type"] = "Characters" - token["data"] = token["data"][7:-2] - - return token - - def _parse(self, stream, innerHTML=False, container="div", encoding=None, - **kwargs): - - html5parser.HTMLParser._parse(self, stream, innerHTML, container, - encoding, lowercaseElementName=False, - lowercaseAttrName=False) - -class XHTMLParser(XMLParser): - """ liberal XMTHML parser """ - - def __init__(self, *args, **kwargs): - html5parser.HTMLParser.__init__(self, *args, **kwargs) - self.phases["initial"] = XmlInitialPhase(self, self.tree) - self.phases["rootElement"] = XhmlRootPhase(self, self.tree) - - def normalizeToken(self, token): - token = XMLParser.normalizeToken(self, token) - - # ensure that non-void XHTML elements have content so that separate - # open and close tags are emitted - if token["type"] == "EndTag": - if token["name"] in voidElements: - if not self.tree.openElements or \ - self.tree.openElements[-1].name != token["name"]: - token["type"] = "EmptyTag" - if not token.has_key("data"): token["data"] = {} - else: - if token["name"] == self.tree.openElements[-1].name and \ - not self.tree.openElements[-1].hasContent(): - for e in self.tree.openElements: - if 'xmlns' in e.attributes.keys(): - if e.attributes['xmlns'] != XHTML_NAMESPACE: - break - else: - self.tree.insertText('') - - return token - -class XhmlRootPhase(html5parser.RootElementPhase): - def insertHtmlElement(self): - element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'}) - self.tree.openElements.append(element) - self.tree.document.appendChild(element) - self.parser.phase = self.parser.phases["beforeHead"] - -class XmlInitialPhase(html5parser.InitialPhase): - """ Consume XML Prologs """ - def 
processComment(self, data): - if not data.startswith('?xml') or not data.endswith('?'): - html5parser.InitialPhase.processComment(self, data) - -class XmlRootPhase(html5parser.Phase): - """ Consume XML Prologs """ - def processComment(self, data): - print repr(data) - if not data.startswith('?xml') or not data.endswith('?'): - html5parser.InitialPhase.processComment(self, data) - - """ Prime the Xml parser """ - def __getattr__(self, name): - self.tree.openElements.append(self.tree.document) - self.parser.phase = XmlElementPhase(self.parser, self.tree) - return getattr(self.parser.phase, name) - -class XmlElementPhase(html5parser.Phase): - """ Generic handling for all XML elements """ - - def __init__(self, *args, **kwargs): - html5parser.Phase.__init__(self, *args, **kwargs) - self.startTagHandler = html5parser.utils.MethodDispatcher([]) - self.startTagHandler.default = self.startTagOther - self.endTagHandler = html5parser.utils.MethodDispatcher([]) - self.endTagHandler.default = self.endTagOther - - def startTagOther(self, name, attributes): - element = self.tree.createElement(name, attributes) - self.tree.openElements[-1].appendChild(element) - self.tree.openElements.append(element) - - def endTagOther(self, name): - for node in self.tree.openElements[::-1]: - if node.name == name: - while self.tree.openElements.pop() != node: - pass - break - else: - self.parser.parseError() - - def processCharacters(self, data): - self.tree.insertText(data) diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py index ccbc16b..79e358f 100644 --- a/planet/vendor/html5lib/sanitizer.py +++ b/planet/vendor/html5lib/sanitizer.py @@ -1,6 +1,8 @@ import re from xml.sax.saxutils import escape, unescape + from tokenizer import HTMLTokenizer +from constants import tokenTypes class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object): svg_elements = ['a', 
'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', - 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', + 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] @@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object): 'arabic-form', 'ascent', 'attributeName', 'attributeType', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', - 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', - 'font-family', 'font-size', 'font-stretch', 'font-style', + 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', + 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', @@ -82,6 +84,13 @@ class HTMLSanitizerMixin(object): attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'xlink:href', 'xml:base'] + + svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', + 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke'] + + svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion', + 'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern', + 'radialGradient', 'textpath', 'tref', 'set', 'use'] acceptable_css_properties = ['azimuth', 'background-color', 'border-bottom-color', 'border-collapse', 'border-color', @@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object): # sanitize_html('Click here for $100') # => Click here for $100 def sanitize_token(self, token): - if token["type"] in ["StartTag", "EndTag", "EmptyTag"]: + if 
token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"]): if token["name"] in self.allowed_elements: if token.has_key("data"): - attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes]) + attrs = dict([(name,val) for name,val in + token["data"][::-1] + if name in self.allowed_attributes]) for attr in self.attr_val_is_uri: - if not attrs.has_key(attr): continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() - if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols): + if not attrs.has_key(attr): + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and + (val_unescaped.split(':')[0] not in + self.allowed_protocols)): del attrs[attr] + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + if (token["name"] in self.svg_allow_local_href and + 'xlink:href' in attrs and re.search('^\s*[^#\s].*', + attrs['xlink:href'])): + del attrs['xlink:href'] if attrs.has_key('style'): attrs['style'] = self.sanitize_css(attrs['style']) token["data"] = [[name,val] for name,val in attrs.items()] return token else: - if token["type"] == "EndTag": + if token["type"] == tokenTypes["EndTag"]: token["data"] = "%s>" % token["name"] elif token["data"]: attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) token["data"] = "<%s%s>" % (token["name"],attrs) else: token["data"] = "<%s>" % token["name"] - if token["type"] == "EmptyTag": + if token["type"] == tokenTypes["EmptyTag"]: token["data"]=token["data"][:-1] + "/>" - token["type"] = "Characters" + token["type"] = tokenTypes["Characters"] del token["name"] return token - elif token["type"] == "Comment": + elif token["type"] == tokenTypes["Comment"]: pass else: 
return token @@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object): # gauntlet if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' - if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' + if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): if not value: continue if prop.lower() in self.allowed_css_properties: clean.append(prop + ': ' + value + ';') - elif prop.split('-')[0].lower() in ['background','border','margin','padding']: + elif prop.split('-')[0].lower() in ['background','border','margin', + 'padding']: for keyword in value.split(): if not keyword in self.acceptable_css_keywords and \ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): @@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object): return ' '.join(clean) class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): - def __init__(self, stream, encoding=None, parseMeta=True, + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, lowercaseElementName=False, lowercaseAttrName=False): #Change case matching defaults as we only output lowercase html anyway #This solution doesn't seem ideal... 
- HTMLTokenizer.__init__(self, stream, encoding, parseMeta, + HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, lowercaseElementName, lowercaseAttrName) def __iter__(self): diff --git a/planet/vendor/html5lib/serializer/__init__.py b/planet/vendor/html5lib/serializer/__init__.py index c0030f2..1b74665 100644 --- a/planet/vendor/html5lib/serializer/__init__.py +++ b/planet/vendor/html5lib/serializer/__init__.py @@ -1,3 +1,17 @@ +from html5lib import treewalkers + from htmlserializer import HTMLSerializer from xhtmlserializer import XHTMLSerializer + +def serialize(input, tree="simpletree", format="html", encoding=None, + **serializer_opts): + # XXX: Should we cache this? + walker = treewalkers.getTreeWalker(tree) + if format == "html": + s = HTMLSerializer(**serializer_opts) + elif format == "xhtml": + s = XHTMLSerializer(**serializer_opts) + else: + raise ValueError, "type must be either html or xhtml" + return s.render(walker(input), encoding) diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py index c5d6c51..a2e2f45 100644 --- a/planet/vendor/html5lib/serializer/htmlserializer.py +++ b/planet/vendor/html5lib/serializer/htmlserializer.py @@ -147,7 +147,7 @@ class HTMLSerializer(object): quote_attr = True else: quote_attr = reduce(lambda x,y: x or (y in v), - spaceCharacters + "<>\"'", False) + spaceCharacters + ">\"'=", False) v = v.replace("&", "&") if self.escape_lt_in_attrs: v = v.replace("<", "<") if encoding: diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py index 31f8494..d884782 100644 --- a/planet/vendor/html5lib/tokenizer.py +++ b/planet/vendor/html5lib/tokenizer.py @@ -4,17 +4,25 @@ except NameError: # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset -import gettext -_ = gettext.gettext - +try: + from collections import deque +except ImportError: + from utils import 
deque + from constants import contentModelFlags, spaceCharacters from constants import entitiesWindows1252, entities from constants import asciiLowercase, asciiLetters, asciiUpper2Lower from constants import digits, hexDigits, EOF +from constants import tokenTypes, tagTokenTypes from inputstream import HTMLInputStream -class HTMLTokenizer(object): +# Group entities by their first character, for faster lookups +entitiesByFirstChar = {} +for e in entities: + entitiesByFirstChar.setdefault(e[0], []).append(e) + +class HTMLTokenizer: """ This class takes care of tokenizing HTML. * self.currentToken @@ -23,70 +31,31 @@ class HTMLTokenizer(object): * self.state Holds a reference to the method to be invoked... XXX - * self.states - Holds a mapping between states and methods that implement the state. - * self.stream Points to HTMLInputStream object. """ # XXX need to fix documentation - def __init__(self, stream, encoding=None, parseMeta=True, - lowercaseElementName=True, lowercaseAttrName=True,): - self.stream = HTMLInputStream(stream, encoding, parseMeta) + def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, + lowercaseElementName=True, lowercaseAttrName=True): + + self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) #Perform case conversions? 
self.lowercaseElementName = lowercaseElementName self.lowercaseAttrName = lowercaseAttrName - self.states = { - "data":self.dataState, - "entityData":self.entityDataState, - "tagOpen":self.tagOpenState, - "closeTagOpen":self.closeTagOpenState, - "tagName":self.tagNameState, - "beforeAttributeName":self.beforeAttributeNameState, - "attributeName":self.attributeNameState, - "afterAttributeName":self.afterAttributeNameState, - "beforeAttributeValue":self.beforeAttributeValueState, - "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState, - "attributeValueSingleQuoted":self.attributeValueSingleQuotedState, - "attributeValueUnQuoted":self.attributeValueUnQuotedState, - "bogusComment":self.bogusCommentState, - "markupDeclarationOpen":self.markupDeclarationOpenState, - "commentStart":self.commentStartState, - "commentStartDash":self.commentStartDashState, - "comment":self.commentState, - "commentEndDash":self.commentEndDashState, - "commentEnd":self.commentEndState, - "doctype":self.doctypeState, - "beforeDoctypeName":self.beforeDoctypeNameState, - "doctypeName":self.doctypeNameState, - "afterDoctypeName":self.afterDoctypeNameState, - "beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState, - "doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState, - "doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState, - "afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState, - "beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState, - "doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState, - "doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState, - "afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState, - "bogusDoctype":self.bogusDoctypeState - } - # Setup the initial tokenizer state self.contentModelFlag = contentModelFlags["PCDATA"] self.escapeFlag = False self.lastFourChars = [] - self.state = 
self.states["data"] + self.state = self.dataState + self.escape = False # The current token being created self.currentToken = None - # Tokens to be processed. - self.tokenQueue = [] - def __iter__(self): """ This is where the magic happens. @@ -94,43 +63,21 @@ class HTMLTokenizer(object): to return we yield the token which pauses processing until the next token is requested. """ - self.tokenQueue = [] + self.tokenQueue = deque([]) # Start processing. When EOF is reached self.state will return False # instead of True and the loop will terminate. while self.state(): while self.stream.errors: - yield {"type": "ParseError", "data": self.stream.errors.pop(0)} + yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} while self.tokenQueue: - yield self.tokenQueue.pop(0) - - # Below are various helper functions the tokenizer states use worked out. - def processSolidusInTag(self): - """If the next character is a '>', convert the currentToken into - an EmptyTag - """ - - # We need to consume another character to make sure it's a ">" - data = self.stream.char() - - if self.currentToken["type"] == "StartTag" and data == u">": - self.currentToken["type"] = "EmptyTag" - else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Solidus (/) incorrectly placed in tag.")}) - - # The character we just consumed need to be put back on the stack so it - # doesn't get lost... - self.stream.unget(data) + yield self.tokenQueue.popleft() def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. - If not present self.tokenQueue.append({"type": "ParseError"}) is invoked. + If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. """ - # XXX More need to be done here. For instance, #13 should prolly be - # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and - # such. Thoughts on this appreciated. 
allowed = digits radix = 10 if isHex: @@ -150,19 +97,28 @@ class HTMLTokenizer(object): charAsInt = int("".join(charStack), radix) if charAsInt == 13: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Incorrect CR newline entity. Replaced with LF.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-cr-newline-entity"}) charAsInt = 10 elif 127 < charAsInt < 160: # If the integer is between 127 and 160 (so 128 and bigger and 159 # and smaller) we need to do the "windows trick". - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Entity used with illegal number (windows-1252 reference).")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-windows-1252-entity"}) charAsInt = entitiesWindows1252[charAsInt - 128] - # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF). - if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343): + # Certain characters get replaced with U+FFFD + if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F) + or (0x007F <= charAsInt <= 0x009F) + or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF) + or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? 
is 0..10 + or (0x10FFFF < charAsInt)): + char = u"\uFFFD" + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "illegal-codepoint-for-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) + else: try: # XXX We should have a separate function that does "int" to # "unicodestring" conversion since this doesn't always work @@ -172,65 +128,61 @@ class HTMLTokenizer(object): try: char = eval("u'\\U%08x'" % charAsInt) except: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt}) - else: - char = u"\uFFFD" - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "cant-convert-numeric-entity", + "datavars": {"charAsInt": charAsInt}}) # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser. if c != u";": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Numeric entity didn't end with ';'.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "numeric-entity-without-semicolon"}) self.stream.unget(c) return char - def consumeEntity(self, fromAttribute=False): - char = None + def consumeEntity(self, allowedChar=None, fromAttribute=False): + # Initialise to the default output for when no entity is matched + output = u"&" + charStack = [self.stream.char()] - if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"): - self.stream.unget(charStack) + if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \ + or (allowedChar is not None and allowedChar == charStack[0]): + self.stream.unget(charStack[0]) + elif charStack[0] == u"#": - # We might have a number entity here. 
- charStack.extend([self.stream.char(), self.stream.char()]) - if EOF in charStack[:2]: - # If we reach the end of the file put everything up to EOF - # back in the queue - charStack = charStack[:charStack.index(EOF)] - self.stream.unget(charStack) - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Numeric entity expected. Got end of file instead.")}) + # Read the next character to see if it's hex or decimal + hex = False + charStack.append(self.stream.char()) + if charStack[-1] in (u"x", u"X"): + hex = True + charStack.append(self.stream.char()) + + # charStack[-1] should be the first digit + if (hex and charStack[-1] in hexDigits) \ + or (not hex and charStack[-1] in digits): + # At least one digit found, so consume the whole number + self.stream.unget(charStack[-1]) + output = self.consumeNumberEntity(hex) else: - if charStack[1].lower() == u"x" \ - and charStack[2] in hexDigits: - # Hexadecimal entity detected. - self.stream.unget(charStack[2]) - char = self.consumeNumberEntity(True) - elif charStack[1] in digits: - # Decimal entity detected. - self.stream.unget(charStack[1:]) - char = self.consumeNumberEntity(False) - else: - # No number entity detected. - self.stream.unget(charStack) - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Numeric entity expected but none found.")}) + # No digits found + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": "expected-numeric-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) + else: # At this point in the process might have named entity. Entities # are stored in the global variable "entities". # # Consume characters and compare to these to a substring of the # entity names in the list until the substring no longer matches. 
- filteredEntityList = [e for e in entities if \ - e.startswith(charStack[0])] + filteredEntityList = entitiesByFirstChar.get(charStack[0], []) def entitiesStartingWith(name): return [e for e in filteredEntityList if e.startswith(name)] - while charStack[-1] != EOF and\ + while charStack[-1] is not EOF and\ entitiesStartingWith("".join(charStack)): charStack.append(self.stream.char()) @@ -240,7 +192,7 @@ class HTMLTokenizer(object): # Try to find the longest entity the string will match to take care # of ¬i for instance. - for entityLength in xrange(len(charStack)-1,1,-1): + for entityLength in xrange(len(charStack)-1, 1, -1): possibleEntityName = "".join(charStack[:entityLength]) if possibleEntityName in entities: entityName = possibleEntityName @@ -248,29 +200,32 @@ class HTMLTokenizer(object): if entityName is not None: if entityName[-1] != ";": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Named entity didn't end with ';'.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "named-entity-without-semicolon"}) if entityName[-1] != ";" and fromAttribute and \ (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits): - self.stream.unget(charStack) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) else: - char = entities[entityName] - self.stream.unget(charStack[entityLength:]) + output = entities[entityName] + self.stream.unget(charStack.pop()) + output += u"".join(charStack[entityLength:]) else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Named entity expected. 
Got none.")}) - self.stream.unget(charStack) - return char + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-named-entity"}) + self.stream.unget(charStack.pop()) + output = u"&" + u"".join(charStack) - def processEntityInAttribute(self): + if fromAttribute: + self.currentToken["data"][-1][1] += output + else: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output}) + + def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". """ - entity = self.consumeEntity(True) - if entity: - self.currentToken["data"][-1][1] += entity - else: - self.currentToken["data"][-1][1] += u"&" + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) def emitCurrentToken(self): """This method is a generic handler for emitting the tags. It also sets @@ -279,196 +234,215 @@ class HTMLTokenizer(object): """ token = self.currentToken # Add token to the queue to be yielded - if (token["type"] in ("StartTag", "EndTag", "EmptyTag")): + if (token["type"] in tagTokenTypes): if self.lowercaseElementName: token["name"] = token["name"].translate(asciiUpper2Lower) - if token["type"] == "EndTag" and token["data"]: - self.tokenQueue.append({"type":"ParseError", - "data":_(u"End tag contains unexpected attributes.")}) + if token["type"] == tokenTypes["EndTag"]: + if token["data"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"attributes-in-end-tag"}) + if token["selfClosing"]: + self.tokenQueue.append({"type":tokenTypes["ParseError"], + "data":"self-closing-flag-on-end-tag"}) self.tokenQueue.append(token) - self.state = self.states["data"] + self.state = self.dataState # Below are the various tokenizer states worked out. - # XXX AT Perhaps we should have Hixie run some evaluation on billions of - # documents to figure out what the order of the various if and elif - # statements should be. 
- def dataState(self): + #XXX - consider splitting this state based on the content model flag data = self.stream.char() # Keep a charbuffer to handle the escapeFlag - if self.contentModelFlag in\ - (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]): + if (self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): if len(self.lastFourChars) == 4: self.lastFourChars.pop(0) self.lastFourChars.append(data) # The rest of the logic - if data == "&" and self.contentModelFlag in\ - (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\ - self.escapeFlag: - self.state = self.states["entityData"] - elif data == "-" and self.contentModelFlag in\ - (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\ - self.escapeFlag and "".join(self.lastFourChars) == "": + self.tokenQueue.append({"type": tokenTypes["Characters"], + "data":data}) + elif (data == "<" and (self.contentModelFlag == + contentModelFlags["PCDATA"] + or (self.contentModelFlag in + (contentModelFlags["CDATA"], + contentModelFlags["RCDATA"]) and + self.escapeFlag == False))): + self.state = self.tagOpenState + elif (data == ">" and self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and + self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"): self.escapeFlag = False - self.tokenQueue.append({"type": "Characters", "data":data}) - elif data == EOF: + self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data}) + elif data is EOF: # Tokenization ends. return False + elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. 
- self.tokenQueue.append({"type": "SpaceCharacters", "data": + self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) + # No need to update lastFourChars here, since the first space will + # have already been appended to lastFourChars and will have broken + # any sequences else: - self.tokenQueue.append({"type": "Characters", "data": - data + self.stream.charsUntil(("&", "<", ">", "-"))}) + if (self.contentModelFlag in + (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])): + chars = self.stream.charsUntil((u"&", u"<", u">", u"-")) + self.lastFourChars += chars[-4:] + self.lastFourChars = self.lastFourChars[-4:] + else: + chars = self.stream.charsUntil((u"&", u"<")) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": + data + chars}) return True def entityDataState(self): - entity = self.consumeEntity() - if entity: - self.tokenQueue.append({"type": "Characters", "data": entity}) - else: - self.tokenQueue.append({"type": "Characters", "data": u"&"}) - self.state = self.states["data"] + self.consumeEntity() + self.state = self.dataState return True def tagOpenState(self): data = self.stream.char() if self.contentModelFlag == contentModelFlags["PCDATA"]: if data == u"!": - self.state = self.states["markupDeclarationOpen"] + self.state = self.markupDeclarationOpenState elif data == u"/": - self.state = self.states["closeTagOpen"] + self.state = self.closeTagOpenState elif data in asciiLetters: - self.currentToken =\ - {"type": "StartTag", "name": data, "data": []} - self.state = self.states["tagName"] + self.currentToken = {"type": tokenTypes["StartTag"], + "name": data, "data": [], + "selfClosing": False, + "selfClosingAcknowledged": False} + self.state = self.tagNameState elif data == u">": # XXX In theory it could be something besides a tag name. But # do we really care? - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected tag name. 
Got '>' instead.")}) - self.tokenQueue.append({"type": "Characters", "data": u"<>"}) - self.state = self.states["data"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-right-bracket"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) + self.state = self.dataState elif data == u"?": # XXX In theory it could be something besides a tag name. But # do we really care? - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected tag name. Got '?' instead (HTML doesn't " - "support processing instructions).")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name-but-got-question-mark"}) self.stream.unget(data) - self.state = self.states["bogusComment"] + self.state = self.bogusCommentState else: # XXX - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected tag name. Got something else instead")}) - self.tokenQueue.append({"type": "Characters", "data": u"<"}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-tag-name"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) - self.state = self.states["data"] + self.state = self.dataState else: # We know the content model flag is set to either RCDATA or CDATA # now because this state can never be entered with the PLAINTEXT # flag. if data == u"/": - self.state = self.states["closeTagOpen"] + self.state = self.closeTagOpenState else: - self.tokenQueue.append({"type": "Characters", "data": u"<"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"}) self.stream.unget(data) - self.state = self.states["data"] + self.state = self.dataState return True def closeTagOpenState(self): if (self.contentModelFlag in (contentModelFlags["RCDATA"], contentModelFlags["CDATA"])): - if self.currentToken: - charStack = [] + charStack = [] + if self.currentToken: # So far we know that "" has been consumed. 
We now need to know # whether the next few characters match the name of last emitted - # start tag which also happens to be the currentToken. We also need - # to have the character directly after the characters that could - # match the start tag name. - for x in xrange(len(self.currentToken["name"]) + 1): + # start tag which also happens to be the currentToken. + matched = True + for expected in self.currentToken["name"].lower(): charStack.append(self.stream.char()) - # Make sure we don't get hit by EOF - if charStack[-1] == EOF: + if charStack[-1] not in (expected, expected.upper()): + matched = False break - # Since this is just for checking. We put the characters back on - # the stack. - self.stream.unget(charStack) + # If the tag name prefix matched, we also need to check the + # subsequent character + if matched: + charStack.append(self.stream.char()) + if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))): + self.contentModelFlag = contentModelFlags["PCDATA"] + # Unget the last character, so it can be re-processed + # in the next state + self.stream.unget(charStack.pop()) + # The remaining characters in charStack are the tag name + self.currentToken = {"type": tokenTypes["EndTag"], + "name": u"".join(charStack), + "data": [], + "selfClosing":False} + self.state = self.tagNameState + return True - if self.currentToken \ - and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \ - and charStack[-1] in (spaceCharacters | - frozenset((u">", u"/", u"<", EOF))): - # Because the characters are correct we can safely switch to - # PCDATA mode now. This also means we don't have to do it when - # emitting the end tag token. - self.contentModelFlag = contentModelFlags["PCDATA"] - else: - self.tokenQueue.append({"type": "Characters", "data": u""}) - self.state = self.states["data"] + # Didn't find the end tag. 
The last character in charStack could be + # anything, so it has to be re-processed in the data state + self.stream.unget(charStack.pop()) - # Need to return here since we don't want the rest of the - # method to be walked through. - return True + # The remaining characters are a prefix of the tag name, so they're + # just letters and digits, so they can be output as character + # tokens immediately + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"" + u"".join(charStack)}) + self.state = self.dataState + return True data = self.stream.char() if data in asciiLetters: - self.currentToken = {"type":"EndTag", "name":data, "data":[]} - self.state = self.states["tagName"] + self.currentToken = {"type": tokenTypes["EndTag"], "name": data, + "data": [], "selfClosing":False} + self.state = self.tagNameState elif data == u">": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected closing tag. Got '>' instead. Ignoring '>'.")}) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected closing tag. Unexpected end of file.")}) - self.tokenQueue.append({"type": "Characters", "data": u""}) - self.state = self.states["data"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-right-bracket"}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-eof"}) + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u""}) + self.state = self.dataState else: # XXX data can be _'_... - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected closing tag. 
Unexpected character '%s' found.") % (data,)}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-closing-tag-but-got-char", + "datavars": {"data": data}}) self.stream.unget(data) - self.state = self.states["bogusComment"] + self.state = self.bogusCommentState return True def tagNameState(self): data = self.stream.char() if data in spaceCharacters: - self.state = self.states["beforeAttributeName"] - elif data in asciiLetters: - self.currentToken["name"] += data +\ - self.stream.charsUntil(asciiLetters, True) + self.state = self.beforeAttributeNameState elif data == u">": self.emitCurrentToken() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in the tag name.")}) - self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-tag-name"}) + self.state = self.dataState elif data == u"/": - self.processSolidusInTag() - self.state = self.states["beforeAttributeName"] + self.state = self.selfClosingStartTagState else: self.currentToken["name"] += data + # (Don't use charsUntil here, because tag names are + # very short and it's faster to not do anything fancy) return True def beforeAttributeNameState(self): @@ -477,18 +451,23 @@ class HTMLTokenizer(object): self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: self.currentToken["data"].append([data, ""]) - self.state = self.states["attributeName"] + self.state = self.attributeNameState elif data == u">": self.emitCurrentToken() elif data == u"/": - self.processSolidusInTag() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file. 
Expected attribute name instead.")}) - self.emitCurrentToken() + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-name-but-got-eof"}) + self.state = self.dataState else: self.currentToken["data"].append([data, ""]) - self.state = self.states["attributeName"] + self.state = self.attributeNameState return True def attributeNameState(self): @@ -496,7 +475,7 @@ class HTMLTokenizer(object): leavingThisState = True emitToken = False if data == u"=": - self.state = self.states["beforeAttributeValue"] + self.state = self.beforeAttributeValueState elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ self.stream.charsUntil(asciiLetters, True) @@ -507,14 +486,18 @@ class HTMLTokenizer(object): # because data is a dict not a list emitToken = True elif data in spaceCharacters: - self.state = self.states["afterAttributeName"] + self.state = self.afterAttributeNameState elif data == u"/": - self.processSolidusInTag() - self.state = self.states["beforeAttributeName"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in attribute name.")}) - self.state = self.states["data"] + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-in-attribute-name"}) + self.currentToken["data"][-1][0] += data + leavingThisState = False + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-name"}) + self.state = self.dataState emitToken = True else: self.currentToken["data"][-1][0] += data @@ -529,8 +512,8 @@ class 
HTMLTokenizer(object): self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, value in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Dropped duplicate attribute on tag.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "duplicate-attribute"}) break # XXX Fix for above XXX if emitToken: @@ -542,22 +525,26 @@ class HTMLTokenizer(object): if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == u"=": - self.state = self.states["beforeAttributeValue"] + self.state = self.beforeAttributeValueState elif data == u">": self.emitCurrentToken() elif data in asciiLetters: self.currentToken["data"].append([data, ""]) - self.state = self.states["attributeName"] + self.state = self.attributeNameState elif data == u"/": - self.processSolidusInTag() - self.state = self.states["beforeAttributeName"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file. 
Expected = or end of tag.")}) + self.state = self.selfClosingStartTagState + elif data in (u"'", u'"', u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "invalid-character-after-attribute-name"}) + self.currentToken["data"].append([data, ""]) + self.state = self.attributeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-end-of-tag-but-got-eof"}) self.emitCurrentToken() else: self.currentToken["data"].append([data, ""]) - self.state = self.states["attributeName"] + self.state = self.attributeNameState return True def beforeAttributeValueState(self): @@ -565,32 +552,39 @@ class HTMLTokenizer(object): if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == u"\"": - self.state = self.states["attributeValueDoubleQuoted"] + self.state = self.attributeValueDoubleQuotedState elif data == u"&": - self.state = self.states["attributeValueUnQuoted"] + self.state = self.attributeValueUnQuotedState self.stream.unget(data); elif data == u"'": - self.state = self.states["attributeValueSingleQuoted"] + self.state = self.attributeValueSingleQuotedState elif data == u">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-right-bracket"}) self.emitCurrentToken() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file. 
Expected attribute value.")}) + elif data in (u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "equals-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + self.state = self.attributeValueUnQuotedState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-attribute-value-but-got-eof"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data - self.state = self.states["attributeValueUnQuoted"] + self.state = self.attributeValueUnQuotedState return True def attributeValueDoubleQuotedState(self): data = self.stream.char() if data == "\"": - self.state = self.states["beforeAttributeName"] + self.state = self.afterAttributeValueState elif data == u"&": - self.processEntityInAttribute() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in attribute value (\").")}) + self.processEntityInAttribute(u'"') + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-double-quote"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data +\ @@ -600,12 +594,12 @@ class HTMLTokenizer(object): def attributeValueSingleQuotedState(self): data = self.stream.char() if data == "'": - self.state = self.states["beforeAttributeName"] + self.state = self.afterAttributeValueState elif data == u"&": - self.processEntityInAttribute() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in attribute value (').")}) + self.processEntityInAttribute(u"'") + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-single-quote"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data +\ @@ -615,18 +609,61 @@ class HTMLTokenizer(object): def attributeValueUnQuotedState(self): data = self.stream.char() if data in spaceCharacters: - self.state 
= self.states["beforeAttributeName"] + self.state = self.beforeAttributeNameState elif data == u"&": - self.processEntityInAttribute() + self.processEntityInAttribute(None) elif data == u">": self.emitCurrentToken() - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in attribute value.")}) + elif data in (u'"', u"'", u"=", u"<"): + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-in-unquoted-attribute-value"}) + self.currentToken["data"][-1][1] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-attribute-value-no-quotes"}) self.emitCurrentToken() else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ - frozenset(("&", ">","<")) | spaceCharacters) + frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters) + return True + + def afterAttributeValueState(self): + data = self.stream.char() + if data in spaceCharacters: + self.state = self.beforeAttributeNameState + elif data == u">": + self.emitCurrentToken() + elif data == u"/": + self.state = self.selfClosingStartTagState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-EOF-after-attribute-value"}) + self.emitCurrentToken() + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-character-after-attribute-value"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState + return True + + def selfClosingStartTagState(self): + data = self.stream.char() + if data == ">": + self.currentToken["selfClosing"] = True + self.emitCurrentToken() + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], + "data": + "unexpected-EOF-after-solidus-in-tag"}) + self.stream.unget(data) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 
+ "unexpected-character-after-soldius-in-tag"}) + self.stream.unget(data) + self.state = self.beforeAttributeNameState return True def bogusCommentState(self): @@ -634,83 +671,109 @@ class HTMLTokenizer(object): # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. self.tokenQueue.append( - {"type": "Comment", "data": self.stream.charsUntil((u">"))}) + {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")}) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() - self.state = self.states["data"] + self.state = self.dataState + return True + + def bogusCommentContinuationState(self): + # Like bogusCommentState, but the caller must create the comment token + # and this state just adds more characters to it + self.currentToken["data"] += self.stream.charsUntil(u">") + self.tokenQueue.append(self.currentToken) + + # Eat the character directly after the bogus comment which is either a + # ">" or an EOF. 
+ self.stream.char() + self.state = self.dataState return True def markupDeclarationOpenState(self): - charStack = [self.stream.char(), self.stream.char()] - if charStack == [u"-", u"-"]: - self.currentToken = {"type": "Comment", "data": u""} - self.state = self.states["commentStart"] - else: - for x in xrange(5): + charStack = [self.stream.char()] + if charStack[-1] == u"-": + charStack.append(self.stream.char()) + if charStack[-1] == u"-": + self.currentToken = {"type": tokenTypes["Comment"], "data": u""} + self.state = self.commentStartState + return True + elif charStack[-1] in (u'd', u'D'): + matched = True + for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'), + (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')): charStack.append(self.stream.char()) - # Put in explicit EOF check - if (not EOF in charStack and - "".join(charStack).upper() == u"DOCTYPE"): - self.currentToken = {"type":"Doctype", "name":u"", - "publicId":None, "systemId":None, "correct":True} - self.state = self.states["doctype"] - else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected '--' or 'DOCTYPE'. 
Not found.")}) - self.stream.unget(charStack) - self.state = self.states["bogusComment"] + if charStack[-1] not in expected: + matched = False + break + if matched: + self.currentToken = {"type": tokenTypes["Doctype"], + "name": u"", + "publicId": None, "systemId": None, + "correct": True} + self.state = self.doctypeState + return True + + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-dashes-or-doctype"}) + # charStack[:-2] consists of 'safe' characters ('-', 'd', 'o', etc) + # so they can be copied directly into the bogus comment data, and only + # the last character might be '>' or EOF and needs to be ungetted + self.stream.unget(charStack.pop()) + self.currentToken = {"type": tokenTypes["Comment"], + "data": u"".join(charStack)} + self.state = self.bogusCommentContinuationState return True def commentStartState(self): data = self.stream.char() if data == "-": - self.state = self.states["commentStartDash"] + self.state = self.commentStartDashState elif data == ">": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Incorrect comment.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in comment.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") - self.state = self.states["comment"] + self.state = self.commentState return True def commentStartDashState(self): data = self.stream.char() if data == "-": - self.state = self.states["commentEnd"] + self.state = self.commentEndState elif data == ">": - self.tokenQueue.append({"type": 
"ParseError", "data": - _(u"Incorrect comment.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "incorrect-comment"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in comment.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-") - self.state = self.states["comment"] + self.state = self.commentState return True def commentState(self): data = self.stream.char() if data == u"-": - self.state = self.states["commentEndDash"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in comment.")}) + self.state = self.commentEndDashState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["data"] += data + self.stream.charsUntil(u"-") return True @@ -718,12 +781,12 @@ class HTMLTokenizer(object): def commentEndDashState(self): data = self.stream.char() if data == u"-": - self.state = self.states["commentEnd"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in comment (-)")}) + self.state = self.commentEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-dash"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["data"] += u"-" + data +\ self.stream.charsUntil(u"-") @@ -737,33 +800,85 @@ class HTMLTokenizer(object): 
data = self.stream.char() if data == u">": self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState elif data == u"-": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected '-' after '--' found in comment.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-dash-after-double-dash-in-comment"}) self.currentToken["data"] += data - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in comment (--).")}) + elif data in spaceCharacters: + self.currentToken["data"] += "--" + data + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-space-after-double-dash-in-comment"}) + self.state = self.commentEndSpaceState + elif data == "!": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-bang-after-double-dash-in-comment"}) + self.state = self.commentEndBangState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-double-dash"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: # XXX - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected character in comment found.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-comment"}) self.currentToken["data"] += u"--" + data - self.state = self.states["comment"] + self.state = self.commentState + return True + + def commentEndBangState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.currentToken["data"] += "--!" 
+ self.state = self.commentEndDashState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-bang-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += u"--!" + data + self.state = self.commentState + return True + + def commentEndSpaceState(self): + data = self.stream.char() + if data == u">": + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + elif data == u"-": + self.state = self.commentEndDashState + elif data in spaceCharacters: + self.currentToken["data"] += data + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-comment-end-space-state"}) + self.tokenQueue.append(self.currentToken) + self.state = self.dataState + else: + self.currentToken["data"] += data + self.state = self.commentState return True def doctypeState(self): data = self.stream.char() if data in spaceCharacters: - self.state = self.states["beforeDoctypeName"] + self.state = self.beforeDoctypeNameState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"No space after literal string 'DOCTYPE'.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "need-space-after-doctype"}) self.stream.unget(data) - self.state = self.states["beforeDoctypeName"] + self.state = self.beforeDoctypeNameState return True def beforeDoctypeNameState(self): @@ -771,35 +886,38 @@ class HTMLTokenizer(object): if data in spaceCharacters: pass elif data == u">": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected > character. 
Expected DOCTYPE name.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-right-bracket"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file. Expected DOCTYPE name.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["name"] = data - self.state = self.states["doctypeName"] + self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: - self.state = self.states["afterDoctypeName"] + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.state = self.afterDoctypeNameState elif data == u">": + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE name.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype-name"}) self.currentToken["correct"] = False + self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: self.currentToken["name"] += data return True @@ -810,69 +928,96 @@ class HTMLTokenizer(object): pass elif data == u">": self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: + self.state = 
self.dataState + elif data is EOF: self.currentToken["correct"] = False self.stream.unget(data) - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: - charStack = [data] - for x in xrange(5): - charStack.append(self.stream.char()) - if EOF not in charStack and\ - "".join(charStack).translate(asciiUpper2Lower) == "public": - self.state = self.states["beforeDoctypePublicIdentifier"] - elif EOF not in charStack and\ - "".join(charStack).translate(asciiUpper2Lower) == "system": - self.state = self.states["beforeDoctypeSystemIdentifier"] - else: - self.stream.unget(charStack) - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Expected space or '>'. Got '%s'") % (data,)}) - self.state = self.states["bogusDoctype"] + if data in (u"p", u"P"): + matched = True + for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"), + (u"i", u"I"), (u"c", u"C")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.beforeDoctypePublicIdentifierState + return True + elif data in (u"s", u"S"): + matched = True + for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"), + (u"e", u"E"), (u"m", u"M")): + data = self.stream.char() + if data not in expected: + matched = False + break + if matched: + self.state = self.beforeDoctypeSystemIdentifierState + return True + + # All the characters read before the current 'data' will be + # [a-zA-Z], so they're garbage in the bogus doctype and can be + # discarded; only the latest character might be '>' or EOF + # and needs to be ungetted + self.stream.unget(data) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-space-or-right-bracket-in-doctype", "datavars": + {"data": data}}) + 
self.currentToken["correct"] = False + self.state = self.bogusDoctypeState + return True - + def beforeDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["publicId"] = u"" - self.state = self.states["doctypePublicIdentifierDoubleQuoted"] + self.state = self.doctypePublicIdentifierDoubleQuotedState elif data == "'": self.currentToken["publicId"] = u"" - self.state = self.states["doctypePublicIdentifierSingleQuoted"] + self.state = self.doctypePublicIdentifierSingleQuotedState elif data == ">": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of DOCTYPE.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected character in DOCTYPE.")}) - self.state = self.states["bogusDoctype"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState return True def doctypePublicIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": - self.state = self.states["afterDoctypePublicIdentifier"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.afterDoctypePublicIdentifierState + elif data == ">": + 
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState else: self.currentToken["publicId"] += data return True @@ -880,13 +1025,19 @@ class HTMLTokenizer(object): def doctypePublicIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": - self.state = self.states["afterDoctypePublicIdentifier"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.afterDoctypePublicIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState else: self.currentToken["publicId"] += data return True @@ -897,23 +1048,24 @@ class HTMLTokenizer(object): pass elif data == "\"": self.currentToken["systemId"] = u"" - self.state = self.states["doctypeSystemIdentifierDoubleQuoted"] + self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = u"" - self.state = self.states["doctypeSystemIdentifierSingleQuoted"] + self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - 
self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected character in DOCTYPE.")}) - self.state = self.states["bogusDoctype"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState return True def beforeDoctypeSystemIdentifierState(self): @@ -922,38 +1074,45 @@ class HTMLTokenizer(object): pass elif data == "\"": self.currentToken["systemId"] = u"" - self.state = self.states["doctypeSystemIdentifierDoubleQuoted"] + self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = u"" - self.state = self.states["doctypeSystemIdentifierSingleQuoted"] + self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected character in DOCTYPE.")}) + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: - self.tokenQueue.append({"type": "ParseError", "data": - 
_(u"Unexpected character in DOCTYPE.")}) - self.state = self.states["bogusDoctype"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.currentToken["correct"] = False + self.state = self.bogusDoctypeState return True def doctypeSystemIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": - self.state = self.states["afterDoctypeSystemIdentifier"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.afterDoctypeSystemIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = self.dataState else: self.currentToken["systemId"] += data return True @@ -961,13 +1120,19 @@ class HTMLTokenizer(object): def doctypeSystemIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": - self.state = self.states["afterDoctypeSystemIdentifier"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.afterDoctypeSystemIdentifierState + elif data == ">": + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) + self.currentToken["correct"] = False + self.tokenQueue.append(self.currentToken) + self.state = 
self.dataState else: self.currentToken["systemId"] += data return True @@ -978,32 +1143,29 @@ class HTMLTokenizer(object): pass elif data == ">": self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in DOCTYPE.")}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected character in DOCTYPE.")}) - self.state = self.states["bogusDoctype"] + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "unexpected-char-in-doctype"}) + self.state = self.bogusDoctypeState return True def bogusDoctypeState(self): data = self.stream.char() - self.currentToken["correct"] = False if data == u">": self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] - elif data == EOF: + self.state = self.dataState + elif data is EOF: # XXX EMIT self.stream.unget(data) - self.tokenQueue.append({"type": "ParseError", "data": - _(u"Unexpected end of file in bogus doctype.")}) self.tokenQueue.append(self.currentToken) - self.state = self.states["data"] + self.state = self.dataState else: pass return True diff --git a/planet/vendor/html5lib/treebuilders/__init__.py b/planet/vendor/html5lib/treebuilders/__init__.py index 7a421b8..635f426 100755 --- a/planet/vendor/html5lib/treebuilders/__init__.py +++ b/planet/vendor/html5lib/treebuilders/__init__.py @@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs): "simpletree" - a built-in DOM-ish tree type with support for some more pythonic idioms. 
- "dom" - The xml.dom.minidom DOM implementation + "dom" - A generic builder for DOM implementations, defaulting to + a xml.dom.minidom based implementation for the sake of + backwards compatibility (as releases up until 0.10 had a + builder called "dom" that was a minidom implementation). "etree" - A generic builder for tree implementations exposing an elementtree-like interface (known to work with ElementTree, cElementTree and lxml.etree). "beautifulsoup" - Beautiful soup (if installed) - implementation - (Currently applies to the "etree" tree type only). A module - implementing the tree type e.g. xml.etree.ElementTree or - lxml.etree.""" + implementation - (Currently applies to the "etree" and "dom" tree types). A + module implementing the tree type e.g. + xml.etree.ElementTree or lxml.etree.""" treeType = treeType.lower() if treeType not in treeBuilderCache: - if treeType in ("dom", "simpletree"): - mod = __import__(treeType, globals()) - treeBuilderCache[treeType] = mod.TreeBuilder + if treeType == "dom": + import dom + # XXX: Keep backwards compatibility by using minidom if no implementation is given + if implementation == None: + from xml.dom import minidom + implementation = minidom + # XXX: NEVER cache here, caching is done in the dom submodule + return dom.getDomModule(implementation, **kwargs).TreeBuilder + elif treeType == "simpletree": + import simpletree + treeBuilderCache[treeType] = simpletree.TreeBuilder elif treeType == "beautifulsoup": import soup treeBuilderCache[treeType] = soup.TreeBuilder + elif treeType == "lxml": + import etree_lxml + treeBuilderCache[treeType] = etree_lxml.TreeBuilder elif treeType == "etree": import etree # XXX: NEVER cache here, caching is done in the etree submodule diff --git a/planet/vendor/html5lib/treebuilders/_base.py b/planet/vendor/html5lib/treebuilders/_base.py index a5ae31d..7b2ce4b 100755 --- a/planet/vendor/html5lib/treebuilders/_base.py +++ b/planet/vendor/html5lib/treebuilders/_base.py @@ -1,3 +1,4 @@
+import warnings from html5lib.constants import scopingElements, tableInsertModeElements try: frozenset @@ -11,9 +12,6 @@ except NameError: # from "leaking" into tables, buttons, object elements, and marquees. Marker = None -#XXX - TODO; make the default interface more ElementTree-like -# rather than DOM-like - class Node(object): def __init__(self, name): """Node representing an item in the tree. @@ -43,7 +41,7 @@ class Node(object): return "<%s>"%(self.name) def __repr__(self): - return "<%s %s>" % (self.__class__, self.name) + return "<%s>" % (self.name) def appendChild(self, node): """Insert node as a child of the current node @@ -112,7 +110,12 @@ class TreeBuilder(object): #Fragment class fragmentClass = None - def __init__(self): + def __init__(self, namespaceHTMLElements): + if namespaceHTMLElements: + self.defaultNamespace = "http://www.w3.org/1999/xhtml" + else: + self.defaultNamespace = None + warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it") self.reset() def reset(self): @@ -140,7 +143,8 @@ class TreeBuilder(object): return True elif node.name == "table": return False - elif not tableVariant and node.name in scopingElements: + elif (not tableVariant and (node.nameTuple in + scopingElements)): return False elif node.name == "html": return False @@ -179,7 +183,10 @@ class TreeBuilder(object): clone = self.activeFormattingElements[i].cloneNode() # Step 9 - element = self.insertElement(clone.name, clone.attributes) + element = self.insertElement({"type":"StartTag", + "name":clone.name, + "namespace":clone.namespace, + "data":clone.attributes}) # Step 10 self.activeFormattingElements[i] = element @@ -207,21 +214,30 @@ class TreeBuilder(object): return item return False - def insertDoctype(self, name, publicId, systemId): - doctype = self.doctypeClass(name) - doctype.publicId = publicId - doctype.systemId = systemId + def insertRoot(self, token): + element = self.createElement(token) + 
self.openElements.append(element) + self.document.appendChild(element) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = self.doctypeClass(name, publicId, systemId) self.document.appendChild(doctype) - def insertComment(self, data, parent=None): + def insertComment(self, token, parent=None): if parent is None: parent = self.openElements[-1] - parent.appendChild(self.commentClass(data)) + parent.appendChild(self.commentClass(token["data"])) - def createElement(self, name, attributes): + def createElement(self, token): """Create an element but don't insert it anywhere""" - element = self.elementClass(name) - element.attributes = attributes + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] return element def _getInsertFromTable(self): @@ -238,19 +254,20 @@ class TreeBuilder(object): insertFromTable = property(_getInsertFromTable, _setInsertFromTable) - def insertElementNormal(self, name, attributes): - element = self.elementClass(name) - element.attributes = attributes + def insertElementNormal(self, token): + name = token["name"] + namespace = token.get("namespace", self.defaultNamespace) + element = self.elementClass(name, namespace) + element.attributes = token["data"] self.openElements[-1].appendChild(element) self.openElements.append(element) return element - def insertElementTable(self, name, attributes): + def insertElementTable(self, token): """Create an element and insert it into the tree""" - element = self.elementClass(name) - element.attributes = attributes + element = self.createElement(token) if self.openElements[-1].name not in tableInsertModeElements: - return self.insertElementNormal(name, attributes) + return self.insertElementNormal(token) else: #We should be in the InTable mode. 
This means we want to do #special magic element rearranging @@ -267,32 +284,32 @@ class TreeBuilder(object): if parent is None: parent = self.openElements[-1] - if (not(self.insertFromTable) or (self.insertFromTable and - self.openElements[-1].name not in - tableInsertModeElements)): + if (not self.insertFromTable or (self.insertFromTable and + self.openElements[-1].name + not in tableInsertModeElements)): parent.insertText(data) else: - #We should be in the InTable mode. This means we want to do - #special magic element rearranging + # We should be in the InTable mode. This means we want to do + # special magic element rearranging parent, insertBefore = self.getTableMisnestedNodePosition() parent.insertText(data, insertBefore) def getTableMisnestedNodePosition(self): """Get the foster parent element, and sibling to insert before (or None) when inserting a misnested table node""" - #The foster parent element is the one which comes before the most - #recently opened table element - #XXX - this is really inelegant + # The foster parent element is the one which comes before the most + # recently opened table element + # XXX - this is really inelegant lastTable=None fosterParent = None insertBefore = None for elm in self.openElements[::-1]: - if elm.name == u"table": + if elm.name == "table": lastTable = elm break if lastTable: - #XXX - we should really check that this parent is actually a - #node here + # XXX - we should really check that this parent is actually a + # node here if lastTable.parent: fosterParent = lastTable.parent insertBefore = lastTable diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py index 1259a24..8de1bdc 100644 --- a/planet/vendor/html5lib/treebuilders/dom.py +++ b/planet/vendor/html5lib/treebuilders/dom.py @@ -1,203 +1,292 @@ -import _base + from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE - +import new import re -illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") 
-class AttrList: - def __init__(self, element): - self.element = element - def __iter__(self): - return self.element.attributes.items().__iter__() - def __setitem__(self, name, value): - value=illegal_xml_chars.sub(u'\uFFFD',value) - self.element.setAttribute(name, value) - def items(self): - return self.element.attributes.items() - def keys(self): - return self.element.attributes.keys() - def __getitem__(self, name): - return self.element.getAttribute(name) +import _base +from html5lib import constants, ihatexml +from html5lib.constants import namespaces -class NodeBuilder(_base.Node): - def __init__(self, element): - _base.Node.__init__(self, element.nodeName) - self.element = element +moduleCache = {} - def appendChild(self, node): - node.parent = self - self.element.appendChild(node.element) - - def insertText(self, data, insertBefore=None): - data=illegal_xml_chars.sub(u'\uFFFD',data) - text = self.element.ownerDocument.createTextNode(data) - if insertBefore: - self.element.insertBefore(text, insertBefore.element) - else: - self.element.appendChild(text) - - def insertBefore(self, node, refNode): - self.element.insertBefore(node.element, refNode.element) - node.parent = self - - def removeChild(self, node): - if node.element.parentNode == self.element: - self.element.removeChild(node.element) - node.parent = None - - def reparentChildren(self, newParent): - while self.element.hasChildNodes(): - child = self.element.firstChild - self.element.removeChild(child) - newParent.element.appendChild(child) - self.childNodes = [] - - def getAttributes(self): - return AttrList(self.element) - - def setAttributes(self, attributes): - if attributes: - for name, value in attributes.items(): - value=illegal_xml_chars.sub(u'\uFFFD',value) - self.element.setAttribute(name, value) - - attributes = property(getAttributes, setAttributes) - - def cloneNode(self): - return NodeBuilder(self.element.cloneNode(False)) - - def hasContent(self): - return self.element.hasChildNodes() - 
-class TreeBuilder(_base.TreeBuilder): - def documentClass(self): - self.dom = minidom.getDOMImplementation().createDocument(None,None,None) - return self - - def insertDoctype(self, name, publicId, systemId): - domimpl = minidom.getDOMImplementation() - doctype = domimpl.createDocumentType(name, publicId, systemId) - self.document.appendChild(NodeBuilder(doctype)) - doctype.ownerDocument = self.dom - - def elementClass(self, name): - return NodeBuilder(self.dom.createElement(name)) - - def commentClass(self, data): - return NodeBuilder(self.dom.createComment(data)) - - def fragmentClass(self): - return NodeBuilder(self.dom.createDocumentFragment()) - - def appendChild(self, node): - self.dom.appendChild(node.element) - - def testSerializer(self, element): - return testSerializer(element) - - def getDocument(self): - return self.dom - - def getFragment(self): - return _base.TreeBuilder.getFragment(self).element - - def insertText(self, data, parent=None): - data=illegal_xml_chars.sub(u'\uFFFD',data) - if parent <> self: - _base.TreeBuilder.insertText(self, data, parent) - else: - # HACK: allow text nodes as children of the document node - if hasattr(self.dom, '_child_node_types'): - if not Node.TEXT_NODE in self.dom._child_node_types: - self.dom._child_node_types=list(self.dom._child_node_types) - self.dom._child_node_types.append(Node.TEXT_NODE) - self.dom.appendChild(self.dom.createTextNode(data)) - - name = None - -def testSerializer(element): - element.normalize() - rv = [] - def serializeElement(element, indent=0): - if element.nodeType == Node.DOCUMENT_TYPE_NODE: - if element.name: - rv.append("|%s"%(' '*indent, element.name)) - else: - rv.append("|%s"%(' '*indent,)) - elif element.nodeType == Node.DOCUMENT_NODE: - rv.append("#document") - elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: - rv.append("#document-fragment") - elif element.nodeType == Node.COMMENT_NODE: - rv.append("|%s"%(' '*indent, element.nodeValue)) - elif element.nodeType == 
Node.TEXT_NODE: - rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue)) - else: - rv.append("|%s<%s>"%(' '*indent, element.nodeName)) - if element.hasAttributes(): - for name, value in element.attributes.items(): - rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) - indent += 2 - for child in element.childNodes: - serializeElement(child, indent) - serializeElement(element, 0) - - return "\n".join(rv) - -def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): - if node.nodeType == Node.ELEMENT_NODE: - if not nsmap: - handler.startElement(node.nodeName, node.attributes) - for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endElement(node.nodeName) +def getDomModule(DomImplementation): + name = "_" + DomImplementation.__name__+"builder" + if name in moduleCache: + return moduleCache[name] else: - attributes = dict(node.attributes.itemsNS()) + mod = new.module(name) + objs = getDomBuilder(DomImplementation) + mod.__dict__.update(objs) + moduleCache[name] = mod + return mod - # gather namespace declarations - prefixes = [] - for attrname in node.attributes.keys(): - attr = node.getAttributeNode(attrname) - if (attr.namespaceURI == XMLNS_NAMESPACE or - (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): - prefix = (attr.localName != 'xmlns' and attr.localName or None) - handler.startPrefixMapping(prefix, attr.nodeValue) - prefixes.append(prefix) - nsmap = nsmap.copy() - nsmap[prefix] = attr.nodeValue - del attributes[(attr.namespaceURI, attr.localName)] +def getDomBuilder(DomImplementation): + Dom = DomImplementation + infoset_filter = ihatexml.InfosetFilter() + class AttrList: + def __init__(self, element): + self.element = element + def __iter__(self): + return self.element.attributes.items().__iter__() + def __setitem__(self, name, value): + self.element.setAttribute(infoset_filter.coerceAttribute(name), + infoset_filter.coerceCharacters(value)) + def items(self): + return [(infoset_filter.fromXmlName(item[0]), item[1]) for 
item in + self.element.attributes.items()] + def keys(self): + return [infoset_filter.fromXmlName(item) for item in + self.element.attributes.keys()] + def __getitem__(self, name): + name = infoset_filter.toXmlName(name) + return self.element.getAttribute(name) - # apply namespace declarations - for attrname in node.attributes.keys(): - attr = node.getAttributeNode(attrname) - if attr.namespaceURI == None and ':' in attr.nodeName: - prefix = attr.nodeName.split(':')[0] - if nsmap.has_key(prefix): - del attributes[(attr.namespaceURI, attr.localName)] - attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue + def __contains__(self, name): + if isinstance(name, tuple): + raise NotImplementedError + else: + return self.element.hasAttribute(infoset_filter.toXmlName(name)) + + class NodeBuilder(_base.Node): + def __init__(self, element): + _base.Node.__init__(self, element.localName) + self.element = element - # SAX events - ns = node.namespaceURI or nsmap.get(None,None) - handler.startElementNS((ns,node.nodeName), node.nodeName, attributes) - for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endElementNS((ns, node.nodeName), node.nodeName) - for prefix in prefixes: handler.endPrefixMapping(prefix) + namespace = property(lambda self:hasattr(self.element, "namespaceURI") + and self.element.namespaceURI or None) - elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: - handler.characters(node.nodeValue) + def appendChild(self, node): + node.parent = self + self.element.appendChild(node.element) + + def insertText(self, data, insertBefore=None): + data=infoset_filter.coerceCharacters(data) + text = self.element.ownerDocument.createTextNode(data) + if insertBefore: + self.element.insertBefore(text, insertBefore.element) + else: + self.element.appendChild(text) + + def insertBefore(self, node, refNode): + self.element.insertBefore(node.element, refNode.element) + node.parent = self + + def removeChild(self, node): + if 
node.element.parentNode == self.element: + self.element.removeChild(node.element) + node.parent = None + + def reparentChildren(self, newParent): + while self.element.hasChildNodes(): + child = self.element.firstChild + self.element.removeChild(child) + newParent.element.appendChild(child) + self.childNodes = [] + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + if attributes: + for name, value in attributes.items(): + if isinstance(name, tuple): + if name[0] is not None: + qualifiedName = (name[0] + ":" + + infoset_filter.coerceAttribute( + name[1])) + else: + qualifiedName = infoset_filter.coerceAttribute( + name[1]) + self.element.setAttributeNS(name[2], qualifiedName, + value) + else: + self.element.setAttribute( + infoset_filter.coerceAttribute(name), value) + attributes = property(getAttributes, setAttributes) + + def cloneNode(self): + return NodeBuilder(self.element.cloneNode(False)) + + def hasContent(self): + return self.element.hasChildNodes() - elif node.nodeType == Node.DOCUMENT_NODE: - handler.startDocument() - for child in node.childNodes: dom2sax(child, handler, nsmap) - handler.endDocument() + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name - elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: - for child in node.childNodes: dom2sax(child, handler, nsmap) + nameTuple = property(getNameTuple) - else: - # ATTRIBUTE_NODE - # ENTITY_NODE - # PROCESSING_INSTRUCTION_NODE - # COMMENT_NODE - # DOCUMENT_TYPE_NODE - # NOTATION_NODE - pass + class TreeBuilder(_base.TreeBuilder): + def documentClass(self): + self.dom = Dom.getDOMImplementation().createDocument(None,None,None) + return self + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + domimpl = Dom.getDOMImplementation() + doctype = domimpl.createDocumentType(name, publicId, systemId) + 
self.document.appendChild(NodeBuilder(doctype)) + if Dom == minidom: + doctype.ownerDocument = self.dom + + def elementClass(self, name, namespace=None): + if namespace is None and self.defaultNamespace is None: + node = self.dom.createElement(name) + else: + node = self.dom.createElementNS(namespace, name) + + return NodeBuilder(node) + + def commentClass(self, data): + return NodeBuilder(self.dom.createComment(data)) + + def fragmentClass(self): + return NodeBuilder(self.dom.createDocumentFragment()) + + def appendChild(self, node): + self.dom.appendChild(node.element) + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + return self.dom + + def getFragment(self): + return _base.TreeBuilder.getFragment(self).element + + def insertText(self, data, parent=None): + data=infoset_filter.coerceCharacters(data) + if parent <> self: + _base.TreeBuilder.insertText(self, data, parent) + else: + # HACK: allow text nodes as children of the document node + if hasattr(self.dom, '_child_node_types'): + if not Node.TEXT_NODE in self.dom._child_node_types: + self.dom._child_node_types=list(self.dom._child_node_types) + self.dom._child_node_types.append(Node.TEXT_NODE) + self.dom.appendChild(self.dom.createTextNode(data)) + + name = None + + def testSerializer(element): + element.normalize() + rv = [] + def serializeElement(element, indent=0): + if element.nodeType == Node.DOCUMENT_TYPE_NODE: + if element.name: + if element.publicId or element.systemId: + publicId = element.publicId or "" + systemId = element.systemId or "" + rv.append( """|%s"""%( + ' '*indent, element.name, publicId, systemId)) + else: + rv.append("|%s"%(' '*indent, element.name)) + else: + rv.append("|%s"%(' '*indent,)) + elif element.nodeType == Node.DOCUMENT_NODE: + rv.append("#document") + elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + rv.append("#document-fragment") + elif element.nodeType == Node.COMMENT_NODE: + rv.append("|%s"%(' '*indent, 
element.nodeValue)) + elif element.nodeType == Node.TEXT_NODE: + rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue)) + else: + if (hasattr(element, "namespaceURI") and + element.namespaceURI not in (None, + constants.namespaces["html"])): + name = "%s %s"%(constants.prefixes[element.namespaceURI], + element.nodeName) + else: + name = element.nodeName + rv.append("|%s<%s>"%(' '*indent, name)) + if element.hasAttributes(): + i = 0 + attr = element.attributes.item(i) + while attr: + name = infoset_filter.fromXmlName(attr.localName) + value = attr.value + ns = attr.namespaceURI + if ns: + name = "%s %s"%(constants.prefixes[ns], name) + i += 1 + attr = element.attributes.item(i) + + rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) + indent += 2 + for child in element.childNodes: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + + def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}): + if node.nodeType == Node.ELEMENT_NODE: + if not nsmap: + handler.startElement(node.nodeName, node.attributes) + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endElement(node.nodeName) + else: + attributes = dict(node.attributes.itemsNS()) + + # gather namespace declarations + prefixes = [] + for attrname in node.attributes.keys(): + attr = node.getAttributeNode(attrname) + if (attr.namespaceURI == XMLNS_NAMESPACE or + (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): + prefix = (attr.localName != 'xmlns' and attr.localName or None) + handler.startPrefixMapping(prefix, attr.nodeValue) + prefixes.append(prefix) + nsmap = nsmap.copy() + nsmap[prefix] = attr.nodeValue + del attributes[(attr.namespaceURI, attr.localName)] + + # apply namespace declarations + for attrname in node.attributes.keys(): + attr = node.getAttributeNode(attrname) + if attr.namespaceURI == None and ':' in attr.nodeName: + prefix = attr.nodeName.split(':')[0] + if nsmap.has_key(prefix): + del attributes[(attr.namespaceURI, 
attr.localName)] + attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue + + # SAX events + ns = node.namespaceURI or nsmap.get(None,None) + handler.startElementNS((ns,node.nodeName), node.nodeName, attributes) + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endElementNS((ns, node.nodeName), node.nodeName) + for prefix in prefixes: handler.endPrefixMapping(prefix) + + elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: + handler.characters(node.nodeValue) + + elif node.nodeType == Node.DOCUMENT_NODE: + handler.startDocument() + for child in node.childNodes: dom2sax(child, handler, nsmap) + handler.endDocument() + + elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: + for child in node.childNodes: dom2sax(child, handler, nsmap) + + else: + # ATTRIBUTE_NODE + # ENTITY_NODE + # PROCESSING_INSTRUCTION_NODE + # COMMENT_NODE + # DOCUMENT_TYPE_NODE + # NOTATION_NODE + pass + + return locals() + +# Keep backwards compatibility with things that directly load +# classes/functions from this module +for key, value in getDomModule(minidom).__dict__.items(): + globals()[key] = value diff --git a/planet/vendor/html5lib/treebuilders/etree.py b/planet/vendor/html5lib/treebuilders/etree.py index f78762b..6815582 100755 --- a/planet/vendor/html5lib/treebuilders/etree.py +++ b/planet/vendor/html5lib/treebuilders/etree.py @@ -1,5 +1,12 @@ -import _base import new +import re + +import _base +from html5lib import ihatexml +from html5lib import constants +from html5lib.constants import namespaces + +tag_regexp = re.compile("{([^}]*)}(.*)") moduleCache = {} @@ -17,20 +24,43 @@ def getETreeModule(ElementTreeImplementation, fullTree=False): def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation class Element(_base.Node): - def __init__(self, name): - self._element = ElementTree.Element(name) - self.name = name + def __init__(self, name, namespace=None): + self._name = name + self._namespace = namespace 
+ self._element = ElementTree.Element(self._getETreeTag(name, + namespace)) + if namespace is None: + self.nameTuple = namespaces["html"], self._name + else: + self.nameTuple = self._namespace, self._name self.parent = None self._childNodes = [] self._flags = [] + + def _getETreeTag(self, name, namespace): + if namespace is None: + etree_tag = name + else: + etree_tag = "{%s}%s"%(namespace, name) + return etree_tag def _setName(self, name): - self._element.tag = name + self._name = name + self._element.tag = self._getETreeTag(self._name, self._namespace) def _getName(self): - return self._element.tag - + return self._name + name = property(_getName, _setName) + + def _setNamespace(self, namespace): + self._namespace = namespace + self._element.tag = self._getETreeTag(self._name, self._namespace) + + def _getNamespace(self): + return self._namespace + + namespace = property(_getNamespace, _setNamespace) def _getAttributes(self): return self._element.attrib @@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): for key in self._element.attrib.keys(): del self._element.attrib[key] for key, value in attributes.iteritems(): - self._element.set(key, value) + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], key[1]) + else: + name = key + self._element.set(name, value) attributes = property(_getAttributes, _setAttributes) def _getChildNodes(self): - return self._childNodes - + return self._childNodes def _setChildNodes(self, value): del self._element[:] self._childNodes = [] @@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): data = property(_getData, _setData) class DocumentType(Element): - def __init__(self, name): + def __init__(self, name, publicId, systemId): Element.__init__(self, "") self._element.text = name + self.publicId = publicId + self.systemId = systemId def _getPublicId(self): - return self._element.get(u"publicId", None) + return self._element.get(u"publicId", "") def _setPublicId(self, 
value): if value is not None: @@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): publicId = property(_getPublicId, _setPublicId) def _getSystemId(self): - return self._element.get(u"systemId", None) + return self._element.get(u"systemId", "") def _setSystemId(self, value): if value is not None: @@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): if not(hasattr(element, "tag")): element = element.getroot() if element.tag == "": - rv.append("|%s"%(' '*indent, element.text)) + if element.get("publicId") or element.get("systemId"): + publicId = element.get("publicId") or "" + systemId = element.get("systemId") or "" + rv.append( """"""%( + element.text, publicId, systemId)) + else: + rv.append(""%(element.text,)) elif element.tag == "": rv.append("#document") if element.text: @@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): elif type(element.tag) == type(ElementTree.Comment): rv.append("|%s"%(' '*indent, element.text)) else: - rv.append("|%s<%s>"%(' '*indent, element.tag)) + nsmatch = tag_regexp.match(element.tag) + + if nsmatch is None: + name = element.tag + else: + ns, name = nsmatch.groups() + prefix = constants.prefixes[ns] + if prefix != "html": + name = "%s %s"%(prefix, name) + rv.append("|%s<%s>"%(' '*indent, name)) + if hasattr(element, "attrib"): for name, value in element.attrib.iteritems(): + nsmatch = tag_regexp.match(name) + if nsmatch is not None: + ns, name = nsmatch.groups() + prefix = constants.prefixes[ns] + name = "%s %s"%(prefix, name) rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) if element.text: rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) @@ -201,12 +257,19 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): """Serialize an element and its child nodes to a string""" rv = [] finalText = None + filter = ihatexml.InfosetFilter() def serializeElement(element): if type(element) == type(ElementTree.ElementTree): 
element = element.getroot() if element.tag == "": - rv.append(""%(element.text,)) + if element.get("publicId") or element.get("systemId"): + publicId = element.get("publicId") or "" + systemId = element.get("systemId") or "" + rv.append( """"""%( + element.text, publicId, systemId)) + else: + rv.append(""%(element.text,)) elif element.tag == "": if element.text: rv.append(element.text) @@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False): else: #This is assumed to be an ordinary element if not element.attrib: - rv.append("<%s>"%(element.tag,)) + rv.append("<%s>"%(filter.fromXmlName(element.tag),)) else: - attr = " ".join(["%s=\"%s\""%(name, value) + attr = " ".join(["%s=\"%s\""%( + filter.fromXmlName(name), value) for name, value in element.attrib.iteritems()]) rv.append("<%s %s>"%(element.tag, attr)) if element.text: diff --git a/planet/vendor/html5lib/treebuilders/etree_lxml.py b/planet/vendor/html5lib/treebuilders/etree_lxml.py new file mode 100644 index 0000000..92f0f87 --- /dev/null +++ b/planet/vendor/html5lib/treebuilders/etree_lxml.py @@ -0,0 +1,331 @@ +import new +import warnings +import re + +import _base +from html5lib.constants import DataLossWarning +import html5lib.constants as constants +import etree as etree_builders +from html5lib import ihatexml + +try: + import lxml.etree as etree +except ImportError: + pass + +fullTree = True + +"""Module for supporting the lxml.etree library. The idea here is to use as much +of the native library as possible, without using fragile hacks like custom element +names that break between releases. 
The downside of this is that we cannot represent +all possible trees; specifically the following are known to cause problems: + +Text or comments as siblings of the root element +Doctypes with no name + +When any of these things occur, we emit a DataLossWarning +""" + +class DocumentType(object): + def __init__(self, name, publicId, systemId): + self.name = name + self.publicId = publicId + self.systemId = systemId + +class Document(object): + def __init__(self): + self._elementTree = None + self._childNodes = [] + + def appendChild(self, element): + self._elementTree.getroot().addnext(element._element) + + def _getChildNodes(self): + return self._childNodes + + childNodes = property(_getChildNodes) + +def testSerializer(element): + rv = [] + finalText = None + filter = ihatexml.InfosetFilter() + def serializeElement(element, indent=0): + if not hasattr(element, "tag"): + if hasattr(element, "getroot"): + #Full tree case + rv.append("#document") + if element.docinfo.internalDTD: + if not (element.docinfo.public_id or + element.docinfo.system_url): + dtd_str = ""%element.docinfo.root_name + else: + dtd_str = """"""%( + element.docinfo.root_name, + element.docinfo.public_id, + element.docinfo.system_url) + rv.append("|%s%s"%(' '*(indent+2), dtd_str)) + next_element = element.getroot() + while next_element.getprevious() is not None: + next_element = next_element.getprevious() + while next_element is not None: + serializeElement(next_element, indent+2) + next_element = next_element.getnext() + elif isinstance(element, basestring): + #Text in a fragment + rv.append("|%s\"%s\""%(' '*indent, element)) + else: + #Fragment case + rv.append("#document-fragment") + for next_element in element: + serializeElement(next_element, indent+2) + elif type(element.tag) == type(etree.Comment): + rv.append("|%s"%(' '*indent, element.text)) + else: + nsmatch = etree_builders.tag_regexp.match(element.tag) + if nsmatch is not None: + ns = nsmatch.group(1) + tag = nsmatch.group(2) + prefix = 
constants.prefixes[ns] + if prefix != "html": + rv.append("|%s<%s %s>"%(' '*indent, prefix, + filter.fromXmlName(tag))) + else: + rv.append("|%s<%s>"%(' '*indent, + filter.fromXmlName(tag))) + else: + rv.append("|%s<%s>"%(' '*indent, + filter.fromXmlName(element.tag))) + + if hasattr(element, "attrib"): + for name, value in element.attrib.iteritems(): + nsmatch = etree_builders.tag_regexp.match(name) + if nsmatch: + ns = nsmatch.group(1) + name = nsmatch.group(2) + prefix = constants.prefixes[ns] + rv.append('|%s%s %s="%s"' % (' '*(indent+2), + prefix, + filter.fromXmlName(name), + value)) + else: + rv.append('|%s%s="%s"' % (' '*(indent+2), + filter.fromXmlName(name), + value)) + if element.text: + rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) + indent += 2 + for child in element.getchildren(): + serializeElement(child, indent) + if hasattr(element, "tail") and element.tail: + rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) + serializeElement(element, 0) + + if finalText is not None: + rv.append("|%s\"%s\""%(' '*2, finalText)) + + return "\n".join(rv) + +def tostring(element): + """Serialize an element and its child nodes to a string""" + rv = [] + finalText = None + def serializeElement(element): + if not hasattr(element, "tag"): + if element.docinfo.internalDTD: + if element.docinfo.doctype: + dtd_str = element.docinfo.doctype + else: + dtd_str = ""%element.docinfo.root_name + rv.append(dtd_str) + serializeElement(element.getroot()) + + elif type(element.tag) == type(etree.Comment): + rv.append(""%(element.text,)) + + else: + #This is assumed to be an ordinary element + if not element.attrib: + rv.append("<%s>"%(element.tag,)) + else: + attr = " ".join(["%s=\"%s\""%(name, value) + for name, value in element.attrib.iteritems()]) + rv.append("<%s %s>"%(element.tag, attr)) + if element.text: + rv.append(element.text) + + for child in element.getchildren(): + serializeElement(child) + + rv.append("%s>"%(element.tag,)) + + if hasattr(element, "tail") 
and element.tail: + rv.append(element.tail) + + serializeElement(element) + + if finalText is not None: + rv.append("%s\""%(' '*2, finalText)) + + return "".join(rv) + + +class TreeBuilder(_base.TreeBuilder): + documentClass = Document + doctypeClass = DocumentType + elementClass = None + commentClass = None + fragmentClass = Document + + def __init__(self, namespaceHTMLElements, fullTree = False): + builder = etree_builders.getETreeModule(etree, fullTree=fullTree) + filter = self.filter = ihatexml.InfosetFilter() + self.namespaceHTMLElements = namespaceHTMLElements + + class Attributes(dict): + def __init__(self, element, value={}): + self._element = element + dict.__init__(self, value) + for key, value in self.iteritems(): + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + if isinstance(key, tuple): + name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) + else: + name = filter.coerceAttribute(key) + self._element._element.attrib[name] = value + + class Element(builder.Element): + def __init__(self, name, namespace): + name = filter.coerceElement(name) + builder.Element.__init__(self, name, namespace=namespace) + self._attributes = Attributes(self) + + def _setName(self, name): + self._name = filter.coerceElement(name) + self._element.tag = self._getETreeTag( + self._name, self._namespace) + + def _getName(self): + return self._name + + name = property(_getName, _setName) + + def _getAttributes(self): + return self._attributes + + def _setAttributes(self, attributes): + self._attributes = Attributes(self, attributes) + + attributes = property(_getAttributes, _setAttributes) + + def insertText(self, data, insertBefore=None): + data = filter.coerceCharacters(data) + builder.Element.insertText(self, data, insertBefore) + + def appendChild(self, child): + 
builder.Element.appendChild(self, child) + + + class Comment(builder.Comment): + def __init__(self, data): + data = filter.coerceComment(data) + builder.Comment.__init__(self, data) + + def _setData(self, data): + data = filter.coerceComment(data) + self._element.text = data + + def _getData(self): + return self._element.text + + data = property(_getData, _setData) + + self.elementClass = Element + self.commentClass = builder.Comment + #self.fragmentClass = builder.DocumentFragment + _base.TreeBuilder.__init__(self, namespaceHTMLElements) + + def reset(self): + _base.TreeBuilder.reset(self) + self.insertComment = self.insertCommentInitial + self.initial_comments = [] + self.doctype = None + + def testSerializer(self, element): + return testSerializer(element) + + def getDocument(self): + if fullTree: + return self.document._elementTree + else: + return self.document._elementTree.getroot() + + def getFragment(self): + fragment = [] + element = self.openElements[0]._element + if element.text: + fragment.append(element.text) + fragment.extend(element.getchildren()) + if element.tail: + fragment.append(element.tail) + return fragment + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if not name or ihatexml.nonXmlBMPRegexp.search(name): + warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) + doctype = self.doctypeClass(name, publicId, systemId) + self.doctype = doctype + + def insertCommentInitial(self, data, parent=None): + self.initial_comments.append(data) + + def insertRoot(self, token): + """Create the document root""" + #Because of the way libxml2 works, it doesn't seem to be possible to + #alter information like the doctype after the tree has been parsed. 
+ #Therefore we need to use the built-in parser to create our iniial + #tree, after which we can add elements like normal + docStr = "" + if self.doctype and self.doctype.name: + docStr += "" + #TODO - this needs to work when elements are not put into the default ns + docStr += "" + + try: + root = etree.fromstring(docStr) + except etree.XMLSyntaxError: + print docStr + raise + + #Append the initial comments: + for comment_token in self.initial_comments: + root.addprevious(etree.Comment(comment_token["data"])) + + #Create the root document and add the ElementTree to it + self.document = self.documentClass() + self.document._elementTree = root.getroottree() + + #Add the root element to the internal child/open data structures + namespace = token.get("namespace", None) + root_element = self.elementClass(token["name"], namespace) + root_element._element = root + self.document._childNodes.append(root_element) + self.openElements.append(root_element) + + #Reset to the default insert comment function + self.insertComment = super(TreeBuilder, self).insertComment diff --git a/planet/vendor/html5lib/treebuilders/simpletree.py b/planet/vendor/html5lib/treebuilders/simpletree.py index 225cb3e..6d92892 100755 --- a/planet/vendor/html5lib/treebuilders/simpletree.py +++ b/planet/vendor/html5lib/treebuilders/simpletree.py @@ -1,5 +1,5 @@ import _base -from html5lib.constants import voidElements +from html5lib.constants import voidElements, namespaces, prefixes from xml.sax.saxutils import escape # Really crappy basic implementation of a DOM-core like thing @@ -63,6 +63,8 @@ class Node(_base.Node): def cloneNode(self): newNode = type(self)(self.name) + if hasattr(self, 'namespace'): + newNode.namespace = self.namespace if hasattr(self, 'attributes'): for attr, value in self.attributes.iteritems(): newNode.attributes[attr] = value @@ -73,6 +75,14 @@ class Node(_base.Node): """Return true if the node has children or text""" return bool(self.childNodes) + def getNameTuple(self): + if 
self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + class Document(Node): type = 1 def __init__(self): @@ -81,6 +91,9 @@ class Document(Node): def __unicode__(self): return "#document" + def appendChild(self, child): + Node.appendChild(self, child) + def toxml(self, encoding="utf=8"): result = "" for child in self.childNodes: @@ -106,13 +119,21 @@ class DocumentFragment(Document): class DocumentType(Node): type = 3 - def __init__(self, name): + def __init__(self, name, publicId, systemId): Node.__init__(self, name) - self.publicId = u"" - self.systemId = u"" + self.publicId = publicId + self.systemId = systemId def __unicode__(self): - return u"" % self.name + if self.publicId or self.systemId: + publicId = self.publicId or "" + systemId = self.systemId or "" + return """"""%( + self.name, publicId, systemId) + + else: + return u"" % self.name + toxml = __unicode__ @@ -135,12 +156,16 @@ class TextNode(Node): class Element(Node): type = 5 - def __init__(self, name): + def __init__(self, name, namespace=None): Node.__init__(self, name) + self.namespace = namespace self.attributes = {} - + def __unicode__(self): - return u"<%s>" % self.name + if self.namespace in (None, namespaces["html"]): + return u"<%s>" % self.name + else: + return u"<%s %s>"%(prefixes[self.namespace], self.name) def toxml(self): result = '<' + self.name @@ -174,6 +199,8 @@ class Element(Node): indent += 2 if self.attributes: for name, value in self.attributes.iteritems(): + if isinstance(name, tuple): + name = "%s %s"%(name[0], name[1]) tree += '\n|%s%s="%s"' % (' ' * indent, name, value) for child in self.childNodes: tree += child.printTree(indent) diff --git a/planet/vendor/html5lib/treebuilders/soup.py b/planet/vendor/html5lib/treebuilders/soup.py index 9708d42..367de06 100644 --- a/planet/vendor/html5lib/treebuilders/soup.py +++ b/planet/vendor/html5lib/treebuilders/soup.py @@ -1,6 +1,9 @@ +import 
warnings + from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration import _base +from html5lib.constants import namespaces, DataLossWarning class AttrList(object): def __init__(self, element): @@ -22,22 +25,39 @@ class AttrList(object): class Element(_base.Node): - def __init__(self, element, soup): + def __init__(self, element, soup, namespace): _base.Node.__init__(self, element.name) self.element = element - self.soup=soup + self.soup = soup + self.namespace = namespace + + def _nodeIndex(self, node, refNode): + # Finds a node by identity rather than equality + for index in range(len(self.element.contents)): + if id(self.element.contents[index]) == id(refNode.element): + return index + return None def appendChild(self, node): if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[-1].__class__ == NavigableString): - newNode = TextNode(NavigableString( - self.element.contents[-1]+node.element), self.soup) - self.element.contents[-1].extract() - self.appendChild(newNode) + # Concatenate new text onto old text node + # (TODO: This has O(n^2) performance, for input like "aaa...") + newStr = NavigableString(self.element.contents[-1]+node.element) + + # Remove the old text node + # (Can't simply use .extract() by itself, because it fails if + # an equal text node exists within the parent node) + oldElement = self.element.contents[-1] + del self.element.contents[-1] + oldElement.parent = None + oldElement.extract() + + self.element.insert(len(self.element.contents), newStr) else: self.element.insert(len(self.element.contents), node.element) node.parent = self - + def getAttributes(self): return AttrList(self.element) @@ -56,18 +76,25 @@ class Element(_base.Node): self.appendChild(text) def insertBefore(self, node, refNode): - index = self.element.contents.index(refNode.element) + index = self._nodeIndex(node, refNode) if (node.element.__class__ == NavigableString and self.element.contents and 
self.element.contents[index-1].__class__ == NavigableString): - newNode = TextNode(NavigableString( - self.element.contents[index-1]+node.element), self.soup) - self.element.contents[index-1].extract() - self.insertBefore(newNode, refNode) + # (See comments in appendChild) + newStr = NavigableString(self.element.contents[index-1]+node.element) + oldNode = self.element.contents[index-1] + del self.element.contents[index-1] + oldNode.parent = None + oldNode.extract() + + self.element.insert(index-1, newStr) else: self.element.insert(index, node.element) node.parent = self def removeChild(self, node): + index = self._nodeIndex(node.parent, node) + del node.parent.element.contents[index] + node.element.parent = None node.element.extract() node.parent = None @@ -76,12 +103,12 @@ class Element(_base.Node): child = self.element.contents[0] child.extract() if isinstance(child, Tag): - newParent.appendChild(Element(child, self.soup)) + newParent.appendChild(Element(child, self.soup, namespaces["html"])) else: newParent.appendChild(TextNode(child, self.soup)) def cloneNode(self): - node = Element(Tag(self.soup, self.element.name), self.soup) + node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace) for key,value in self.attributes: node.attributes[key] = value return node @@ -89,11 +116,19 @@ class Element(_base.Node): def hasContent(self): return self.element.contents + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + class TextNode(Element): def __init__(self, element, soup): _base.Node.__init__(self, None) self.element = element - self.soup=soup + self.soup = soup def cloneNode(self): raise NotImplementedError @@ -101,13 +136,25 @@ class TextNode(Element): class TreeBuilder(_base.TreeBuilder): def documentClass(self): self.soup = BeautifulSoup("") - return Element(self.soup, self.soup) + return Element(self.soup, self.soup, 
None) - def insertDoctype(self, name, publicId, systemId): - self.soup.insert(0, Declaration(name)) + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + if publicId: + self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or ""))) + elif systemId: + self.soup.insert(0, Declaration("%s SYSTEM \"%s\""% + (name, systemId))) + else: + self.soup.insert(0, Declaration(name)) - def elementClass(self, name): - return Element(Tag(self.soup, name), self.soup) + def elementClass(self, name, namespace): + if namespace not in (None, namespaces["html"]): + warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning) + return Element(Tag(self.soup, name), self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) @@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder): def fragmentClass(self): self.soup = BeautifulSoup("") self.soup.name = "[document_fragment]" - return Element(self.soup, self.soup) + return Element(self.soup, self.soup, None) def appendChild(self, node): self.soup.insert(len(self.soup.contents), node.element) @@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder): return _base.TreeBuilder.getFragment(self).element def testSerializer(element): + import re rv = [] def serializeElement(element, indent=0): if isinstance(element, Declaration): - rv.append("|%s"%(' '*indent, element.string)) + doctype_regexp = r'(?P[^\s]*)( PUBLIC "(?P.*)" "(?P.*)"| SYSTEM "(?P.*)")?' 
+ m = re.compile(doctype_regexp).match(element.string) + assert m is not None, "DOCTYPE did not match expected format" + name = m.group('name') + publicId = m.group('publicId') + if publicId is not None: + systemId = m.group('systemId1') or "" + else: + systemId = m.group('systemId2') + + if publicId is not None or systemId is not None: + rv.append("""|%s"""% + (' '*indent, name, publicId or "", systemId or "")) + else: + rv.append("|%s"%(' '*indent, name)) + elif isinstance(element, BeautifulSoup): if element.name == "[document_fragment]": rv.append("#document-fragment") diff --git a/planet/vendor/html5lib/treewalkers/_base.py b/planet/vendor/html5lib/treewalkers/_base.py index fd12d58..2b192bd 100644 --- a/planet/vendor/html5lib/treewalkers/_base.py +++ b/planet/vendor/html5lib/treewalkers/_base.py @@ -21,18 +21,24 @@ class TreeWalker(object): attrs = attrs.items() return [(unicode(name),unicode(value)) for name,value in attrs] - def emptyTag(self, name, attrs, hasChildren=False): - yield {"type": "EmptyTag", "name": unicode(name), \ - "data": self.normalizeAttrs(attrs)} + def emptyTag(self, namespace, name, attrs, hasChildren=False): + yield {"type": "EmptyTag", "name": unicode(name), + "namespace":unicode(namespace), + "data": self.normalizeAttrs(attrs)} if hasChildren: yield self.error(_("Void element has children")) - def startTag(self, name, attrs): - return {"type": "StartTag", "name": unicode(name), \ - "data": self.normalizeAttrs(attrs)} + def startTag(self, namespace, name, attrs): + return {"type": "StartTag", + "name": unicode(name), + "namespace":unicode(namespace), + "data": self.normalizeAttrs(attrs)} - def endTag(self, name): - return {"type": "EndTag", "name": unicode(name), "data": []} + def endTag(self, namespace, name): + return {"type": "EndTag", + "name": unicode(name), + "namespace":unicode(namespace), + "data": []} def text(self, data): data = unicode(data) @@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker): def walkChildren(self, 
node): raise NodeImplementedError - def element(self, node, name, attrs, hasChildren): + def element(self, node, namespace, name, attrs, hasChildren): if name in voidElements: - for token in self.emptyTag(name, attrs, hasChildren): + for token in self.emptyTag(namespace, name, attrs, hasChildren): yield token else: yield self.startTag(name, attrs) @@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker): details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] hasChildren = False + endTag = None if type == DOCTYPE: yield self.doctype(*details) @@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker): yield token elif type == ELEMENT: - name, attributes, hasChildren = details + namespace, name, attributes, hasChildren = details if name in voidElements: - for token in self.emptyTag(name, attributes, hasChildren): + for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token hasChildren = False else: - yield self.startTag(name, attributes) + endTag = name + yield self.startTag(namespace, name, attributes) elif type == COMMENT: yield self.comment(details[0]) @@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker): details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] if type == ELEMENT: - name, attributes, hasChildren = details + namespace, name, attributes, hasChildren = details if name not in voidElements: - yield self.endTag(name) + yield self.endTag(namespace, name) nextSibling = self.getNextSibling(currentNode) if nextSibling is not None: currentNode = nextSibling diff --git a/planet/vendor/html5lib/treewalkers/dom.py b/planet/vendor/html5lib/treewalkers/dom.py index 1ed2aed..c2b0712 100644 --- a/planet/vendor/html5lib/treewalkers/dom.py +++ b/planet/vendor/html5lib/treewalkers/dom.py @@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker): return _base.TEXT, node.nodeValue elif node.nodeType == Node.ELEMENT_NODE: - return _base.ELEMENT, node.nodeName, 
node.attributes.items(), node.hasChildNodes + return (_base.ELEMENT, node.namespaceURI, node.nodeName, + node.attributes.items(), node.hasChildNodes) elif node.nodeType == Node.COMMENT_NODE: return _base.COMMENT, node.nodeValue diff --git a/planet/vendor/html5lib/treewalkers/etree.py b/planet/vendor/html5lib/treewalkers/etree.py index 976411b..739d307 100644 --- a/planet/vendor/html5lib/treewalkers/etree.py +++ b/planet/vendor/html5lib/treewalkers/etree.py @@ -3,10 +3,13 @@ _ = gettext.gettext import new import copy +import re import _base from html5lib.constants import voidElements +tag_regexp = re.compile("{([^}]*)}(.*)") + moduleCache = {} def getETreeModule(ElementTreeImplementation): @@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation): to avoid using recursion, returns "nodes" as tuples with the following content: - 1. An Element node serving as *context* (it cannot be called the parent - node due to the particular ``tail`` text nodes. - - 2. Either the string literals ``"text"`` or ``"tail"`` or a child index - - 3. A list used as a stack of all ancestor *context nodes*. It is a - pair tuple whose first item is an Element and second item is a child - index. + 1. The current element + + 2. The index of the element relative to its parent + + 3. A stack of ancestor elements + + 4. 
A flag "text", "tail" or None to indicate if the current node is a + text node; either the text or tail of the current element (1) """ - def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element - elt, key, parents = node - if key in ("text", "tail"): - return _base.TEXT, getattr(elt, key) + elt, key, parents, flag = node + if flag in ("text", "tail"): + return _base.TEXT, getattr(elt, flag) else: - node = elt[int(key)] + node = elt if not(hasattr(node, "tag")): node = node.getroot() @@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation): return (_base.DOCUMENT,) elif node.tag == "": - return _base.DOCTYPE, node.text + return (_base.DOCTYPE, node.text, + node.get("publicId"), node.get("systemId")) elif type(node.tag) == type(ElementTree.Comment): return _base.COMMENT, node.text else: #This is assumed to be an ordinary element - return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text - + match = tag_regexp.match(node.tag) + if match: + namespace, tag = match.groups() + else: + namespace = None + tag = node.tag + return (_base.ELEMENT, namespace, tag, + node.attrib.items(), len(node) or node.text) + def getFirstChild(self, node): - if isinstance(node, tuple): # It might be the root Element - elt, key, parents = node - assert key not in ("text", "tail"), "Text nodes have no children" - parents.append((elt, int(key))) - node = elt[int(key)] + if isinstance(node, tuple): + element, key, parents, flag = node else: - parents = [] - - assert len(node) or node.text, "Node has no children" - if node.text: - return (node, "text", parents) + element, key, parents, flag = node, None, [], None + + if flag in ("text", "tail"): + return None else: - return (node, 0, parents) - + if element.text: + return element, key, parents, "text" + elif len(element): + parents.append(element) + return element[0], 0, parents, None + else: + return None + def getNextSibling(self, node): - assert isinstance(node, tuple), "Node is 
not a tuple: " + str(node) - - elt, key, parents = node - if key == "text": - key = -1 - elif key == "tail": - elt, key = parents.pop() - else: - # Look for "tail" of the "revisited" node - child = elt[key] - if child.tail: - parents.append((elt, key)) - return (child, "tail", parents) - - # case where key were "text" or "tail" or elt[key] had a tail - key += 1 - if len(elt) > key: - return (elt, key, parents) + if isinstance(node, tuple): + element, key, parents, flag = node else: return None - + + if flag == "text": + if len(element): + parents.append(element) + return element[0], 0, parents, None + else: + return None + else: + if element.tail and flag != "tail": + return element, key, parents, "tail" + elif key < len(parents[-1]) - 1: + return parents[-1][key+1], key+1, parents, None + else: + return None + def getParentNode(self, node): - assert isinstance(node, tuple) - elt, key, parents = node - if parents: - elt, key = parents.pop() - return elt, key, parents + if isinstance(node, tuple): + element, key, parents, flag = node else: - # HACK: We could return ``elt`` but None will stop the algorithm the same way return None + + if flag == "text": + if not parents: + return element + else: + return element, key, parents, None + else: + parent = parents.pop() + if not parents: + return parent + else: + return parent, list(parents[-1]).index(parent), parents, None return locals() diff --git a/planet/vendor/html5lib/treewalkers/genshistream.py b/planet/vendor/html5lib/treewalkers/genshistream.py index ecc7a0b..0014073 100644 --- a/planet/vendor/html5lib/treewalkers/genshistream.py +++ b/planet/vendor/html5lib/treewalkers/genshistream.py @@ -1,4 +1,4 @@ -from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \ +from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \ START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT from genshi.output import NamespaceFlattener @@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker): depth = 0 ignore_until = None 
previous = None - for event in NamespaceFlattener(prefixes={ - 'http://www.w3.org/1999/xhtml': '' - })(self.tree): + for event in self.tree: if previous is not None: if previous[0] == START: depth += 1 @@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker): kind, data, pos = event if kind == START: tag, attrib = data + name = tag.localname + namespace = tag.namespace if tag in voidElements: - for token in self.emptyTag(tag, list(attrib), \ - not next or next[0] != END or next[1] != tag): + for token in self.emptyTag(namespace, name, list(attrib), + not next or next[0] != END + or next[1] != tag): yield token else: - yield self.startTag(tag, list(attrib)) + yield self.startTag(namespace, name, list(attrib)) elif kind == END: - if data not in voidElements: - yield self.endTag(data) + name = data.localname + namespace = data.namespace + if (namespace, name) not in voidElements: + yield self.endTag(namespace, name) elif kind == COMMENT: yield self.comment(data) @@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker): elif kind == DOCTYPE: yield self.doctype(*data) - elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \ + elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \ START_CDATA, END_CDATA, PI): pass diff --git a/planet/vendor/html5lib/treewalkers/lxmletree.py b/planet/vendor/html5lib/treewalkers/lxmletree.py new file mode 100644 index 0000000..3f4de4f --- /dev/null +++ b/planet/vendor/html5lib/treewalkers/lxmletree.py @@ -0,0 +1,175 @@ +from lxml import etree +from html5lib.treebuilders.etree import tag_regexp + +from gettext import gettext +_ = gettext + +import _base + +from html5lib.constants import voidElements +from html5lib import ihatexml + +class Root(object): + def __init__(self, et): + self.elementtree = et + self.children = [] + if et.docinfo.internalDTD: + self.children.append(Doctype(self, et.docinfo.root_name, + et.docinfo.public_id, + et.docinfo.system_url)) + root = et.getroot() + node = root + + while node.getprevious() is not None: + node = 
node.getprevious() + while node is not None: + self.children.append(node) + node = node.getnext() + + self.text = None + self.tail = None + + def __getitem__(self, key): + return self.children[key] + + def getnext(self): + return None + + def __len__(self): + return 1 + +class Doctype(object): + def __init__(self, root_node, name, public_id, system_id): + self.root_node = root_node + self.name = name + self.public_id = public_id + self.system_id = system_id + + self.text = None + self.tail = None + + def getnext(self): + return self.root_node.children[1] + +class FragmentRoot(Root): + def __init__(self, children): + self.children = [FragmentWrapper(self, child) for child in children] + self.text = self.tail = None + + def getnext(self): + return None + +class FragmentWrapper(object): + def __init__(self, fragment_root, obj): + self.root_node = fragment_root + self.obj = obj + if hasattr(self.obj, 'text'): + self.text = self.obj.text + else: + self.text = None + if hasattr(self.obj, 'tail'): + self.tail = self.obj.tail + else: + self.tail = None + self.isstring = isinstance(obj, basestring) + + def __getattr__(self, name): + return getattr(self.obj, name) + + def getnext(self): + siblings = self.root_node.children + idx = siblings.index(self) + if idx < len(siblings) - 1: + return siblings[idx + 1] + else: + return None + + def __getitem__(self, key): + return self.obj[key] + + def __nonzero__(self): + return bool(self.obj) + + def getparent(self): + return None + + def __str__(self): + return str(self.obj) + + def __len__(self): + return len(self.obj) + + +class TreeWalker(_base.NonRecursiveTreeWalker): + def __init__(self, tree): + if hasattr(tree, "getroot"): + tree = Root(tree) + elif isinstance(tree, list): + tree = FragmentRoot(tree) + _base.NonRecursiveTreeWalker.__init__(self, tree) + self.filter = ihatexml.InfosetFilter() + def getNodeDetails(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text 
nodes are text or tail, found %s") % key + return _base.TEXT, getattr(node, key) + + elif isinstance(node, Root): + return (_base.DOCUMENT,) + + elif isinstance(node, Doctype): + return _base.DOCTYPE, node.name, node.public_id, node.system_id + + elif isinstance(node, FragmentWrapper) and node.isstring: + return _base.TEXT, node + + elif node.tag == etree.Comment: + return _base.COMMENT, node.text + + else: + #This is assumed to be an ordinary element + match = tag_regexp.match(node.tag) + if match: + namespace, tag = match.groups() + else: + namespace = None + tag = node.tag + return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), + [(self.filter.fromXmlName(name), value) for + name,value in node.attrib.iteritems()], + len(node) > 0 or node.text) + + def getFirstChild(self, node): + assert not isinstance(node, tuple), _("Text nodes have no children") + + assert len(node) or node.text, "Node has no children" + if node.text: + return (node, "text") + else: + return node[0] + + def getNextSibling(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + if key == "text": + # XXX: we cannot use a "bool(node) and node[0] or None" construct here + # because node[0] might evaluate to False if it has no child element + if len(node): + return node[0] + else: + return None + else: # tail + return node.getnext() + + return node.tail and (node, "tail") or node.getnext() + + def getParentNode(self, node): + if isinstance(node, tuple): # Text node + node, key = node + assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + if key == "text": + return node + # else: fallback to "normal" processing + + return node.getparent() diff --git a/planet/vendor/html5lib/treewalkers/pulldom.py b/planet/vendor/html5lib/treewalkers/pulldom.py index 4a96aed..7354a0e 100644 --- a/planet/vendor/html5lib/treewalkers/pulldom.py +++ 
b/planet/vendor/html5lib/treewalkers/pulldom.py @@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker): type, node = event if type == START_ELEMENT: name = node.nodeName + namespace = node.namespaceURI if name in voidElements: - for token in self.emptyTag(name, \ - node.attributes.items(), not next or next[1] is not node): + for token in self.emptyTag(namespace, + name, + node.attributes.items(), + not next or next[1] is not node): yield token else: - yield self.startTag(name, node.attributes.items()) + yield self.startTag(namespace, name, node.attributes.items()) elif type == END_ELEMENT: name = node.nodeName + namespace = node.namespaceURI if name not in voidElements: - yield self.endTag(name) + yield self.endTag(namespace, name) elif type == COMMENT: yield self.comment(node.nodeValue) diff --git a/planet/vendor/html5lib/treewalkers/simpletree.py b/planet/vendor/html5lib/treewalkers/simpletree.py index 9dac6c8..42be2a2 100644 --- a/planet/vendor/html5lib/treewalkers/simpletree.py +++ b/planet/vendor/html5lib/treewalkers/simpletree.py @@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker): return _base.TEXT, node.value elif node.type == 5: # Element - return _base.ELEMENT, node.name, \ - node.attributes.items(), node.hasContent() + return (_base.ELEMENT, node.namespace, node.name, + node.attributes.items(), node.hasContent()) elif node.type == 6: # CommentNode return _base.COMMENT, node.data diff --git a/planet/vendor/html5lib/treewalkers/soup.py b/planet/vendor/html5lib/treewalkers/soup.py index 1d52ca0..ae29f03 100644 --- a/planet/vendor/html5lib/treewalkers/soup.py +++ b/planet/vendor/html5lib/treewalkers/soup.py @@ -1,3 +1,4 @@ +import re import gettext _ = gettext.gettext @@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag import _base class TreeWalker(_base.NonRecursiveTreeWalker): + doctype_regexp = re.compile( + r'(?P[^\s]*)(\s*PUBLIC\s*"(?P.*)"\s*"(?P.*)"|\s*SYSTEM\s*"(?P.*)")?') def getNodeDetails(self, node): 
if isinstance(node, BeautifulSoup): # Document or DocumentFragment return (_base.DOCUMENT,) elif isinstance(node, Declaration): # DocumentType - #Slice needed to remove markup added during unicode conversion - return _base.DOCTYPE, unicode(node.string)[2:-1] + string = unicode(node.string) + #Slice needed to remove markup added during unicode conversion, + #but only in some versions of BeautifulSoup/Python + if string.startswith(''): + string = string[2:-1] + m = self.doctype_regexp.match(string) + #This regexp approach seems wrong and fragile + #but beautiful soup stores the doctype as a single thing and we want the seperate bits + #It should work as long as the tree is created by html5lib itself but may be wrong if it's + #been modified at all + #We could just feed to it a html5lib tokenizer, I guess... + assert m is not None, "DOCTYPE did not match expected format" + name = m.group('name') + publicId = m.group('publicId') + if publicId is not None: + systemId = m.group('systemId1') + else: + systemId = m.group('systemId2') + return _base.DOCTYPE, name, publicId or "", systemId or "" elif isinstance(node, Comment): - return _base.COMMENT, unicode(node.string)[4:-3] + string = unicode(node.string) + if string.startswith(''): + string = string[4:-3] + return _base.COMMENT, string elif isinstance(node, unicode): # TextNode return _base.TEXT, node diff --git a/planet/vendor/html5lib/utils.py b/planet/vendor/html5lib/utils.py index c71e864..7c6c8ae 100644 --- a/planet/vendor/html5lib/utils.py +++ b/planet/vendor/html5lib/utils.py @@ -34,3 +34,123 @@ class MethodDispatcher(dict): def __getitem__(self, key): return dict.get(self, key, self.default) + +#Pure python implementation of deque taken from the ASPN Python Cookbook +#Original code by Raymond Hettinger + +class deque(object): + + def __init__(self, iterable=(), maxsize=-1): + if not hasattr(self, 'data'): + self.left = self.right = 0 + self.data = {} + self.maxsize = maxsize + self.extend(iterable) + + def 
append(self, x): + self.data[self.right] = x + self.right += 1 + if self.maxsize != -1 and len(self) > self.maxsize: + self.popleft() + + def appendleft(self, x): + self.left -= 1 + self.data[self.left] = x + if self.maxsize != -1 and len(self) > self.maxsize: + self.pop() + + def pop(self): + if self.left == self.right: + raise IndexError('cannot pop from empty deque') + self.right -= 1 + elem = self.data[self.right] + del self.data[self.right] + return elem + + def popleft(self): + if self.left == self.right: + raise IndexError('cannot pop from empty deque') + elem = self.data[self.left] + del self.data[self.left] + self.left += 1 + return elem + + def clear(self): + self.data.clear() + self.left = self.right = 0 + + def extend(self, iterable): + for elem in iterable: + self.append(elem) + + def extendleft(self, iterable): + for elem in iterable: + self.appendleft(elem) + + def rotate(self, n=1): + if self: + n %= len(self) + for i in xrange(n): + self.appendleft(self.pop()) + + def __getitem__(self, i): + if i < 0: + i += len(self) + try: + return self.data[i + self.left] + except KeyError: + raise IndexError + + def __setitem__(self, i, value): + if i < 0: + i += len(self) + try: + self.data[i + self.left] = value + except KeyError: + raise IndexError + + def __delitem__(self, i): + size = len(self) + if not (-size <= i < size): + raise IndexError + data = self.data + if i < 0: + i += size + for j in xrange(self.left+i, self.right-1): + data[j] = data[j+1] + self.pop() + + def __len__(self): + return self.right - self.left + + def __cmp__(self, other): + if type(self) != type(other): + return cmp(type(self), type(other)) + return cmp(list(self), list(other)) + + def __repr__(self, _track=[]): + if id(self) in _track: + return '...' 
+ _track.append(id(self)) + r = 'deque(%r)' % (list(self),) + _track.remove(id(self)) + return r + + def __getstate__(self): + return (tuple(self),) + + def __setstate__(self, s): + self.__init__(s[0]) + + def __hash__(self): + raise TypeError + + def __copy__(self): + return self.__class__(self) + + def __deepcopy__(self, memo={}): + from copy import deepcopy + result = self.__class__() + memo[id(self)] = result + result.__init__(deepcopy(tuple(self), memo)) + return result \ No newline at end of file diff --git a/tests/data/reconstitute/content_illegal_char.xml b/tests/data/reconstitute/content_illegal_char.xml index 0b0a5b1..cf4e53f 100644 --- a/tests/data/reconstitute/content_illegal_char.xml +++ b/tests/data/reconstitute/content_illegal_char.xml @@ -1,6 +1,6 @@
optgroup
and blocks) we want to drop - # leading newlines + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + #Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of , , and blocks) we + # want to drop leading newlines + data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "textarea") and - not self.tree.openElements[-1].hasContent()): + self.tree.openElements[-1].name in ("pre", "listing", "textarea") + and not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() self.tree.insertText(data) - def processCharacters(self, data): + def processCharacters(self, token): # XXX The specification says to do this for every character at the # moment, but apparently that doesn't match the real world so we don't # do it for space characters. self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) + self.framesetOK = False #This matches the current spec but may not match the real world - def processSpaceCharacters(self, data): + def processSpaceCharacters(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) - def startTagProcessInHead(self, name, attributes): - self.parser.phases["inHead"].processStartTag(name, attributes) + def startTagProcessInHead(self, token): + self.parser.phases["inHead"].processStartTag(token) - def startTagTitle(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. 
Moved.") % (name,)) - self.parser.phases["inHead"].processStartTag(name, attributes) - - def startTagBody(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (body).")) + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: - for attr, value in attributes.iteritems(): + for attr, value in token["data"].iteritems(): if attr not in self.tree.openElements[1].attributes: self.tree.openElements[1].attributes[attr] = value - def startTagCloseP(self, name, attributes): + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagCloseP(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) - if name == "pre": + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + if token["name"] in ("pre", "listing"): + self.parser.framesetOK = False self.processSpaceCharacters = self.processSpaceCharactersDropNewline - def startTagForm(self, name, attributes): + def startTagForm(self, token): if self.tree.formPointer: - self.parser.parseError("Unexpected start tag (form). 
Ignored.") + self.parser.parseError(u"unexpected-start-tag", {"name": "form"}) else: if self.tree.elementInScope("p"): self.endTagP("p") - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.formPointer = self.tree.openElements[-1] - def startTagListItem(self, name, attributes): + def startTagListItem(self, token): + self.parser.framesetOK = False if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")} - stopName = stopNames[name] + stopName = stopNames[token["name"]] # AT Use reversed in Python 2.4... for i, node in enumerate(self.tree.openElements[::-1]): if node.name in stopName: @@ -820,251 +1003,340 @@ class InBodyPhase(Phase): poppedNodes.append(self.tree.openElements.pop()) if i >= 1: self.parser.parseError( - (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)")) - % u", ".join([item.name for item in poppedNodes[:-1]])) + i == 1 and "missing-end-tag" or "missing-end-tags", + {"name": u", ".join([item.name + for item + in poppedNodes[:-1]])}) break # Phrasing elements are all non special, non scoping, non # formatting elements - if (node.name in (specialElements | scopingElements) - and node.name not in ("address", "div")): + if (node.nameTuple in + (specialElements | scopingElements) + and node.name not in ("address", "div")): break # Always insert an element. 
- self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagPlaintext(self, name, attributes): + def startTagPlaintext(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"] - def startTagHeading(self, name, attributes): + def startTagHeading(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() # Uncomment the following for IE7 behavior: # #for item in headingElements: # if self.tree.elementInScope(item): - # self.parser.parseError(_(u"Unexpected start tag (" + name +\ - # ").")) + # self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) # item = self.tree.openElements.pop() # while item.name not in headingElements: # item = self.tree.openElements.pop() # break - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagA(self, name, attributes): + def startTagA(self, token): afeAElement = self.tree.elementInActiveFormattingElements("a") if afeAElement: - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'a', u'a')) - self.endTagFormatting("a") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) if afeAElement in self.tree.openElements: self.tree.openElements.remove(afeAElement) if afeAElement in self.tree.activeFormattingElements: self.tree.activeFormattingElements.remove(afeAElement) self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagFormatting(self, name, 
attributes): + def startTagFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagNobr(self, name, attributes): + def startTagNobr(self, token): self.tree.reconstructActiveFormattingElements() if self.tree.elementInScope("nobr"): - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'nobr', u'nobr')) - self.processEndTag("nobr") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) # XXX Need tests that trigger the following self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagButton(self, name, attributes): + def startTagButton(self, token): if self.tree.elementInScope("button"): - self.parser.parseError(_(u"Unexpected start tag (%s) implied " - u"end tag (%s).") % (u'button', u'button')) - self.processEndTag("button") - self.parser.phase.processStartTag(name, attributes) + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + self.parser.phase.processStartTag(token) else: self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagMarqueeObject(self, name, attributes): + def startTagAppletMarqueeObject(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagXmp(self, name, attributes): + def startTagXmp(self, token): self.tree.reconstructActiveFormattingElements() - 
self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") + self.parser.framesetOK = False - def startTagTable(self, name, attributes): - if self.tree.elementInScope("p"): - self.processEndTag("p") - self.tree.insertElement(name, attributes) + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False self.parser.phase = self.parser.phases["inTable"] - def startTagVoidFormatting(self, name, attributes): + def startTagVoidFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagHr(self, name, attributes): + def startTagHr(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagImage(self, name, attributes): + def startTagImage(self, token): # No really... - self.parser.parseError(_(u"Unexpected start tag (image). 
Treated " - u"as img.")) - self.processStartTag("img", attributes) + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) - def startTagInput(self, name, attributes): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - if self.tree.formPointer: - # XXX Not exactly sure what to do here - self.tree.openElements[-1].form = self.tree.formPointer - self.tree.openElements.pop() - - def startTagIsIndex(self, name, attributes): - self.parser.parseError("Unexpected start tag isindex. Don't use it!") + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) if self.tree.formPointer: return - self.processStartTag("form", {}) - self.processStartTag("hr", {}) - self.processStartTag("p", {}) - self.processStartTag("label", {}) + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Insert your search keywords here: " self.processCharacters( - "This is a searchable index. 
Insert your search keywords here: ") + {"type":tokenTypes["Characters"], "data":prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] attributes["name"] = "isindex" - attrs = [[key,value] for key,value in attributes.iteritems()] - self.processStartTag("input", dict(attrs)) - self.processEndTag("label") - self.processEndTag("p") - self.processStartTag("hr", {}) - self.processEndTag("form") + self.processStartTag(impliedTagToken("input", "StartTag", + attributes = attributes, + selfClosing = + token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) - def startTagTextarea(self, name, attributes): + def startTagTextarea(self, token): # XXX Form element pointer checking here as well... - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"] self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False - def startTagCdata(self, name, attributes): + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagCdata(token) + + def startTagCdata(self, token): """iframe, noembed noframes, noscript(if scripting enabled)""" - self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") - def startTagSelect(self, name, attributes): + def startTagOpt(self, token): + if self.tree.elementInScope("option"): + self.parser.phase.processEndTag(impliedTagToken("option")) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - self.parser.phase = self.parser.phases["inSelect"] + self.parser.tree.insertElement(token) - def startTagMisplaced(self, name, attributes): + def 
startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] + + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + while self.tree.openElements[-1].name != "ruby": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if 
token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMisplaced(self, token): """ Elements that should be children of other elements that have a different insertion mode; here they are ignored "caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "noscript" """ - self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - def startTagNew(self, name, attributes): + def startTagNew(self, token): """New HTML5 elements, "event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command" """ - sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) - self.startTagOther(name, attributes) + #2007-08-30 - MAP - commenting out this write to sys.stderr because + # it's really annoying me when I run the validator tests + #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) + self.startTagOther(token) #raise NotImplementedError - def startTagOther(self, name, attributes): + def startTagOther(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def endTagP(self, name): + def endTagP(self, token): if self.tree.elementInScope("p"): self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": - self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',)) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) if self.tree.elementInScope("p"): while self.tree.elementInScope("p"): self.tree.openElements.pop() else: - self.startTagCloseP("p", {}) - self.endTagP("p") + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.endTagP(impliedTagToken("p")) - def endTagBody(self, name): + def endTagBody(self, token): # XXX Need to take open tags into account here. 
We shouldn't imply # but we should not throw a parse error either. Specification is # likely to be updated. - if self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): # innerHTML case self.parser.parseError() return - if self.tree.openElements[-1].name != "body": - self.parser.parseError(_(u"Unexpected end tag (%s). Missing " - u"end tag (%s).") % (u'body', self.tree.openElements[-1].name)) + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "p", + "tbody", "td", "tfoot", + "th", "thead", "tr")): + #Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"expectedName": "body", "gotName": node.name}) + break self.parser.phase = self.parser.phases["afterBody"] - def endTagHtml(self, name): - self.endTagBody(name) + def endTagHtml(self, token): + self.endTagBody(impliedTagToken("body")) if not self.parser.innerHTML: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagBlock(self, name): + def endTagBlock(self, token): #Put us back in the right whitespace handling mode - if name == "pre": + if token["name"] == "pre": self.processSpaceCharacters = self.processSpaceCharactersNonPre - inScope = self.tree.elementInScope(name) + inScope = self.tree.elementInScope(token["name"]) if inScope: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) if inScope: node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagForm(self, name): - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (form) seen too early. Ignored.")) - else: - self.tree.openElements.pop() + def endTagForm(self, token): + node = self.tree.formPointer self.tree.formPointer = None + if node is None or not self.tree.elementInScope(token["name"]): + self.parser.parseError("unexpected-end-tag", + {"name":"form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) - def endTagListItem(self, name): + def endTagListItem(self, token): # AT Could merge this with the Block case - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags(name) + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags(token["name"]) - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagHeading(self, name): + def endTagHeading(self, token): for item in headingElements: if self.tree.elementInScope(item): self.tree.generateImpliedEndTags() break - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). " - u"Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) for item in headingElements: if self.tree.elementInScope(item): @@ -1073,38 +1345,37 @@ class InBodyPhase(Phase): item = self.tree.openElements.pop() break - def endTagFormatting(self, name): - """The much-feared adoption agency algorithm - """ + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # XXX Better parseError messages appreciated. 
+ name = token["name"] while True: # Step 1 paragraph 1 - afeElement = self.tree.elementInActiveFormattingElements(name) + afeElement = self.tree.elementInActiveFormattingElements( + token["name"]) if not afeElement or (afeElement in self.tree.openElements and not self.tree.elementInScope(afeElement.name)): - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.1", {"name": token["name"]}) return # Step 1 paragraph 2 elif afeElement not in self.tree.openElements: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) self.tree.activeFormattingElements.remove(afeElement) return # Step 1 paragraph 3 if afeElement != self.tree.openElements[-1]: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) # Step 2 # Start of the adoption agency algorithm proper afeIndex = self.tree.openElements.index(afeElement) furthestBlock = None for element in self.tree.openElements[afeIndex:]: - if element.name in specialElements | scopingElements: + if (element.nameTuple in + specialElements | scopingElements): furthestBlock = element break @@ -1118,17 +1389,17 @@ class InBodyPhase(Phase): commonAncestor = self.tree.openElements[afeIndex-1] # Step 5 - if furthestBlock.parent: - furthestBlock.parent.removeChild(furthestBlock) + #if furthestBlock.parent: + # furthestBlock.parent.removeChild(furthestBlock) - # Step 6 + # Step 5 # The bookmark is supposed to help us identify where to reinsert # nodes in step 12. We have to ensure that we reinsert nodes after # the node before the active formatting element. 
Note the bookmark # can move in step 7.4 bookmark = self.tree.activeFormattingElements.index(afeElement) - # Step 7 + # Step 6 lastNode = node = furthestBlock while True: # AT replace this with a function and recursion? @@ -1140,26 +1411,24 @@ class InBodyPhase(Phase): node = self.tree.openElements[ self.tree.openElements.index(node)-1] self.tree.openElements.remove(tmpNode) - # Step 7.3 + # Step 6.3 if node == afeElement: break - # Step 7.4 + # Step 6.4 if lastNode == furthestBlock: - # XXX should this be index(node) or index(node)+1 - # Anne: I think +1 is ok. Given x = [2,3,4,5] - # x.index(3) gives 1 and then x[1 +1] gives 4... - bookmark = self.tree.activeFormattingElements.\ - index(node) + 1 - # Step 7.5 - cite = node.parent - if node.hasContent(): - clone = node.cloneNode() - # Replace node with clone - self.tree.activeFormattingElements[ - self.tree.activeFormattingElements.index(node)] = clone - self.tree.openElements[ - self.tree.openElements.index(node)] = clone - node = clone + bookmark = (self.tree.activeFormattingElements.index(node) + + 1) + # Step 6.5 + #cite = node.parent + #if node.hasContent(): + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + # Step 7.6 # Remove lastNode from its parents, if any if lastNode.parent: @@ -1167,87 +1436,101 @@ class InBodyPhase(Phase): node.appendChild(lastNode) # Step 7.7 lastNode = node - # End of inner loop + # End of inner loop - # Step 8 + # Step 7 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster parent the + # lastNode if lastNode.parent: lastNode.parent.removeChild(lastNode) commonAncestor.appendChild(lastNode) - # Step 9 + # Step 8 clone = afeElement.cloneNode() - # Step 10 + # Step 9 furthestBlock.reparentChildren(clone) - # Step 11 + # Step 10 
furthestBlock.appendChild(clone) - # Step 12 + # Step 11 self.tree.activeFormattingElements.remove(afeElement) self.tree.activeFormattingElements.insert(bookmark, clone) - # Step 13 + # Step 12 self.tree.openElements.remove(afeElement) self.tree.openElements.insert( self.tree.openElements.index(furthestBlock) + 1, clone) - def endTagButtonMarqueeObject(self, name): - if self.tree.elementInScope(name): + def endTagAppletButtonMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): element = self.tree.openElements.pop() - while element.name != name: + while element.name != token["name"]: element = self.tree.openElements.pop() self.tree.clearActiveFormattingElements() - def endTagMisplaced(self, name): - # This handles elements with end tags in other insertion modes. - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagBr(self, name): - self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element.")) + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, {}) + self.tree.insertElement(impliedTagToken("br", "StartTag")) self.tree.openElements.pop() - def endTagNone(self, name): - # This handles elements with no end tag. 
- self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,)) - - def endTagCdataTextAreaXmp(self, name): - if self.tree.openElements[-1].name == name: - self.tree.openElements.pop() - else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagNew(self, name): - """New HTML5 elements, "event-source", "section", "nav", - "article", "aside", "header", "footer", "datagrid", "command" - """ - sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name) - self.endTagOther(name) - #raise NotImplementedError - - def endTagOther(self, name): - # XXX This logic should be moved into the treebuilder - # AT should use reversed instead of [::-1] when Python 2.4 == True. + def endTagOther(self, token): for node in self.tree.openElements[::-1]: - if node.name == name: + if node.name == token["name"]: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while self.tree.openElements.pop() != node: pass break else: - if node.name in specialElements | scopingElements: - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + if (node.nameTuple in + specialElements | scopingElements): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break +class InCDataRCDataPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([ + ("script", self.endTagScript)]) + self.endTagHandler.default = self.endTagOther + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + self.tree.openElements[-1].name) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + self.parser.phase.processEOF() + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in (R)CDATA mode"%name + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + #The rest of this method is all stuff that only happens if + #document.write works + + def endTagOther(self, token): + node = self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): @@ -1259,7 +1542,9 @@ class InTablePhase(Phase): ("col", self.startTagCol), (("tbody", "tfoot", "thead"), self.startTagRowGroup), (("td", "th", "tr"), self.startTagImplyTbody), - ("table", self.startTagTable) + ("table", self.startTagTable), + (("style", "script"), self.startTagStyleScript), + ("input", self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1274,66 +1559,101 @@ class InTablePhase(Phase): def clearStackToTableContext(self): # "clear the stack back to a table context" while self.tree.openElements[-1].name not in ("table", "html"): - 
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() # When the current node is it's an innerHTML case + def getCurrentTable(self): + i = -1 + while -i <= len(self.tree.openElements) and self.tree.openElements[i].name != "table": + i -= 1 + if -i > len(self.tree.openElements): + return self.tree.openElements[0] + else: + return self.tree.openElements[i] + # processing methods - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters in " - u"table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + #Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.characterTokens.append(token) + + def processCharacters(self, token): + #If we get here there must be at least one non-whitespace character + # Do the table magic! 
self.tree.insertFromTable = True - # Process the character in the "in body" mode - self.parser.phases["inBody"].processCharacters(data) + self.parser.phases["inBody"].processCharacters(token) self.tree.insertFromTable = False - def startTagCaption(self, name, attributes): + def startTagCaption(self, token): self.clearStackToTableContext() self.tree.activeFormattingElements.append(Marker) - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCaption"] - def startTagColgroup(self, name, attributes): + def startTagColgroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inColumnGroup"] - def startTagCol(self, name, attributes): - self.startTagColgroup("colgroup", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagRowGroup(self, name, attributes): + def startTagRowGroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inTableBody"] - def startTagImplyTbody(self, name, attributes): - self.startTagRowGroup("tbody", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTable(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (table) in table " - u"phase. 
Implies end tag (table).")) - self.parser.phase.processEndTag("table") + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def startTagStyleScript(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + self.parser.parseError("unexpected-hidden-input-in-table") + self.tree.insertElement(token) + # XXX associate with form + self.tree.openElements.pop() + else: + self.startTagOther(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! self.tree.insertFromTable = True - # Process the start tag in the "in body" mode - self.parser.phases["inBody"].processStartTag(name, attributes) + self.parser.phases["inBody"].processStartTag(token) self.tree.insertFromTable = False - def endTagTable(self, name): + def endTagTable(self, token): if self.tree.elementInScope("table", True): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": - self.parser.parseError(_(u"Unexpected end tag (table). 
" - u"Expected end tag (%s).") % (self.tree.openElements[-1].name,)) + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1343,18 +1663,61 @@ class InTablePhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! 
self.tree.insertFromTable = True - # Process the end tag in the "in body" mode - self.parser.phases["inBody"].processEndTag(name) + self.parser.phases["inBody"].processEndTag(token) self.tree.insertFromTable = False +class InTableTextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any([item not in spaceCharacters for item in data]): + token = {"type":tokenTypes["Characters"], "data":data} + self.originalPhase.processCharacters(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processComment(token) + + def processEOF(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEOF(token) + + def processCharacters(self, token): + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + #pretty sure we should never reach here + self.characterTokens.append(token) +# assert False + + def processStartTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processStartTag(token) + + def processEndTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEndTag(token) + class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption @@ -1379,27 +1742,31 @@ class InCaptionPhase(Phase): def ignoreEndTagCaption(self): return not self.tree.elementInScope("caption", True) - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() - def startTagTableElement(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) + + def 
startTagTableElement(self, token): self.parser.parseError() #XXX Have to duplicate logic here to find out if the tag is ignored ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) - def endTagCaption(self, name): + def endTagCaption(self, token): if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": - self.parser.parseError(_(u"Unexpected end tag (caption). " - u"Missing end tags.")) + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1410,18 +1777,18 @@ class InCaptionPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): self.parser.parseError() ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) class InColumnGroupPhase(Phase): @@ -1445,23 +1812,33 @@ class InColumnGroupPhase(Phase): def ignoreEndTagColgroup(self): return self.tree.openElements[-1].name == "html" - def processCharacters(self, data): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup("colgroup") - if not ignoreEndTag: - self.parser.phase.processCharacters(data) + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processEOF() - def startTagCol(self, name ,attributes): - self.tree.insertElement(name, attributes) + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + self.parser.phase.processCharacters(token) + + def startTagCol(self, token): + self.tree.insertElement(token) self.tree.openElements.pop() - def startTagOther(self, name, attributes): + def startTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def endTagColgroup(self, name): + def endTagColgroup(self, token): if self.ignoreEndTagColgroup(): # innerHTML case assert self.parser.innerHTML @@ -1470,15 +1847,14 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] - def endTagCol(self, name): - self.parser.parseError(_(u"Unexpected end tag (col). 
" - u"col has no end tag.")) + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) - def endTagOther(self, name): + def endTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) class InTableBodyPhase(Phase): @@ -1489,7 +1865,8 @@ class InTableBodyPhase(Phase): ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther) + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther @@ -1505,62 +1882,76 @@ class InTableBodyPhase(Phase): def clearStackToTableBodyContext(self): while self.tree.openElements[-1].name not in ("tbody", "tfoot", "thead", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML # the rest - def processCharacters(self,data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTr(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inTable"].processCharacters(token) + + def startTagTr(self, token): self.clearStackToTableBodyContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inRow"] - def startTagTableCell(self, name, attributes): - self.parser.parseError(_(u"Unexpected table cell 
start tag (%s) in the table body phase.") % (name,)) - self.startTagTr("tr", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): # XXX AT Any ideas on how to share this with endTagTable? if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processStartTag(name, attributes) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.clearStackToTableBodyContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] else: - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. 
Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagTable(self, name): + def endTagTable(self, token): if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processEndTag(name) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InRowPhase(Phase): @@ -1587,33 +1978,40 @@ class InRowPhase(Phase): # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): while self.tree.openElements[-1].name not in ("tr", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") % (self.tree.openElements[-1].name,)) + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() def ignoreEndTagTr(self): return not self.tree.elementInScope("tr", tableVariant=True) # the rest - def processCharacters(self, data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTableCell(self, name, attributes): + def processCharacters(self, token): + 
self.parser.phases["inTable"].processCharacters(token) + + def startTagTableCell(self, token): self.clearStackToTableRowContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCell"] self.tree.activeFormattingElements.append(Marker) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTr(self, name): + def endTagTr(self, token): if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() @@ -1623,27 +2021,28 @@ class InRowPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.endTagTr("tr") - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell @@ -1666,60 +2065,63 @@ class InCellPhase(Phase): # helper def closeCell(self): if self.tree.elementInScope("td", True): - self.endTagTableCell("td") + self.endTagTableCell(impliedTagToken("td")) elif self.tree.elementInScope("th", True): - self.endTagTableCell("th") + self.endTagTableCell(impliedTagToken("th")) # the rest - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) - def startTagTableOther(self, name, attributes): - if self.tree.elementInScope("td", True) or \ - self.tree.elementInScope("th", True): + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", True) or + self.tree.elementInScope("th", True)): self.closeCell() - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.startTagHandler.default =\ self.parser.phases["inBody"].processStartTag - def endTagTableCell(self, name): - if self.tree.elementInScope(name, True): - self.tree.generateImpliedEndTags(name) - if self.tree.openElements[-1].name != name: - self.parser.parseError("Got table cell end tag (" + name +\ - ") while required end tags are missing.") + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], True): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", + {"name": token["name"]}) while True: node = self.tree.openElements.pop() - if node.name == name: + if node.name == token["name"]: break else: self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inRow"] else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagImply(self, name): - if self.tree.elementInScope(name, True): + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], True): self.closeCell() - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # sometimes innerHTML case self.parser.parseError() - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.endTagHandler.default = self.parser.phases["inBody"].processEndTag @@ -1733,7 +2135,8 @@ class InSelectPhase(Phase): ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), - ("select", self.startTagSelect) + ("select", self.startTagSelect), + (("input", "keygen", "textarea"), self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1747,52 +2150,63 @@ class InSelectPhase(Phase): self.endTagHandler.default = self.endTagOther # http://www.whatwg.org/specs/web-apps/current-work/#in-select - def processCharacters(self, data): - self.tree.insertText(data) + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML - def startTagOption(self, name, attributes): + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def startTagOption(self, token): # We need to imply
, , and blocks) we + # want to drop leading newlines + data = token["data"] self.processSpaceCharacters = self.processSpaceCharactersNonPre if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "textarea") and - not self.tree.openElements[-1].hasContent()): + self.tree.openElements[-1].name in ("pre", "listing", "textarea") + and not self.tree.openElements[-1].hasContent()): data = data[1:] if data: self.tree.reconstructActiveFormattingElements() self.tree.insertText(data) - def processCharacters(self, data): + def processCharacters(self, token): # XXX The specification says to do this for every character at the # moment, but apparently that doesn't match the real world so we don't # do it for space characters. self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) + self.framesetOK = False #This matches the current spec but may not match the real world - def processSpaceCharacters(self, data): + def processSpaceCharacters(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) + self.tree.insertText(token["data"]) - def startTagProcessInHead(self, name, attributes): - self.parser.phases["inHead"].processStartTag(name, attributes) + def startTagProcessInHead(self, token): + self.parser.phases["inHead"].processStartTag(token) - def startTagTitle(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. 
Moved.") % (name,)) - self.parser.phases["inHead"].processStartTag(name, attributes) - - def startTagBody(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (body).")) + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): assert self.parser.innerHTML else: - for attr, value in attributes.iteritems(): + for attr, value in token["data"].iteritems(): if attr not in self.tree.openElements[1].attributes: self.tree.openElements[1].attributes[attr] = value - def startTagCloseP(self, name, attributes): + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] + + def startTagCloseP(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) - if name == "pre": + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + if token["name"] in ("pre", "listing"): + self.parser.framesetOK = False self.processSpaceCharacters = self.processSpaceCharactersDropNewline - def startTagForm(self, name, attributes): + def startTagForm(self, token): if self.tree.formPointer: - self.parser.parseError("Unexpected start tag (form). 
Ignored.") + self.parser.parseError(u"unexpected-start-tag", {"name": "form"}) else: if self.tree.elementInScope("p"): self.endTagP("p") - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.formPointer = self.tree.openElements[-1] - def startTagListItem(self, name, attributes): + def startTagListItem(self, token): + self.parser.framesetOK = False if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")} - stopName = stopNames[name] + stopName = stopNames[token["name"]] # AT Use reversed in Python 2.4... for i, node in enumerate(self.tree.openElements[::-1]): if node.name in stopName: @@ -820,251 +1003,340 @@ class InBodyPhase(Phase): poppedNodes.append(self.tree.openElements.pop()) if i >= 1: self.parser.parseError( - (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)")) - % u", ".join([item.name for item in poppedNodes[:-1]])) + i == 1 and "missing-end-tag" or "missing-end-tags", + {"name": u", ".join([item.name + for item + in poppedNodes[:-1]])}) break # Phrasing elements are all non special, non scoping, non # formatting elements - if (node.name in (specialElements | scopingElements) - and node.name not in ("address", "div")): + if (node.nameTuple in + (specialElements | scopingElements) + and node.name not in ("address", "div")): break # Always insert an element. 
- self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagPlaintext(self, name, attributes): + def startTagPlaintext(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"] - def startTagHeading(self, name, attributes): + def startTagHeading(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() # Uncomment the following for IE7 behavior: # #for item in headingElements: # if self.tree.elementInScope(item): - # self.parser.parseError(_(u"Unexpected start tag (" + name +\ - # ").")) + # self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) # item = self.tree.openElements.pop() # while item.name not in headingElements: # item = self.tree.openElements.pop() # break - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def startTagA(self, name, attributes): + def startTagA(self, token): afeAElement = self.tree.elementInActiveFormattingElements("a") if afeAElement: - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'a', u'a')) - self.endTagFormatting("a") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) if afeAElement in self.tree.openElements: self.tree.openElements.remove(afeAElement) if afeAElement in self.tree.activeFormattingElements: self.tree.activeFormattingElements.remove(afeAElement) self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagFormatting(self, name, 
attributes): + def startTagFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagNobr(self, name, attributes): + def startTagNobr(self, token): self.tree.reconstructActiveFormattingElements() if self.tree.elementInScope("nobr"): - self.parser.parseError(_(u"Unexpected start tag (%s) implies " - u"end tag (%s).") % (u'nobr', u'nobr')) - self.processEndTag("nobr") + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) # XXX Need tests that trigger the following self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(name, attributes) + self.addFormattingElement(token) - def startTagButton(self, name, attributes): + def startTagButton(self, token): if self.tree.elementInScope("button"): - self.parser.parseError(_(u"Unexpected start tag (%s) implied " - u"end tag (%s).") % (u'button', u'button')) - self.processEndTag("button") - self.parser.phase.processStartTag(name, attributes) + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + self.parser.phase.processStartTag(token) else: self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagMarqueeObject(self, name, attributes): + def startTagAppletMarqueeObject(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False - def startTagXmp(self, name, attributes): + def startTagXmp(self, token): self.tree.reconstructActiveFormattingElements() - 
self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") + self.parser.framesetOK = False - def startTagTable(self, name, attributes): - if self.tree.elementInScope("p"): - self.processEndTag("p") - self.tree.insertElement(name, attributes) + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False self.parser.phase = self.parser.phases["inTable"] - def startTagVoidFormatting(self, name, attributes): + def startTagVoidFormatting(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagHr(self, name, attributes): + def startTagHr(self, token): if self.tree.elementInScope("p"): - self.endTagP("p") - self.tree.insertElement(name, attributes) + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False - def startTagImage(self, name, attributes): + def startTagImage(self, token): # No really... - self.parser.parseError(_(u"Unexpected start tag (image). 
Treated " - u"as img.")) - self.processStartTag("img", attributes) + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) - def startTagInput(self, name, attributes): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - if self.tree.formPointer: - # XXX Not exactly sure what to do here - self.tree.openElements[-1].form = self.tree.formPointer - self.tree.openElements.pop() - - def startTagIsIndex(self, name, attributes): - self.parser.parseError("Unexpected start tag isindex. Don't use it!") + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) if self.tree.formPointer: return - self.processStartTag("form", {}) - self.processStartTag("hr", {}) - self.processStartTag("p", {}) - self.processStartTag("label", {}) + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Insert your search keywords here: " self.processCharacters( - "This is a searchable index. 
Insert your search keywords here: ") + {"type":tokenTypes["Characters"], "data":prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] attributes["name"] = "isindex" - attrs = [[key,value] for key,value in attributes.iteritems()] - self.processStartTag("input", dict(attrs)) - self.processEndTag("label") - self.processEndTag("p") - self.processStartTag("hr", {}) - self.processEndTag("form") + self.processStartTag(impliedTagToken("input", "StartTag", + attributes = attributes, + selfClosing = + token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) - def startTagTextarea(self, name, attributes): + def startTagTextarea(self, token): # XXX Form element pointer checking here as well... - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"] self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False - def startTagCdata(self, name, attributes): + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagCdata(token) + + def startTagCdata(self, token): """iframe, noembed noframes, noscript(if scripting enabled)""" - self.tree.insertElement(name, attributes) - self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + self.parser.parseRCDataCData(token, "CDATA") - def startTagSelect(self, name, attributes): + def startTagOpt(self, token): + if self.tree.elementInScope("option"): + self.parser.phase.processEndTag(impliedTagToken("option")) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) - self.parser.phase = self.parser.phases["inSelect"] + self.parser.tree.insertElement(token) - def startTagMisplaced(self, name, attributes): + def 
startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] + + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + while self.tree.openElements[-1].name != "ruby": + self.tree.openElements.pop() + self.tree.insertElement(token) + + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + #Need to get the parse error right for the case where the token + #has a namespace not equal to the xmlns attribute + if self.parser.phase != self.parser.phases["inForeignContent"]: + self.parser.secondaryPhase = self.parser.phase + self.parser.phase = self.parser.phases["inForeignContent"] + if 
token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + def startTagMisplaced(self, token): """ Elements that should be children of other elements that have a different insertion mode; here they are ignored "caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr", "noscript" """ - self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - def startTagNew(self, name, attributes): + def startTagNew(self, token): """New HTML5 elements, "event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command" """ - sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) - self.startTagOther(name, attributes) + #2007-08-30 - MAP - commenting out this write to sys.stderr because + # it's really annoying me when I run the validator tests + #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name) + self.startTagOther(token) #raise NotImplementedError - def startTagOther(self, name, attributes): + def startTagOther(self, token): self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) - def endTagP(self, name): + def endTagP(self, token): if self.tree.elementInScope("p"): self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": - self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',)) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) if self.tree.elementInScope("p"): while self.tree.elementInScope("p"): self.tree.openElements.pop() else: - self.startTagCloseP("p", {}) - self.endTagP("p") + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.endTagP(impliedTagToken("p")) - def endTagBody(self, name): + def endTagBody(self, token): # XXX Need to take open tags into account here. 
We shouldn't imply # but we should not throw a parse error either. Specification is # likely to be updated. - if self.tree.openElements[1].name != "body": + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): # innerHTML case self.parser.parseError() return - if self.tree.openElements[-1].name != "body": - self.parser.parseError(_(u"Unexpected end tag (%s). Missing " - u"end tag (%s).") % (u'body', self.tree.openElements[-1].name)) + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "p", + "tbody", "td", "tfoot", + "th", "thead", "tr")): + #Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"expectedName": "body", "gotName": node.name}) + break self.parser.phase = self.parser.phases["afterBody"] - def endTagHtml(self, name): - self.endTagBody(name) + def endTagHtml(self, token): + self.endTagBody(impliedTagToken("body")) if not self.parser.innerHTML: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagBlock(self, name): + def endTagBlock(self, token): #Put us back in the right whitespace handling mode - if name == "pre": + if token["name"] == "pre": self.processSpaceCharacters = self.processSpaceCharactersNonPre - inScope = self.tree.elementInScope(name) + inScope = self.tree.elementInScope(token["name"]) if inScope: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) if inScope: node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagForm(self, name): - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (form) seen too early. Ignored.")) - else: - self.tree.openElements.pop() + def endTagForm(self, token): + node = self.tree.formPointer self.tree.formPointer = None + if node is None or not self.tree.elementInScope(token["name"]): + self.parser.parseError("unexpected-end-tag", + {"name":"form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) - def endTagListItem(self, name): + def endTagListItem(self, token): # AT Could merge this with the Block case - if self.tree.elementInScope(name): - self.tree.generateImpliedEndTags(name) + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags(token["name"]) - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"End tag (%s) seen too " - u"early. 
Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): node = self.tree.openElements.pop() - while node.name != name: + while node.name != token["name"]: node = self.tree.openElements.pop() - def endTagHeading(self, name): + def endTagHeading(self, token): for item in headingElements: if self.tree.elementInScope(item): self.tree.generateImpliedEndTags() break - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). " - u"Expected other end tag.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) for item in headingElements: if self.tree.elementInScope(item): @@ -1073,38 +1345,37 @@ class InBodyPhase(Phase): item = self.tree.openElements.pop() break - def endTagFormatting(self, name): - """The much-feared adoption agency algorithm - """ + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency # XXX Better parseError messages appreciated. 
+ name = token["name"] while True: # Step 1 paragraph 1 - afeElement = self.tree.elementInActiveFormattingElements(name) + afeElement = self.tree.elementInActiveFormattingElements( + token["name"]) if not afeElement or (afeElement in self.tree.openElements and not self.tree.elementInScope(afeElement.name)): - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.1", {"name": token["name"]}) return # Step 1 paragraph 2 elif afeElement not in self.tree.openElements: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) self.tree.activeFormattingElements.remove(afeElement) return # Step 1 paragraph 3 if afeElement != self.tree.openElements[-1]: - self.parser.parseError(_(u"End tag (%s) violates " - u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,)) + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) # Step 2 # Start of the adoption agency algorithm proper afeIndex = self.tree.openElements.index(afeElement) furthestBlock = None for element in self.tree.openElements[afeIndex:]: - if element.name in specialElements | scopingElements: + if (element.nameTuple in + specialElements | scopingElements): furthestBlock = element break @@ -1118,17 +1389,17 @@ class InBodyPhase(Phase): commonAncestor = self.tree.openElements[afeIndex-1] # Step 5 - if furthestBlock.parent: - furthestBlock.parent.removeChild(furthestBlock) + #if furthestBlock.parent: + # furthestBlock.parent.removeChild(furthestBlock) - # Step 6 + # Step 5 # The bookmark is supposed to help us identify where to reinsert # nodes in step 12. We have to ensure that we reinsert nodes after # the node before the active formatting element. 
Note the bookmark # can move in step 7.4 bookmark = self.tree.activeFormattingElements.index(afeElement) - # Step 7 + # Step 6 lastNode = node = furthestBlock while True: # AT replace this with a function and recursion? @@ -1140,26 +1411,24 @@ class InBodyPhase(Phase): node = self.tree.openElements[ self.tree.openElements.index(node)-1] self.tree.openElements.remove(tmpNode) - # Step 7.3 + # Step 6.3 if node == afeElement: break - # Step 7.4 + # Step 6.4 if lastNode == furthestBlock: - # XXX should this be index(node) or index(node)+1 - # Anne: I think +1 is ok. Given x = [2,3,4,5] - # x.index(3) gives 1 and then x[1 +1] gives 4... - bookmark = self.tree.activeFormattingElements.\ - index(node) + 1 - # Step 7.5 - cite = node.parent - if node.hasContent(): - clone = node.cloneNode() - # Replace node with clone - self.tree.activeFormattingElements[ - self.tree.activeFormattingElements.index(node)] = clone - self.tree.openElements[ - self.tree.openElements.index(node)] = clone - node = clone + bookmark = (self.tree.activeFormattingElements.index(node) + + 1) + # Step 6.5 + #cite = node.parent + #if node.hasContent(): + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + # Step 7.6 # Remove lastNode from its parents, if any if lastNode.parent: @@ -1167,87 +1436,101 @@ class InBodyPhase(Phase): node.appendChild(lastNode) # Step 7.7 lastNode = node - # End of inner loop + # End of inner loop - # Step 8 + # Step 7 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster parent the + # lastNode if lastNode.parent: lastNode.parent.removeChild(lastNode) commonAncestor.appendChild(lastNode) - # Step 9 + # Step 8 clone = afeElement.cloneNode() - # Step 10 + # Step 9 furthestBlock.reparentChildren(clone) - # Step 11 + # Step 10 
furthestBlock.appendChild(clone) - # Step 12 + # Step 11 self.tree.activeFormattingElements.remove(afeElement) self.tree.activeFormattingElements.insert(bookmark, clone) - # Step 13 + # Step 12 self.tree.openElements.remove(afeElement) self.tree.openElements.insert( self.tree.openElements.index(furthestBlock) + 1, clone) - def endTagButtonMarqueeObject(self, name): - if self.tree.elementInScope(name): + def endTagAppletButtonMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if self.tree.elementInScope(name): + if self.tree.elementInScope(token["name"]): element = self.tree.openElements.pop() - while element.name != name: + while element.name != token["name"]: element = self.tree.openElements.pop() self.tree.clearActiveFormattingElements() - def endTagMisplaced(self, name): - # This handles elements with end tags in other insertion modes. - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagBr(self, name): - self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element.")) + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(name, {}) + self.tree.insertElement(impliedTagToken("br", "StartTag")) self.tree.openElements.pop() - def endTagNone(self, name): - # This handles elements with no end tag. 
- self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,)) - - def endTagCdataTextAreaXmp(self, name): - if self.tree.openElements[-1].name == name: - self.tree.openElements.pop() - else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) - - def endTagNew(self, name): - """New HTML5 elements, "event-source", "section", "nav", - "article", "aside", "header", "footer", "datagrid", "command" - """ - sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name) - self.endTagOther(name) - #raise NotImplementedError - - def endTagOther(self, name): - # XXX This logic should be moved into the treebuilder - # AT should use reversed instead of [::-1] when Python 2.4 == True. + def endTagOther(self, token): for node in self.tree.openElements[::-1]: - if node.name == name: + if node.name == token["name"]: self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,)) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while self.tree.openElements.pop() != node: pass break else: - if node.name in specialElements | scopingElements: - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + if (node.nameTuple in + specialElements | scopingElements): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) break +class InCDataRCDataPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.startTagHandler = utils.MethodDispatcher([]) + self.startTagHandler.default = self.startTagOther + self.endTagHandler = utils.MethodDispatcher([ + ("script", self.endTagScript)]) + self.endTagHandler.default = self.endTagOther + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + self.tree.openElements[-1].name) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + self.parser.phase.processEOF() + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in (R)CDATA mode"%name + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + #The rest of this method is all stuff that only happens if + #document.write works + + def endTagOther(self, token): + node = self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + class InTablePhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-table def __init__(self, parser, tree): @@ -1259,7 +1542,9 @@ class InTablePhase(Phase): ("col", self.startTagCol), (("tbody", "tfoot", "thead"), self.startTagRowGroup), (("td", "th", "tr"), self.startTagImplyTbody), - ("table", self.startTagTable) + ("table", self.startTagTable), + (("style", "script"), self.startTagStyleScript), + ("input", self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1274,66 +1559,101 @@ class InTablePhase(Phase): def clearStackToTableContext(self): # "clear the stack back to a table context" while self.tree.openElements[-1].name not in ("table", "html"): - 
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() # When the current node is it's an innerHTML case + def getCurrentTable(self): + i = -1 + while -i <= len(self.tree.openElements) and self.tree.openElements[i].name != "table": + i -= 1 + if -i > len(self.tree.openElements): + return self.tree.openElements[0] + else: + return self.tree.openElements[i] + # processing methods - def processCharacters(self, data): - self.parser.parseError(_(u"Unexpected non-space characters in " - u"table context caused voodoo mode.")) - # Make all the special element rearranging voodoo kick in + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + #Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.characterTokens.append(token) + + def processCharacters(self, token): + #If we get here there must be at least one non-whitespace character + # Do the table magic! 
self.tree.insertFromTable = True - # Process the character in the "in body" mode - self.parser.phases["inBody"].processCharacters(data) + self.parser.phases["inBody"].processCharacters(token) self.tree.insertFromTable = False - def startTagCaption(self, name, attributes): + def startTagCaption(self, token): self.clearStackToTableContext() self.tree.activeFormattingElements.append(Marker) - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCaption"] - def startTagColgroup(self, name, attributes): + def startTagColgroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inColumnGroup"] - def startTagCol(self, name, attributes): - self.startTagColgroup("colgroup", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagRowGroup(self, name, attributes): + def startTagRowGroup(self, token): self.clearStackToTableContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inTableBody"] - def startTagImplyTbody(self, name, attributes): - self.startTagRowGroup("tbody", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTable(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (table) in table " - u"phase. 
Implies end tag (table).")) - self.parser.phase.processEndTag("table") + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) if not self.parser.innerHTML: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def startTagStyleScript(self, token): + self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + self.parser.parseError("unexpected-hidden-input-in-table") + self.tree.insertElement(token) + # XXX associate with form + self.tree.openElements.pop() + else: + self.startTagOther(token) + + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! self.tree.insertFromTable = True - # Process the start tag in the "in body" mode - self.parser.phases["inBody"].processStartTag(name, attributes) + self.parser.phases["inBody"].processStartTag(token) self.tree.insertFromTable = False - def endTagTable(self, name): + def endTagTable(self, token): if self.tree.elementInScope("table", True): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": - self.parser.parseError(_(u"Unexpected end tag (table). 
" - u"Expected end tag (%s).") % (self.tree.openElements[-1].name,)) + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1343,18 +1663,61 @@ class InTablePhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in " - u"table context caused voodoo mode.") % (name,)) - # Make all the special element rearranging voodoo kick in + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + if "tainted" not in self.getCurrentTable()._flags: + self.getCurrentTable()._flags.append("tainted") + # Do the table magic! 
self.tree.insertFromTable = True - # Process the end tag in the "in body" mode - self.parser.phases["inBody"].processEndTag(name) + self.parser.phases["inBody"].processEndTag(token) self.tree.insertFromTable = False +class InTableTextPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any([item not in spaceCharacters for item in data]): + token = {"type":tokenTypes["Characters"], "data":data} + self.originalPhase.processCharacters(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processComment(token) + + def processEOF(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEOF(token) + + def processCharacters(self, token): + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + #pretty sure we should never reach here + self.characterTokens.append(token) +# assert False + + def processStartTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processStartTag(token) + + def processEndTag(self, token): + self.flushCharacters() + self.phase = self.originalPhase + self.phase.processEndTag(token) + class InCaptionPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-caption @@ -1379,27 +1742,31 @@ class InCaptionPhase(Phase): def ignoreEndTagCaption(self): return not self.tree.elementInScope("caption", True) - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() - def startTagTableElement(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) + + def 
startTagTableElement(self, token): self.parser.parseError() #XXX Have to duplicate logic here to find out if the tag is ignored ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) - def endTagCaption(self, name): + def endTagCaption(self, token): if not self.ignoreEndTagCaption(): # AT this code is quite similar to endTagTable in "InTable" self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "caption": - self.parser.parseError(_(u"Unexpected end tag (caption). " - u"Missing end tags.")) + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() self.tree.openElements.pop() @@ -1410,18 +1777,18 @@ class InCaptionPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): self.parser.parseError() ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag("caption") + self.parser.phase.processEndTag(impliedTagToken("caption")) if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) class InColumnGroupPhase(Phase): @@ -1445,23 +1812,33 @@ class InColumnGroupPhase(Phase): def ignoreEndTagColgroup(self): return self.tree.openElements[-1].name == "html" - def processCharacters(self, data): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup("colgroup") - if not ignoreEndTag: - self.parser.phase.processCharacters(data) + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup("colgroup") + if not ignoreEndTag: + self.parser.phase.processEOF() - def startTagCol(self, name ,attributes): - self.tree.insertElement(name, attributes) + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + self.parser.phase.processCharacters(token) + + def startTagCol(self, token): + self.tree.insertElement(token) self.tree.openElements.pop() - def startTagOther(self, name, attributes): + def startTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def endTagColgroup(self, name): + def endTagColgroup(self, token): if self.ignoreEndTagColgroup(): # innerHTML case assert self.parser.innerHTML @@ -1470,15 +1847,14 @@ class InColumnGroupPhase(Phase): self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] - def endTagCol(self, name): - self.parser.parseError(_(u"Unexpected end tag (col). 
" - u"col has no end tag.")) + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) - def endTagOther(self, name): + def endTagOther(self, token): ignoreEndTag = self.ignoreEndTagColgroup() self.endTagColgroup("colgroup") if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) class InTableBodyPhase(Phase): @@ -1489,7 +1865,8 @@ class InTableBodyPhase(Phase): ("html", self.startTagHtml), ("tr", self.startTagTr), (("td", "th"), self.startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther) + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + self.startTagTableOther) ]) self.startTagHandler.default = self.startTagOther @@ -1505,62 +1882,76 @@ class InTableBodyPhase(Phase): def clearStackToTableBodyContext(self): while self.tree.openElements[-1].name not in ("tbody", "tfoot", "thead", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") % (self.tree.openElements[-1].name,)) + #self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML # the rest - def processCharacters(self,data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTr(self, name, attributes): + def processCharacters(self, token): + self.parser.phases["inTable"].processCharacters(token) + + def startTagTr(self, token): self.clearStackToTableBodyContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inRow"] - def startTagTableCell(self, name, attributes): - self.parser.parseError(_(u"Unexpected table cell 
start tag (%s) in the table body phase.") % (name,)) - self.startTagTr("tr", {}) - self.parser.phase.processStartTag(name, attributes) + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + self.parser.phase.processStartTag(token) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): # XXX AT Any ideas on how to share this with endTagTable? if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processStartTag(name, attributes) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.clearStackToTableBodyContext() self.tree.openElements.pop() self.parser.phase = self.parser.phases["inTable"] else: - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. 
Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagTable(self, name): + def endTagTable(self, token): if (self.tree.elementInScope("tbody", True) or self.tree.elementInScope("thead", True) or self.tree.elementInScope("tfoot", True)): self.clearStackToTableBodyContext() - self.endTagTableRowGroup(self.tree.openElements[-1].name) - self.parser.phase.processEndTag(name) + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InRowPhase(Phase): @@ -1587,33 +1978,40 @@ class InRowPhase(Phase): # helper methods (XXX unify this with other table helper methods) def clearStackToTableRowContext(self): while self.tree.openElements[-1].name not in ("tr", "html"): - self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") % (self.tree.openElements[-1].name,)) + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() def ignoreEndTagTr(self): return not self.tree.elementInScope("tr", tableVariant=True) # the rest - def processCharacters(self, data): - self.parser.phases["inTable"].processCharacters(data) + def processEOF(self): + self.parser.phases["inTable"].processEOF() + + def processSpaceCharacters(self, token): + self.parser.phases["inTable"].processSpaceCharacters(token) - def startTagTableCell(self, name, attributes): + def processCharacters(self, token): + 
self.parser.phases["inTable"].processCharacters(token) + + def startTagTableCell(self, token): self.clearStackToTableRowContext() - self.tree.insertElement(name, attributes) + self.tree.insertElement(token) self.parser.phase = self.parser.phases["inCell"] self.tree.activeFormattingElements.append(Marker) - def startTagTableOther(self, name, attributes): + def startTagTableOther(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) - def startTagOther(self, name, attributes): - self.parser.phases["inTable"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inTable"].processStartTag(token) - def endTagTr(self, name): + def endTagTr(self, token): if not self.ignoreEndTagTr(): self.clearStackToTableRowContext() self.tree.openElements.pop() @@ -1623,27 +2021,28 @@ class InRowPhase(Phase): assert self.parser.innerHTML self.parser.parseError() - def endTagTable(self, name): + def endTagTable(self, token): ignoreEndTag = self.ignoreEndTagTr() self.endTagTr("tr") # Reprocess the current tag if the tr end tag was not ignored # XXX how are we sure it's always ignored in the innerHTML case? if not ignoreEndTag: - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) - def endTagTableRowGroup(self, name): - if self.tree.elementInScope(name, True): + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], True): self.endTagTr("tr") - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # innerHTML case self.parser.parseError() - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. 
Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) - def endTagOther(self, name): - self.parser.phases["inTable"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inTable"].processEndTag(token) class InCellPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-cell @@ -1666,60 +2065,63 @@ class InCellPhase(Phase): # helper def closeCell(self): if self.tree.elementInScope("td", True): - self.endTagTableCell("td") + self.endTagTableCell(impliedTagToken("td")) elif self.tree.elementInScope("th", True): - self.endTagTableCell("th") + self.endTagTableCell(impliedTagToken("th")) # the rest - def processCharacters(self, data): - self.parser.phases["inBody"].processCharacters(data) + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + self.parser.phases["inBody"].processCharacters(token) - def startTagTableOther(self, name, attributes): - if self.tree.elementInScope("td", True) or \ - self.tree.elementInScope("th", True): + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", True) or + self.tree.elementInScope("th", True)): self.closeCell() - self.parser.phase.processStartTag(name, attributes) + self.parser.phase.processStartTag(token) else: # innerHTML case self.parser.parseError() - def startTagOther(self, name, attributes): - self.parser.phases["inBody"].processStartTag(name, attributes) + def startTagOther(self, token): + self.parser.phases["inBody"].processStartTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.startTagHandler.default =\ self.parser.phases["inBody"].processStartTag - def endTagTableCell(self, name): - if self.tree.elementInScope(name, True): - self.tree.generateImpliedEndTags(name) - if self.tree.openElements[-1].name != name: - self.parser.parseError("Got table cell end tag (" + name +\ - ") while required end tags are missing.") + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], True): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", + {"name": token["name"]}) while True: node = self.tree.openElements.pop() - if node.name == name: + if node.name == token["name"]: break else: self.tree.openElements.pop() self.tree.clearActiveFormattingElements() self.parser.phase = self.parser.phases["inRow"] else: - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagIgnore(self, name): - self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagImply(self, name): - if self.tree.elementInScope(name, True): + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], True): self.closeCell() - self.parser.phase.processEndTag(name) + self.parser.phase.processEndTag(token) else: # sometimes innerHTML case self.parser.parseError() - def endTagOther(self, name): - self.parser.phases["inBody"].processEndTag(name) + def endTagOther(self, token): + self.parser.phases["inBody"].processEndTag(token) # Optimize this for subsequent invocations. Can't do this initially # because self.phases doesn't really exist at that point. 
self.endTagHandler.default = self.parser.phases["inBody"].processEndTag @@ -1733,7 +2135,8 @@ class InSelectPhase(Phase): ("html", self.startTagHtml), ("option", self.startTagOption), ("optgroup", self.startTagOptgroup), - ("select", self.startTagSelect) + ("select", self.startTagSelect), + (("input", "keygen", "textarea"), self.startTagInput) ]) self.startTagHandler.default = self.startTagOther @@ -1747,52 +2150,63 @@ class InSelectPhase(Phase): self.endTagHandler.default = self.endTagOther # http://www.whatwg.org/specs/web-apps/current-work/#in-select - def processCharacters(self, data): - self.tree.insertText(data) + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML - def startTagOption(self, name, attributes): + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def startTagOption(self, token): # We need to imply
tags into account here. We shouldn't imply #