Update to the latest html5lib; replace feedparser's sanitizer with

html5lib's
This commit is contained in:
Sam Ruby 2009-09-09 10:54:21 -04:00
parent 63fa05e556
commit 6f0f23dd36
32 changed files with 4868 additions and 2386 deletions

View File

@ -16,7 +16,7 @@ Todo:
import re, time, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
from html5lib import liberalxmlparser
from html5lib import html5parser
from html5lib.treebuilders import dom
import planet, config
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
bozo=1
if detail.type.find('xhtml')<0 or bozo:
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue

View File

@ -128,5 +128,11 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)
# Run this through HTML5's serializer
from html5lib import html5parser, sanitizer, treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
doc = p.parseFragment(node.value, encoding='utf-8')
walker = treewalkers.getTreeWalker('simpletree')
xhtml = serializer.XHTMLSerializer()
tree = xhtml.serialize(walker(doc), encoding='utf-8')
node['value'] = ''.join([n for n in tree])

View File

@ -11,5 +11,6 @@ f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
"""
from html5parser import HTMLParser
from liberalxmlparser import XMLParser, XHTMLParser
from html5parser import HTMLParser, parse
from treebuilders import getTreeBuilder
from serializer import serialize

View File

@ -1,4 +1,5 @@
import string
import string, gettext
_ = gettext.gettext
try:
frozenset
@ -9,6 +10,260 @@ except NameError:
EOF = None
E = {
"null-character":
_(u"Null character in input stream, replaced with U+FFFD."),
"invalid-character":
_(u"Invalid codepoint in stream."),
"incorrectly-placed-solidus":
_(u"Solidus (/) incorrectly placed in tag."),
"incorrect-cr-newline-entity":
_(u"Incorrect CR newline entity, replaced with LF."),
"illegal-windows-1252-entity":
_(u"Entity used with illegal number (windows-1252 reference)."),
"cant-convert-numeric-entity":
_(u"Numeric entity couldn't be converted to character "
u"(codepoint U+%(charAsInt)08x)."),
"illegal-codepoint-for-numeric-entity":
_(u"Numeric entity represents an illegal codepoint: "
u"U+%(charAsInt)08x."),
"numeric-entity-without-semicolon":
_(u"Numeric entity didn't end with ';'."),
"expected-numeric-entity-but-got-eof":
_(u"Numeric entity expected. Got end of file instead."),
"expected-numeric-entity":
_(u"Numeric entity expected but none found."),
"named-entity-without-semicolon":
_(u"Named entity didn't end with ';'."),
"expected-named-entity":
_(u"Named entity expected. Got none."),
"attributes-in-end-tag":
_(u"End tag contains unexpected attributes."),
"expected-tag-name-but-got-right-bracket":
_(u"Expected tag name. Got '>' instead."),
"expected-tag-name-but-got-question-mark":
_(u"Expected tag name. Got '?' instead. (HTML doesn't "
u"support processing instructions.)"),
"expected-tag-name":
_(u"Expected tag name. Got something else instead"),
"expected-closing-tag-but-got-right-bracket":
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
"expected-closing-tag-but-got-eof":
_(u"Expected closing tag. Unexpected end of file."),
"expected-closing-tag-but-got-char":
_(u"Expected closing tag. Unexpected character '%(data)s' found."),
"eof-in-tag-name":
_(u"Unexpected end of file in the tag name."),
"expected-attribute-name-but-got-eof":
_(u"Unexpected end of file. Expected attribute name instead."),
"eof-in-attribute-name":
_(u"Unexpected end of file in attribute name."),
"invalid-character-in-attribute-name":
_(u"Invalid chracter in attribute name"),
"duplicate-attribute":
_(u"Dropped duplicate attribute on tag."),
"expected-end-of-tag-name-but-got-eof":
_(u"Unexpected end of file. Expected = or end of tag."),
"expected-attribute-value-but-got-eof":
_(u"Unexpected end of file. Expected attribute value."),
"expected-attribute-value-but-got-right-bracket":
_(u"Expected attribute value. Got '>' instead."),
"eof-in-attribute-value-double-quote":
_(u"Unexpected end of file in attribute value (\")."),
"eof-in-attribute-value-single-quote":
_(u"Unexpected end of file in attribute value (')."),
"eof-in-attribute-value-no-quotes":
_(u"Unexpected end of file in attribute value."),
"unexpected-EOF-after-solidus-in-tag":
_(u"Unexpected end of file in tag. Expected >"),
"unexpected-character-after-soldius-in-tag":
_(u"Unexpected character after / in tag. Expected >"),
"expected-dashes-or-doctype":
_(u"Expected '--' or 'DOCTYPE'. Not found."),
"incorrect-comment":
_(u"Incorrect comment."),
"eof-in-comment":
_(u"Unexpected end of file in comment."),
"eof-in-comment-end-dash":
_(u"Unexpected end of file in comment (-)"),
"unexpected-dash-after-double-dash-in-comment":
_(u"Unexpected '-' after '--' found in comment."),
"eof-in-comment-double-dash":
_(u"Unexpected end of file in comment (--)."),
"unexpected-char-in-comment":
_(u"Unexpected character in comment found."),
"need-space-after-doctype":
_(u"No space after literal string 'DOCTYPE'."),
"expected-doctype-name-but-got-right-bracket":
_(u"Unexpected > character. Expected DOCTYPE name."),
"expected-doctype-name-but-got-eof":
_(u"Unexpected end of file. Expected DOCTYPE name."),
"eof-in-doctype-name":
_(u"Unexpected end of file in DOCTYPE name."),
"eof-in-doctype":
_(u"Unexpected end of file in DOCTYPE."),
"expected-space-or-right-bracket-in-doctype":
_(u"Expected space or '>'. Got '%(data)s'"),
"unexpected-end-of-doctype":
_(u"Unexpected end of DOCTYPE."),
"unexpected-char-in-doctype":
_(u"Unexpected character in DOCTYPE."),
"eof-in-innerhtml":
_(u"XXX innerHTML EOF"),
"unexpected-doctype":
_(u"Unexpected DOCTYPE. Ignored."),
"non-html-root":
_(u"html needs to be the first start tag."),
"expected-doctype-but-got-eof":
_(u"Unexpected End of file. Expected DOCTYPE."),
"unknown-doctype":
_(u"Erroneous DOCTYPE."),
"expected-doctype-but-got-chars":
_(u"Unexpected non-space characters. Expected DOCTYPE."),
"expected-doctype-but-got-start-tag":
_(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
"expected-doctype-but-got-end-tag":
_(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
"end-tag-after-implied-root":
_(u"Unexpected end tag (%(name)s) after the (implied) root element."),
"expected-named-closing-tag-but-got-eof":
_(u"Unexpected end of file. Expected end tag (%(name)s)."),
"two-heads-are-not-better-than-one":
_(u"Unexpected start tag head in existing head. Ignored."),
"unexpected-end-tag":
_(u"Unexpected end tag (%(name)s). Ignored."),
"unexpected-start-tag-out-of-my-head":
_(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
"unexpected-start-tag":
_(u"Unexpected start tag (%(name)s)."),
"missing-end-tag":
_(u"Missing end tag (%(name)s)."),
"missing-end-tags":
_(u"Missing end tags (%(name)s)."),
"unexpected-start-tag-implies-end-tag":
_(u"Unexpected start tag (%(startName)s) "
u"implies end tag (%(endName)s)."),
"unexpected-start-tag-treated-as":
_(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"deprecated-tag":
_(u"Unexpected start tag %(name)s. Don't use it!"),
"unexpected-start-tag-ignored":
_(u"Unexpected start tag %(name)s. Ignored."),
"expected-one-end-tag-but-got-another":
_(u"Unexpected end tag (%(gotName)s). "
u"Missing end tag (%(expectedName)s)."),
"end-tag-too-early":
_(u"End tag (%(name)s) seen too early. Expected other end tag."),
"end-tag-too-early-named":
_(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"end-tag-too-early-ignored":
_(u"End tag (%(name)s) seen too early. Ignored."),
"adoption-agency-1.1":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 1 of the adoption agency algorithm."),
"adoption-agency-1.2":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 2 of the adoption agency algorithm."),
"adoption-agency-1.3":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 3 of the adoption agency algorithm."),
"unexpected-end-tag-treated-as":
_(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"no-end-tag":
_(u"This element (%(name)s) has no end tag."),
"unexpected-implied-end-tag-in-table":
_(u"Unexpected implied end tag (%(name)s) in the table phase."),
"unexpected-implied-end-tag-in-table-body":
_(u"Unexpected implied end tag (%(name)s) in the table body phase."),
"unexpected-char-implies-table-voodoo":
_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-end-tag-implies-table-voodoo":
_(u"Unexpected end tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-cell-in-table-body":
_(u"Unexpected table cell start tag (%(name)s) "
u"in the table body phase."),
"unexpected-cell-end-tag":
_(u"Got table cell end tag (%(name)s) "
u"while required end tags are missing."),
"unexpected-end-tag-in-table-body":
_(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"unexpected-implied-end-tag-in-table-row":
_(u"Unexpected implied end tag (%(name)s) in the table row phase."),
"unexpected-end-tag-in-table-row":
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"unexpected-select-in-select":
_(u"Unexpected select start tag in the select phase "
u"treated as select end tag."),
"unexpected-input-in-select":
_(u"Unexpected input start tag in the select phase."),
"unexpected-start-tag-in-select":
_(u"Unexpected start tag token (%(name)s in the select phase. "
u"Ignored."),
"unexpected-end-tag-in-select":
_(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
"unexpected-table-element-start-tag-in-select-in-table":
_(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
"unexpected-table-element-end-tag-in-select-in-table":
_(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
"unexpected-char-after-body":
_(u"Unexpected non-space characters in the after body phase."),
"unexpected-start-tag-after-body":
_(u"Unexpected start tag token (%(name)s)"
u" in the after body phase."),
"unexpected-end-tag-after-body":
_(u"Unexpected end tag token (%(name)s)"
u" in the after body phase."),
"unexpected-char-in-frameset":
_(u"Unepxected characters in the frameset phase. Characters ignored."),
"unexpected-start-tag-in-frameset":
_(u"Unexpected start tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-frameset-in-frameset-innerhtml":
_(u"Unexpected end tag token (frameset) "
u"in the frameset phase (innerHTML)."),
"unexpected-end-tag-in-frameset":
_(u"Unexpected end tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-char-after-frameset":
_(u"Unexpected non-space characters in the "
u"after frameset phase. Ignored."),
"unexpected-start-tag-after-frameset":
_(u"Unexpected start tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-frameset":
_(u"Unexpected end tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
_(u"Unexpected end tag after body(innerHtml)"),
"expected-eof-but-got-char":
_(u"Unexpected non-space characters. Expected end of file."),
"expected-eof-but-got-start-tag":
_(u"Unexpected start tag (%(name)s)"
u". Expected end of file."),
"expected-eof-but-got-end-tag":
_(u"Unexpected end tag (%(name)s)"
u". Expected end of file."),
"eof-in-table":
_(u"Unexpected end of file. Expected table content."),
"eof-in-select":
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
contentModelFlags = {
"PCDATA":0,
"RCDATA":1,
@ -16,101 +271,126 @@ contentModelFlags = {
"PLAINTEXT":3
}
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
"svg":"http://www.w3.org/2000/svg",
"xlink":"http://www.w3.org/1999/xlink",
"xml":"http://www.w3.org/XML/1998/namespace",
"xmlns":"http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
"button",
"caption",
"html",
"marquee",
"object",
"table",
"td",
"th"
(namespaces["html"], "applet"),
(namespaces["html"], "button"),
(namespaces["html"], "caption"),
(namespaces["html"], "html"),
(namespaces["html"], "marquee"),
(namespaces["html"], "object"),
(namespaces["html"], "table"),
(namespaces["html"], "td"),
(namespaces["html"], "th"),
(namespaces["svg"], "foreignObject")
))
formattingElements = frozenset((
"a",
"b",
"big",
"em",
"font",
"i",
"nobr",
"s",
"small",
"strike",
"strong",
"tt",
"u"
(namespaces["html"], "a"),
(namespaces["html"], "b"),
(namespaces["html"], "big"),
(namespaces["html"], "code"),
(namespaces["html"], "em"),
(namespaces["html"], "font"),
(namespaces["html"], "i"),
(namespaces["html"], "nobr"),
(namespaces["html"], "s"),
(namespaces["html"], "small"),
(namespaces["html"], "strike"),
(namespaces["html"], "strong"),
(namespaces["html"], "tt"),
(namespaces["html"], "u")
))
specialElements = frozenset((
"address",
"area",
"base",
"basefont",
"bgsound",
"blockquote",
"body",
"br",
"center",
"col",
"colgroup",
"dd",
"dir",
"div",
"dl",
"dt",
"embed",
"fieldset",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"hr",
"iframe",
"image",
"img",
"input",
"isindex",
"li",
"link",
"listing",
"menu",
"meta",
"noembed",
"noframes",
"noscript",
"ol",
"optgroup",
"option",
"p",
"param",
"plaintext",
"pre",
"script",
"select",
"spacer",
"style",
"tbody",
"textarea",
"tfoot",
"thead",
"title",
"tr",
"ul",
"wbr"
(namespaces["html"], "address"),
(namespaces["html"], "area"),
(namespaces["html"], "article"),
(namespaces["html"], "aside"),
(namespaces["html"], "base"),
(namespaces["html"], "basefont"),
(namespaces["html"], "bgsound"),
(namespaces["html"], "blockquote"),
(namespaces["html"], "body"),
(namespaces["html"], "br"),
(namespaces["html"], "center"),
(namespaces["html"], "col"),
(namespaces["html"], "colgroup"),
(namespaces["html"], "command"),
(namespaces["html"], "datagrid"),
(namespaces["html"], "dd"),
(namespaces["html"], "details"),
(namespaces["html"], "dialog"),
(namespaces["html"], "dir"),
(namespaces["html"], "div"),
(namespaces["html"], "dl"),
(namespaces["html"], "dt"),
(namespaces["html"], "embed"),
(namespaces["html"], "event-source"),
(namespaces["html"], "fieldset"),
(namespaces["html"], "figure"),
(namespaces["html"], "footer"),
(namespaces["html"], "form"),
(namespaces["html"], "frame"),
(namespaces["html"], "frameset"),
(namespaces["html"], "h1"),
(namespaces["html"], "h2"),
(namespaces["html"], "h3"),
(namespaces["html"], "h4"),
(namespaces["html"], "h5"),
(namespaces["html"], "h6"),
(namespaces["html"], "head"),
(namespaces["html"], "header"),
(namespaces["html"], "hr"),
(namespaces["html"], "iframe"),
# Note that image is commented out in the spec as "this isn't an
# element that can end up on the stack, so it doesn't matter,"
(namespaces["html"], "image"),
(namespaces["html"], "img"),
(namespaces["html"], "input"),
(namespaces["html"], "isindex"),
(namespaces["html"], "li"),
(namespaces["html"], "link"),
(namespaces["html"], "listing"),
(namespaces["html"], "menu"),
(namespaces["html"], "meta"),
(namespaces["html"], "nav"),
(namespaces["html"], "noembed"),
(namespaces["html"], "noframes"),
(namespaces["html"], "noscript"),
(namespaces["html"], "ol"),
(namespaces["html"], "optgroup"),
(namespaces["html"], "option"),
(namespaces["html"], "p"),
(namespaces["html"], "param"),
(namespaces["html"], "plaintext"),
(namespaces["html"], "pre"),
(namespaces["html"], "script"),
(namespaces["html"], "section"),
(namespaces["html"], "select"),
(namespaces["html"], "spacer"),
(namespaces["html"], "style"),
(namespaces["html"], "tbody"),
(namespaces["html"], "textarea"),
(namespaces["html"], "tfoot"),
(namespaces["html"], "thead"),
(namespaces["html"], "title"),
(namespaces["html"], "tr"),
(namespaces["html"], "ul"),
(namespaces["html"], "wbr")
))
spaceCharacters = frozenset((
u"\t",
u"\n",
u"\u000B",
u"\u000C",
u" ",
u"\r"
@ -143,9 +423,10 @@ headingElements = (
"h6"
)
# XXX What about event-source and command?
voidElements = frozenset((
"base",
"command",
"event-source",
"link",
"meta",
"hr",
@ -155,7 +436,8 @@ voidElements = frozenset((
"param",
"area",
"col",
"input"
"input",
"source"
))
cdataElements = frozenset(('title', 'textarea'))
@ -440,7 +722,7 @@ entities = {
"kappa;": u"\u03BA",
"lArr;": u"\u21D0",
"lambda;": u"\u03BB",
"lang;": u"\u3008",
"lang;": u"\u27E8",
"laquo;": u"\u00AB",
"laquo": u"\u00AB",
"larr;": u"\u2190",
@ -520,7 +802,7 @@ entities = {
"quot": u"\u0022",
"rArr;": u"\u21D2",
"radic;": u"\u221A",
"rang;": u"\u3009",
"rang;": u"\u27E9",
"raquo;": u"\u00BB",
"raquo": u"\u00BB",
"rarr;": u"\u2192",
@ -596,221 +878,255 @@ entities = {
"zwnj;": u"\u200C"
}
encodings = frozenset((
"ansi_x3.4-1968",
"iso-ir-6",
"ansi_x3.4-1986",
"iso_646.irv:1991",
"ascii",
"iso646-us",
"us-ascii",
"us",
"ibm367",
"cp367",
"csascii",
"ks_c_5601-1987",
"korean",
"iso-2022-kr",
"csiso2022kr",
"euc-kr",
"iso-2022-jp",
"csiso2022jp",
"iso-2022-jp-2",
"iso-ir-58",
"chinese",
"csiso58gb231280",
"iso_8859-1:1987",
"iso-ir-100",
"iso_8859-1",
"iso-8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"iso_8859-2:1987",
"iso-ir-101",
"iso_8859-2",
"iso-8859-2",
"latin2",
"l2",
"csisolatin2",
"iso_8859-3:1988",
"iso-ir-109",
"iso_8859-3",
"iso-8859-3",
"latin3",
"l3",
"csisolatin3",
"iso_8859-4:1988",
"iso-ir-110",
"iso_8859-4",
"iso-8859-4",
"latin4",
"l4",
"csisolatin4",
"iso_8859-6:1987",
"iso-ir-127",
"iso_8859-6",
"iso-8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
"iso_8859-7:1987",
"iso-ir-126",
"iso_8859-7",
"iso-8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
"csisolatingreek",
"iso_8859-8:1988",
"iso-ir-138",
"iso_8859-8",
"iso-8859-8",
"hebrew",
"csisolatinhebrew",
"iso_8859-5:1988",
"iso-ir-144",
"iso_8859-5",
"iso-8859-5",
"cyrillic",
"csisolatincyrillic",
"iso_8859-9:1989",
"iso-ir-148",
"iso_8859-9",
"iso-8859-9",
"latin5",
"l5",
"csisolatin5",
"iso-8859-10",
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
"latin6",
"hp-roman8",
"roman8",
"r8",
"ibm037",
"cp037",
"csibm037",
"ibm424",
"cp424",
"csibm424",
"ibm437",
"cp437",
"437",
"cspc8codepage437",
"ibm500",
"cp500",
"csibm500",
"ibm775",
"cp775",
"cspc775baltic",
"ibm850",
"cp850",
"850",
"cspc850multilingual",
"ibm852",
"cp852",
"852",
"cspcp852",
"ibm855",
"cp855",
"855",
"csibm855",
"ibm857",
"cp857",
"857",
"csibm857",
"ibm860",
"cp860",
"860",
"csibm860",
"ibm861",
"cp861",
"861",
"cp-is",
"csibm861",
"ibm862",
"cp862",
"862",
"cspc862latinhebrew",
"ibm863",
"cp863",
"863",
"csibm863",
"ibm864",
"cp864",
"csibm864",
"ibm865",
"cp865",
"865",
"csibm865",
"ibm866",
"cp866",
"866",
"csibm866",
"ibm869",
"cp869",
"869",
"cp-gr",
"csibm869",
"ibm1026",
"cp1026",
"csibm1026",
"koi8-r",
"cskoi8r",
"koi8-u",
"big5-hkscs",
"ptcp154",
"csptcp154",
"pt154",
"cp154",
"utf-7",
"utf-16be",
"utf-16le",
"utf-16",
"utf-8",
"iso-8859-13",
"iso-8859-14",
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
"iso-8859-15",
"iso_8859-15",
"iso-8859-16",
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"gbk",
"cp936",
"ms936",
"gb18030",
"shift_jis",
"ms_kanji",
"csshiftjis",
"euc-jp",
"gb2312",
"big5",
"csbig5",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"tis-620",
"hz-gb-2312",
))
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype":0,
"Characters":1,
"SpaceCharacters":2,
"StartTag":3,
"EndTag":4,
"EmptyTag":5,
"Comment":6,
"ParseError":7
}
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]))
prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
pass
class ReparseException(Exception):
pass

View File

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token

View File

@ -14,7 +14,8 @@ class Filter(_base.Filter):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if token["data"] or not self.is_optional_start(token["name"], previous, next):
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
@ -31,7 +32,11 @@ class Filter(_base.Filter):
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == "StartTag"
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
@ -52,7 +57,7 @@ class Filter(_base.Filter):
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type == "StartTag":
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
@ -81,16 +86,13 @@ class Filter(_base.Filter):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'option', 'tr'):
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
@ -112,14 +114,39 @@ class Filter(_base.Filter):
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('address', 'blockquote', \
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':

View File

@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Tree-walker filter that feeds every token through the
    HTMLSanitizerMixin, dropping any token the sanitizer rejects.
    """
    def __iter__(self):
        source = _base.Filter.__iter__(self)
        for raw_token in source:
            clean_token = self.sanitize_token(raw_token)
            if not clean_token:
                continue
            yield clean_token

File diff suppressed because it is too large Load Diff

170
planet/vendor/html5lib/ihatexml.py vendored Normal file
View File

@ -0,0 +1,170 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
# XML 1.0 "Letter" production: BaseChar | Ideographic.
letter = " | ".join([baseChar, ideographic])

# Characters allowed after the first character of a name.  NOTE(review):
# the original comment here was truncated ("#Without the") — presumably
# "without the colon", which is indeed absent from this join; confirm
# against the XML NameChar production.
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
# Characters allowed as the first character of a name.
nameFirst = " | ".join([letter, "_"])

# Patterns for the "#xNNNN" single-character and "[#xNNNN-#xNNNN]" range
# notation used in the spec-derived strings above.  NOTE(review): the "|"
# inside "[\d|A-F]" is a literal character in the class — presumably
# unintended, but harmless for the hex data matched here.
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
    """Convert a spec-style " | "-separated string of "#xNNNN" characters,
    "[#xNNNN-#xNNNN]" ranges and single literal characters into a
    normalised list of [low, high] codepoint ranges.
    """
    ranges = []
    for fragment in chars.split(" | "):
        fragment = fragment.strip()
        for pattern in (reChar, reCharRange):
            matched = pattern.match(fragment)
            if matched is None:
                continue
            bounds = [hexToInt(group) for group in matched.groups()]
            if len(bounds) == 1:
                # A single "#xNNNN" character becomes a degenerate range.
                bounds = bounds * 2
            ranges.append(bounds)
            break
        else:
            # Not in #x notation: must be a single literal character.
            assert len(fragment) == 1
            ranges.append([ord(fragment)] * 2)
    return normaliseCharList(ranges)
def normaliseCharList(charList):
    """Sort a list of [low, high] codepoint ranges and coalesce any
    ranges that overlap or touch into single ranges.
    """
    ordered = sorted(charList)
    for low, high in ordered:
        assert high >= low
    merged = []
    index = 0
    count = len(ordered)
    while index < count:
        merged.append(ordered[index])
        step = 1
        # Absorb every following range that overlaps or is adjacent.
        while index + step < count and ordered[index + step][0] <= merged[-1][1] + 1:
            merged[-1][1] = ordered[index + step][1]
            step += 1
        index += step
    return merged
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)

def missingRanges(charList):
    """Return the complement of a normalised, non-empty list of
    [low, high] codepoint ranges over [0, max_unicode].
    """
    rv = []
    # Gap before the first range.  The original compared the first
    # *range* (a list) to 0, which is always true and emitted the invalid
    # range [0, -1] whenever coverage already starts at codepoint 0;
    # compare the lower bound instead.
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    # Gaps between consecutive ranges (input is normalised, so
    # consecutive ranges are separated by at least one codepoint).
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    # Gap after the last range, up to the top of the BMP.
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
def listToRegexpStr(charList):
    """Render a list of [low, high] codepoint ranges as a regexp
    character class, with each entry escaped via intToUnicodeStr.
    """
    parts = []
    for low, high in charList:
        if low == high:
            parts.append(intToUnicodeStr(low))
        else:
            parts.append(intToUnicodeStr(low) + "-" + intToUnicodeStr(high))
    return "[%s]" % "|".join(parts)
def hexToInt(hex_str):
    """Parse a hexadecimal string such as "00C0" into an integer."""
    return int(hex_str, base=16)
def intToUnicodeStr(intValue):
    """Return the character with codepoint `intValue`, escaped for safe
    use inside a generated regular expression.

    The original built a unicode literal as source text and ran it
    through eval() (its own comment called this "evil"); for BMP
    codepoints — all this module supports, see max_unicode — unichr()
    yields exactly the same character without evaluating generated code.
    """
    return escapeRegexp(unichr(intValue))
def escapeRegexp(string):
    """Escape regexp metacharacters in `string` for use in a generated
    pattern.

    NOTE(review): r"\\" is a two-character string (two backslashes), so
    each metacharacter is prefixed with a doubled backslash rather than
    a single one.  The shipped nonXmlBMPRegexp constant below was
    generated with exactly this behaviour, so it is preserved as-is.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, r"\\" + char)
        # Leftover debug output (Python 2 print statement): after the
        # replace above, `char` is still present in any string it
        # occurred in, so this prints every input containing that
        # metacharacter.
        if char in string:
            print string
    return string
#output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
class InfosetFilter(object):
    """Coerces names, comments and character data that are legal in HTML
    but not in the XML infoset into XML-safe equivalents.

    Characters that cannot appear in an XML Name are replaced by an
    escape of the form "U" + five uppercase hex digits (see escapeChar),
    which fromXmlName can reverse.
    """
    # Matches the "UXXXXX" escapes produced by escapeChar().
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
    def __init__(self, replaceChars = None,
                 replaceRanges = None,
                 dropXmlnsLocalName = False,
                 dropXmlnsAttrNs = False,
                 preventDoubleDashComments = False,
                 preventDashAtCommentEnd = False,
                 replaceFormFeedCharacters = True):
        # Custom replacement sets are not implemented; only the
        # module-level non-XML-character regexp is supported.
        if replaceRanges is not None or replaceChars is not None:
            raise NotImplementedError
        else:
            self.replaceCharsRegexp = nonXmlBMPRegexp

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs
        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd
        self.replaceFormFeedCharacters = replaceFormFeedCharacters
        # Memo of char -> "UXXXXX" escape, shared by all toXmlName calls.
        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        """Return an XML-safe attribute name, or None to drop the attribute."""
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            #Need a datalosswarning here
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name, namespace=None):
        """Return an XML-safe element name."""
        return self.toXmlName(name)

    def coerceComment(self, data):
        """Rewrite comment text so it contains no "--" sequence.

        NOTE(review): preventDashAtCommentEnd is accepted by __init__ but
        not acted on here — confirm whether that is intentional.
        """
        if self.preventDoubleDashComments:
            while "--" in data:
                data = data.replace("--", "- -")
        return data

    def coerceCharacters(self, data):
        """Replace characters invalid in XML character data (form feeds only)."""
        if self.replaceFormFeedCharacters:
            data = data.replace("\x0C", " ")
        #Other non-xml characters
        return data

    def toXmlName(self, name):
        """Escape every non-XML-Name character in `name` as "UXXXXX"."""
        replaceChars = set(self.replaceCharsRegexp.findall(name))
        for char in replaceChars:
            if char in self.replaceCache:
                replacement = self.replaceCache[char]
            else:
                replacement = self.escapeChar(char)
            name = name.replace(char, replacement)
        return name

    def fromXmlName(self, name):
        """Reverse toXmlName by expanding "UXXXXX" escapes.

        NOTE(review): any literal "U" + five hex digits already present in
        the original name is expanded too — the escaping is not injective.
        """
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        """Escape `char` as "U" + five uppercase hex digits and memoise it."""
        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        """Decode a "UXXXXX" escape back into its character (Python 2 unichr)."""
        return unichr(int(charcode[1:], 16))

View File

@ -1,15 +1,109 @@
import codecs
import re
import types
from gettext import gettext
_ = gettext
import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from utils import MethodDispatcher
from constants import encodings, ReparseException
class HTMLInputStream(object):
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream:
    """Buffering for streams that do not have buffering of their own.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2).
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset into that chunk

    def tell(self):
        """Return the absolute position within the data read so far."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Reposition within already-buffered data (cannot seek ahead).

        Raises AssertionError if `pos` lies beyond the buffered data.
        """
        # <= rather than <: seeking to the exact end of the buffered
        # data is a legal position (the next read falls through to the
        # underlying stream).
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            # Walk past whole chunks.  The original subtracted `pos`
            # here instead of the consumed chunk's length, which
            # mis-positioned any seek landing beyond the first chunk.
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Read up to `bytes` characters, serving buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffer: go to the stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read from the wrapped stream, appending the data to the buffer."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, falling through to the
        stream for whatever is not yet buffered."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The rest of the request fits in the current chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Consume the remainder of this chunk and move on.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            # (The original bound this append's None return to an unused
            # local `data`; dropped.)
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead
            bufferOffset = 0
        if remainingBytes:
            rv.append(self._readStream(remainingBytes))
        return "".join(rv)
class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
@ -17,11 +111,13 @@ class HTMLInputStream(object):
"""
_defaultChunkSize = 10240
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by the HTML5Lib.
for use by html5lib.
source can be either a file-object, local filename or a string.
@ -33,10 +129,17 @@ class HTMLInputStream(object):
parseMeta - Look for a <meta> element containing encoding information
"""
#Craziness
if len(u"\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur
self.newLines = [0]
self.charEncoding = encoding
self.charEncoding = (codecName(encoding), "certain")
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@ -52,17 +155,25 @@ class HTMLInputStream(object):
self.defaultEncoding = "windows-1252"
#Detect encoding iff no explicit "transport level" encoding is supplied
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.queue = []
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
self.line = self.col = 0
self.lineLengths = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
#Flag to indicate we may have a CR LF broken across a data chunk
self._lastChunkEndsWithCR = False
@ -80,22 +191,29 @@ class HTMLInputStream(object):
# Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode):
source = source.encode('utf-8')
self.charEncoding = "utf-8"
self.charEncoding = ("utf-8", "certain")
import cStringIO
stream = cStringIO.StringIO(str(source))
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
stream is sys.stdin):
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
#Guess with chardet, if avaliable
if encoding is None and chardet:
confidence = "tentative"
try:
from chardet.universaldetector import UniversalDetector
buffers = []
@ -108,11 +226,12 @@ class HTMLInputStream(object):
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence="tentative"
encoding = self.defaultEncoding
#Substitute for equivalent encodings:
@ -121,7 +240,21 @@ class HTMLInputStream(object):
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding
return encoding, confidence
def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
@ -149,198 +282,219 @@ class HTMLInputStream(object):
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.seek(string, encoding and seek or 0)
self.rawStream.seek(encoding and seek or 0)
return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
return parser.getEncoding()
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
return encoding
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count(u'\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind(u'\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self.line, self.col
line, col = self._position(self.chunkOffset)
return (line+1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
if not self.queue:
self.readChunk()
#If we still don't have a character we have reached EOF
if not self.queue:
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
char = self.queue.pop(0)
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
# update position in stream
if char == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return char
def readChunk(self, chunkSize=10240):
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
if not data:
return
#Replace null characters
for i in xrange(data.count(u"\u0000")):
self.errors.append(_('null character found in input stream, '
'replaced with U+FFFD'))
return False
self.reportCharacterErrors(data)
data = data.replace(u"\u0000", u"\ufffd")
#Check for CR LF broken across chunks
if (self._lastChunkEndsWithCR and data[0] == "\n"):
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
data = data[1:]
self._lastChunkEndsWithCR = data[-1] == "\r"
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
# Stop if the chunk is now empty
if not data:
return False
self._lastChunkEndsWithCR = data[-1] == u"\r"
data = data.replace(u"\r\n", u"\n")
data = data.replace(u"\r", u"\n")
data = unicode(data)
self.queue.extend([char for char in data])
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
for i in xrange(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
#Someone picked the wrong compile option
#You lose
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
skip = False
import sys
for match in invalid_unicode_re.finditer(data):
if skip:
continue
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
#We have a surrogate pair!
#From a perl manpage
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
#This is still wrong if it is possible for a surrogate pair to break a
#chunk boundary
def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be
any container that supports the in method being called on it.
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
#This method is currently 40-50% of our total runtime and badly needs
#optimizing
#Possible improvements:
# - use regexp to find characters that match the required character set
# (with regexp cache since we do the same searches many many times)
# - improve EOF handling for fewer if statements
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = u"^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
if not self.queue:
self.readChunk()
#Break if we have reached EOF
if not self.queue or self.queue[0] == None:
return u""
rv = []
i = 0
while (self.queue[i] in characters) == opposite:
i += 1
if i == len(self.queue):
self.readChunk()
#If the queue doesn't grow we have reached EOF
if i == len(self.queue) or self.queue[i] is EOF:
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
#XXX- wallpaper over bug in calculation below
#Otherwise change the stream position
if self.queue[i] == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
rv = u"".join(self.queue[:i])
self.queue = self.queue[i:]
r = u"".join(rv)
return r
#Calculate where we now are in the stream
#One possible optimisation would be to store all read characters and
#Calculate this on an as-needed basis (perhaps flushing the read data
#every time we read a new chunk) rather than once per call here and
#in .char()
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
#XXX Temporarily disable this because there is a bug
#lines = rv.split("\n")
#
#if lines:
# #Add number of lines passed onto positon
# oldCol = self.col
# self.line += len(lines)-1
# if len(lines) > 1:
# self.col = len(lines[-1])
# else:
# self.col += len(lines[0])
#
# if self.lineLengths and oldCol > 0:
# self.lineLengths[-1] += len(lines[0])
# lines = lines[1:-1]
# else:
# lines = lines[:-1]
#
# for line in lines:
# self.lineLengths.append(len(line))
#
return rv
def unget(self, chars):
if chars:
self.queue = list(chars) + self.queue
#Alter the current line, col position
for c in chars[::-1]:
if c == '\n':
self.line -= 1
self.col = self.lineLengths[self.line]
if char is not None:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.col -= 1
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
return str.__new__(self, value)
def __init__(self, value):
str.__init__(self, value)
self._position=-1
def __iter__(self):
return self
def next(self):
self._position += 1
rv = self[self.position]
return rv
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p]
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p]
def setPosition(self, position):
if self._position >= len(self):
@ -362,20 +516,39 @@ class EncodingBytes(str):
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters):
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
while self.currentByte in chars:
self.position += 1
p = self.position # use property for the error-checking
while p < len(self):
c = self[p]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)]
p = self.position
data = self[p:p+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes)
if rv == True:
if rv:
self.position += len(bytes)
return rv
@ -389,12 +562,6 @@ class EncodingBytes(str):
else:
raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
@ -423,8 +590,7 @@ class EncodingParser(object):
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding
def handleComment(self):
@ -432,7 +598,7 @@ class EncodingParser(object):
return self.data.jumpTo("-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharacters:
if self.data.currentByte not in spaceCharactersBytes:
#if we have <meta not followed by a space so just keep going
return True
#We have a valid meta element we want to search for attributes
@ -444,38 +610,41 @@ class EncodingParser(object):
else:
if attr[0] == "charset":
tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
self.data.position+=1
self.data.next()
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters:
data = self.data
if data.currentByte not in asciiLettersBytes:
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag:
self.data.position -= 1
data.previous()
self.handleOther()
return True
self.data.findNext(list(spaceCharacters) + ["<", ">"])
if self.data.currentByte == "<":
c = data.skipUntil(spacesAngleBrackets)
if c == "<":
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
self.data.position -= 1
data.previous()
else:
#Read all attributes
attr = self.getAttribute()
@ -489,73 +658,75 @@ class EncodingParser(object):
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"])
if self.data.currentByte == "<":
self.data.position -= 1
data = self.data
c = data.skip(spaceCharactersBytes | frozenset("/"))
if c == "<":
data.previous()
return None
elif self.data.currentByte == ">":
elif c == ">" or c is None:
return None
attrName = []
attrValue = []
spaceFound = False
#Step 5 attribute name
while True:
if self.data.currentByte == "=" and attrName:
if c == "=" and attrName:
break
elif self.data.currentByte in spaceCharacters:
elif c in spaceCharactersBytes:
spaceFound=True
break
elif self.data.currentByte in ("/", "<", ">"):
elif c in ("/", "<", ">"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrName.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
else:
attrName.extend(self.data.currentByte)
attrName.append(c)
#Step 6
self.data.position += 1
c = data.next()
#Step 7
if spaceFound:
self.data.skip()
c = data.skip()
#Step 8
if self.data.currentByte != "=":
self.data.position -= 1
if c != "=":
data.previous()
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
self.data.position += 1
data.next()
#Step 10
self.data.skip()
c = data.skip()
#Step 11
if self.data.currentByte in ("'", '"'):
if c in ("'", '"'):
#11.1
quoteChar = self.data.currentByte
quoteChar = c
while True:
self.data.position+=1
#11.3
if self.data.currentByte == quoteChar:
self.data.position += 1
c = data.next()
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
#11.4
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
#11.5
else:
attrValue.extend(self.data.currentByte)
elif self.data.currentByte in (">", '<'):
attrValue.append(c)
elif c in (">", "<"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.extend(self.data.currentByte)
attrValue.append(c)
while True:
self.data.position +=1
if self.data.currentByte in (
list(spaceCharacters) + [">", '<']):
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
else:
attrValue.extend(self.data.currentByte)
attrValue.append(c)
class ContentAttrParser(object):
@ -588,7 +759,7 @@ class ContentAttrParser(object):
#Unquoted value
oldPosition = self.data.position
try:
self.data.findNext(spaceCharacters)
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
#Return the whole remaining value
@ -596,7 +767,12 @@ class ContentAttrParser(object):
except StopIteration:
return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding"""
return (encoding is not None and type(encoding) == types.StringType and
encoding.lower().strip() in encodings)
def codecName(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
else:
return None

View File

@ -1,147 +0,0 @@
"""
Warning: this module is experimental and subject to change and even removal
at any time.
For background/rationale, see:
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
* http://tinyurl.com/ylfj8k (and follow-ups)
References:
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements, contentModelFlags
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] in ("StartTag", "EmptyTag"):
token["data"] = dict(token["data"][::-1])
# For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
**kwargs):
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
encoding, lowercaseElementName=False,
lowercaseAttrName=False)
class XHTMLParser(XMLParser):
""" liberal XMTHML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
token = XMLParser.normalizeToken(self, token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token
class XhmlRootPhase(html5parser.RootElementPhase):
def insertHtmlElement(self):
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
self.tree.openElements.append(element)
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """
def __getattr__(self, name):
self.tree.openElements.append(self.tree.document)
self.parser.phase = XmlElementPhase(self.parser, self.tree)
return getattr(self.parser.phase, name)
class XmlElementPhase(html5parser.Phase):
""" Generic handling for all XML elements """
def __init__(self, *args, **kwargs):
html5parser.Phase.__init__(self, *args, **kwargs)
self.startTagHandler = html5parser.utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = html5parser.utils.MethodDispatcher([])
self.endTagHandler.default = self.endTagOther
def startTagOther(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
while self.tree.openElements.pop() != node:
pass
break
else:
self.parser.parseError()
def processCharacters(self, data):
self.tree.insertText(data)

View File

@ -1,6 +1,8 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
'font-family', 'font-size', 'font-stretch', 'font-style',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
@ -83,6 +85,13 @@ class HTMLSanitizerMixin(object):
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
if token.has_key("data"):
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr): continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
if not attrs.has_key(attr):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else:
if token["type"] == "EndTag":
if token["type"] == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
if token["type"] == tokenTypes["EmptyTag"]:
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
elif token["type"] == "Comment":
elif token["type"] == tokenTypes["Comment"]:
pass
else:
return token
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True,
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False):
#Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName)
def __iter__(self):

View File

@ -1,3 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)

View File

@ -147,7 +147,7 @@ class HTMLSerializer(object):
quote_attr = True
else:
quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + "<>\"'", False)
spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding:

File diff suppressed because it is too large Load Diff

View File

@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implemenation).
"etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" tree type only). A module
implementing the tree type e.g. xml.etree.ElementTree or
lxml.etree."""
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType in ("dom", "simpletree"):
mod = __import__(treeType, globals())
treeBuilderCache[treeType] = mod.TreeBuilder
if treeType == "dom":
import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup":
import soup
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
import etree
# XXX: NEVER cache here, caching is done in the etree submodule

View File

@ -1,3 +1,4 @@
import warnings
from html5lib.constants import scopingElements, tableInsertModeElements
try:
frozenset
@ -11,9 +12,6 @@ except NameError:
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = None
#XXX - TODO; make the default interface more ElementTree-like
# rather than DOM-like
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
@ -43,7 +41,7 @@ class Node(object):
return "<%s>"%(self.name)
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
@ -112,7 +110,12 @@ class TreeBuilder(object):
#Fragment class
fragmentClass = None
def __init__(self):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset()
def reset(self):
@ -140,7 +143,8 @@ class TreeBuilder(object):
return True
elif node.name == "table":
return False
elif not tableVariant and node.name in scopingElements:
elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False
elif node.name == "html":
return False
@ -179,7 +183,10 @@ class TreeBuilder(object):
clone = self.activeFormattingElements[i].cloneNode()
# Step 9
element = self.insertElement(clone.name, clone.attributes)
element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
@ -207,21 +214,30 @@ class TreeBuilder(object):
return item
return False
def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name)
doctype.publicId = publicId
doctype.systemId = systemId
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, data, parent=None):
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(data))
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, name, attributes):
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
element = self.elementClass(name)
element.attributes = attributes
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
@ -238,19 +254,20 @@ class TreeBuilder(object):
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, name, attributes):
element = self.elementClass(name)
element.attributes = attributes
def insertElementNormal(self, token):
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, name, attributes):
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.elementClass(name)
element.attributes = attributes
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(name, attributes)
return self.insertElementNormal(token)
else:
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
@ -267,9 +284,9 @@ class TreeBuilder(object):
if parent is None:
parent = self.openElements[-1]
if (not(self.insertFromTable) or (self.insertFromTable and
self.openElements[-1].name not in
tableInsertModeElements)):
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
@ -287,7 +304,7 @@ class TreeBuilder(object):
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == u"table":
if elm.name == "table":
lastTable = elm
break
if lastTable:

View File

@ -1,35 +1,66 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces
moduleCache = {}
def getDomModule(DomImplementation):
name = "_" + DomImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getDomBuilder(DomImplementation):
Dom = DomImplementation
infoset_filter = ihatexml.InfosetFilter()
class AttrList:
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
self.element.setAttribute(infoset_filter.coerceAttribute(name),
infoset_filter.coerceCharacters(value))
def items(self):
return self.element.attributes.items()
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
self.element.attributes.items()]
def keys(self):
return self.element.attributes.keys()
return [infoset_filter.fromXmlName(item) for item in
self.element.attributes.keys()]
def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(infoset_filter.toXmlName(name))
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
_base.Node.__init__(self, element.localName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
@ -58,9 +89,19 @@ class NodeBuilder(_base.Node):
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" +
infoset_filter.coerceAttribute(
name[1]))
else:
qualifiedName = infoset_filter.coerceAttribute(
name[1])
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
infoset_filter.coerceAttribute(name), value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
@ -69,19 +110,37 @@ class NodeBuilder(_base.Node):
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return self
def insertDoctype(self, name, publicId, systemId):
domimpl = minidom.getDOMImplementation()
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
@ -102,7 +161,7 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
data=infoset_filter.coerceCharacters(data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
@ -121,6 +180,12 @@ def testSerializer(element):
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
@ -133,9 +198,26 @@ def testSerializer(element):
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None,
constants.namespaces["html"])):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
for name, value in element.attributes.items():
i = 0
attr = element.attributes.item(i)
while attr:
name = infoset_filter.fromXmlName(attr.localName)
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], name)
i += 1
attr = element.attributes.item(i)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
@ -201,3 +283,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value

View File

@ -1,5 +1,12 @@
import _base
import new
import re
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
@ -17,21 +24,44 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name):
self._element.tag = name
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._element.tag
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get(u"publicId", None)
return self._element.get(u"publicId", "")
def _setPublicId(self, value):
if value is not None:
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get(u"systemId", None)
return self._element.get(u"systemId", "")
def _setSystemId(self, value):
if value is not None:
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document")
if element.text:
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
elif type(element.tag) == type(ElementTree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
if prefix != "html":
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@ -201,11 +257,18 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element):
if type(element) == type(ElementTree.ElementTree):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
if element.text:
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:

View File

@ -0,0 +1,331 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
    """Plain record holding the three components of a DOCTYPE declaration."""

    def __init__(self, name, publicId, systemId):
        # Store the doctype name and the public/system identifiers verbatim.
        (self.name, self.publicId, self.systemId) = (name, publicId, systemId)
class Document(object):
    """Document wrapper around an lxml ElementTree plus its top-level nodes."""

    def __init__(self):
        # _elementTree is filled in later (by the tree builder's insertRoot).
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        # lxml trees cannot grow new root-level children directly; addnext()
        # places the element after the current root in document order.
        self._elementTree.getroot().addnext(element._element)

    @property
    def childNodes(self):
        return self._childNodes
def testSerializer(element):
    """Serialize an lxml tree (full document, fragment list, or element) into
    the html5lib test-suite format: one "|"-prefixed, indent-annotated line
    per node, joined with newlines.
    """
    rv = []
    finalText = None
    # Maps XML-coerced names back to their original (possibly non-XML) forms.
    filter = ihatexml.InfosetFilter()
    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                #Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s"%(' '*(indent+2), dtd_str))
                # Rewind to the first root-level sibling (comments/PIs may
                # precede the root element), then walk them all in order.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent+2)
                    next_element = next_element.getnext()
            elif isinstance(element, basestring):
                #Text in a fragment
                rv.append("|%s\"%s\""%(' '*indent, element))
            else:
                #Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent+2)
        elif type(element.tag) == type(etree.Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            # Ordinary element; split "{namespace}localname" tags and print a
            # namespace prefix for anything outside the default html namespace.
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                if prefix != "html":
                    rv.append("|%s<%s %s>"%(' '*indent, prefix,
                                            filter.fromXmlName(tag)))
                else:
                    rv.append("|%s<%s>"%(' '*indent,
                                         filter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>"%(' '*indent,
                                     filter.fromXmlName(element.tag)))
            if hasattr(element, "attrib"):
                # Attributes follow the same namespace-prefix convention.
                for name, value in element.attrib.iteritems():
                    nsmatch = etree_builders.tag_regexp.match(name)
                    if nsmatch:
                        ns = nsmatch.group(1)
                        name = nsmatch.group(2)
                        prefix = constants.prefixes[ns]
                        rv.append('|%s%s %s="%s"' % (' '*(indent+2),
                                                     prefix,
                                                     filter.fromXmlName(name),
                                                     value))
                    else:
                        rv.append('|%s%s="%s"' % (' '*(indent+2),
                                                  filter.fromXmlName(name),
                                                  value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
            indent += 2
            for child in element.getchildren():
                serializeElement(child, indent)
        # In lxml, text after an element lives in its .tail, one level up.
        if hasattr(element, "tail") and element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    # NOTE(review): finalText is never reassigned in this function, so this
    # branch appears to be dead code carried over from the etree builder.
    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))
    return "\n".join(rv)
def tostring(element):
    """Serialize an lxml element (or whole ElementTree) and its children to a
    markup string.

    Emits the internal DTD (when serializing a full tree), comments, start and
    end tags, text and tail text. Attribute values are emitted as-is, without
    additional escaping.
    """
    rv = []
    finalText = None
    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Whole-document case: write the doctype, then recurse on the root.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())
        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->"%(element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                                 for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in element.getchildren():
                serializeElement(child)
            rv.append("</%s>"%(element.tag,))
        # Text following the element is stored in .tail in lxml.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)
    serializeElement(element)
    if finalText is not None:
        # Bug fix: the original format string "%s\"" had one placeholder but
        # was given two arguments, raising TypeError whenever finalText was
        # set. (finalText is currently never set, so this is a latent bug.)
        rv.append("%s\"%s\""%(' '*2, finalText))
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing lxml.etree documents.

    lxml exposes no API to set the doctype (or replace the root) after a tree
    exists, so the initial document is created by feeding a minimal document
    string to lxml's own parser (see insertRoot); all later construction uses
    ordinary element operations.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None    # bound per-instance in __init__ (closes over filter)
    commentClass = None    # bound per-instance in __init__
    fragmentClass = Document

    def __init__(self, namespaceHTMLElements, fullTree = False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        # InfosetFilter coerces names/data that are legal in HTML but not
        # well-formed XML, which lxml would otherwise reject.
        filter = self.filter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        class Attributes(dict):
            # dict subclass that mirrors every write into the wrapped lxml
            # element's attrib, coercing attribute names through the filter.
            # NOTE(review): the mutable default value={} is shared across
            # calls; harmless only as long as it is never mutated.
            def __init__(self, element, value={}):
                self._element = element
                dict.__init__(self, value)
                for key, value in self.iteritems():
                    if isinstance(key, tuple):
                        # Namespaced attribute key: (prefix, name, namespace).
                        name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
                    else:
                        name = filter.coerceAttribute(key)
                    self._element._element.attrib[name] = value

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                if isinstance(key, tuple):
                    name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
                else:
                    name = filter.coerceAttribute(key)
                self._element._element.attrib[name] = value

        class Element(builder.Element):
            # Element wrapper that routes names/text through the infoset
            # filter and keeps attributes synchronized via Attributes above.
            def __init__(self, name, namespace):
                name = filter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = filter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return self._name

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = filter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            # Comment wrapper that coerces data so it is legal XML.
            def __init__(self, data):
                data = filter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = filter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # NOTE(review): the raw builder.Comment is installed here, not the
        # coercing Comment subclass defined just above — confirm intended.
        self.commentClass = builder.Comment
        #self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        _base.TreeBuilder.reset(self)
        # Comments seen before the root exists are buffered until insertRoot.
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # NOTE(review): this reads the module-level fullTree flag (set to
        # True at import), not the fullTree argument passed to __init__ —
        # confirm that is intended.
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        # A fragment is the root's text, child elements, and tail, flattened
        # into a plain list (strings for text, elements otherwise).
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element.getchildren())
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        # lxml cannot represent a doctype with no name or a non-XML name;
        # warn about the data loss but record the doctype anyway.
        if not name or ihatexml.nonXmlBMPRegexp.search(name):
            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        # Pre-root comments are queued; insertRoot attaches them later.
        self.initial_comments.append(data)

    def insertRoot(self, token):
        """Create the document root"""
        #Because of the way libxml2 works, it doesn't seem to be possible to
        #alter information like the doctype after the tree has been parsed.
        #Therefore we need to use the built-in parser to create our iniial
        #tree, after which we can add elements like normal
        docStr = ""
        if self.doctype and self.doctype.name:
            docStr += "<!DOCTYPE %s"%self.doctype.name
            if (self.doctype.publicId is not None or
                self.doctype.systemId is not None):
                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
                                               self.doctype.systemId or "")
            docStr += ">"
        #TODO - this needs to work when elements are not put into the default ns
        docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
        try:
            root = etree.fromstring(docStr)
        except etree.XMLSyntaxError:
            print docStr
            raise
        #Append the initial comments:
        for comment_token in self.initial_comments:
            root.addprevious(etree.Comment(comment_token["data"]))
        #Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()
        #Add the root element to the internal child/open data structures
        namespace = token.get("namespace", None)
        root_element = self.elementClass(token["name"], namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)
        #Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment

View File

@ -1,5 +1,5 @@
import _base
from html5lib.constants import voidElements
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
@ -63,6 +63,8 @@ class Node(_base.Node):
def cloneNode(self):
newNode = type(self)(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
@ -73,6 +75,14 @@ class Node(_base.Node):
"""Return true if the node has children or text"""
return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node):
type = 1
def __init__(self):
@ -81,6 +91,9 @@ class Document(Node):
def __unicode__(self):
return "#document"
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
@ -106,14 +119,22 @@ class DocumentFragment(Document):
class DocumentType(Node):
type = 3
def __init__(self, name):
def __init__(self, name, publicId, systemId):
Node.__init__(self, name)
self.publicId = u""
self.systemId = u""
self.publicId = publicId
self.systemId = systemId
def __unicode__(self):
if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
@ -135,12 +156,16 @@ class TextNode(Node):
class Element(Node):
type = 5
def __init__(self, name):
def __init__(self, name, namespace=None):
Node.__init__(self, name)
self.namespace = namespace
self.attributes = {}
def __unicode__(self):
if self.namespace in (None, namespaces["html"]):
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self):
result = '<' + self.name
@ -174,6 +199,8 @@ class Element(Node):
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)

View File

@ -1,6 +1,9 @@
import warnings
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
def __init__(self, element):
@ -22,18 +25,35 @@ class AttrList(object):
class Element(_base.Node):
def __init__(self, element, soup):
def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
newNode = TextNode(NavigableString(
self.element.contents[-1]+node.element), self.soup)
self.element.contents[-1].extract()
self.appendChild(newNode)
# Concatenate new text onto old text node
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
newStr = NavigableString(self.element.contents[-1]+node.element)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else:
self.element.insert(len(self.element.contents), node.element)
node.parent = self
@ -56,18 +76,25 @@ class Element(_base.Node):
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.contents.index(refNode.element)
index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
newNode = TextNode(NavigableString(
self.element.contents[index-1]+node.element), self.soup)
self.element.contents[index-1].extract()
self.insertBefore(newNode, refNode)
# (See comments in appendChild)
newStr = NavigableString(self.element.contents[index-1]+node.element)
oldNode = self.element.contents[index-1]
del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract()
node.parent = None
@ -76,12 +103,12 @@ class Element(_base.Node):
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup))
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup)
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
@ -89,6 +116,14 @@ class Element(_base.Node):
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
_base.Node.__init__(self, None)
@ -101,13 +136,25 @@ class TextNode(Element):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup)
return Element(self.soup, self.soup, None)
def insertDoctype(self, name, publicId, systemId):
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration(name))
def elementClass(self, name):
return Element(Tag(self.soup, name), self.soup)
def elementClass(self, name, namespace):
if namespace not in (None, namespaces["html"]):
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
def fragmentClass(self):
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup)
return Element(self.soup, self.soup, None)
def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element)
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
import re
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]":
rv.append("#document-fragment")

View File

@ -21,18 +21,24 @@ class TreeWalker(object):
attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), \
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, name, attrs):
return {"type": "StartTag", "name": unicode(name), \
def startTag(self, namespace, name, attrs):
return {"type": "StartTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, name):
return {"type": "EndTag", "name": unicode(name), "data": []}
def endTag(self, namespace, name):
return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": []}
def text(self, data):
data = unicode(data)
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, name, attrs, hasChildren):
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(name, attrs, hasChildren):
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
endTag = None
if type == DOCTYPE:
yield self.doctype(*details)
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
yield token
elif type == ELEMENT:
name, attributes, hasChildren = details
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(name, attributes, hasChildren):
for token in self.emptyTag(namespace, name, attributes, hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(name, attributes)
endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
name, attributes, hasChildren = details
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(name)
yield self.endTag(namespace, name)
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling

View File

@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
node.attributes.items(), node.hasChildNodes)
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue

View File

@ -3,10 +3,13 @@ _ = gettext.gettext
import new
import copy
import re
import _base
from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. An Element node serving as *context* (it cannot be called the parent
node due to the particular ``tail`` text nodes.
1. The current element
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
2. The index of the element relative to its parent
3. A list used as a stack of all ancestor *context nodes*. It is a
pair tuple whose first item is an Element and second item is a child
index.
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node
if key in ("text", "tail"):
return _base.TEXT, getattr(elt, key)
elt, key, parents, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
else:
node = elt[int(key)]
node = elt
if not(hasattr(node, "tag")):
node = node.getroot()
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return _base.DOCTYPE, node.text
return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment):
return _base.COMMENT, node.text
else:
#This is assumed to be an ordinary element
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, tag,
node.attrib.items(), len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node
assert key not in ("text", "tail"), "Text nodes have no children"
parents.append((elt, int(key)))
node = elt[int(key)]
if isinstance(node, tuple):
element, key, parents, flag = node
else:
parents = []
element, key, parents, flag = node, None, [], None
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text", parents)
if flag in ("text", "tail"):
return None
else:
return (node, 0, parents)
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
elt, key, parents = node
if key == "text":
key = -1
elif key == "tail":
elt, key = parents.pop()
if isinstance(node, tuple):
element, key, parents, flag = node
else:
# Look for "tail" of the "revisited" node
child = elt[key]
if child.tail:
parents.append((elt, key))
return (child, "tail", parents)
return None
# case where key were "text" or "tail" or elt[key] had a tail
key += 1
if len(elt) > key:
return (elt, key, parents)
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node):
assert isinstance(node, tuple)
elt, key, parents = node
if parents:
elt, key = parents.pop()
return elt, key, parents
if isinstance(node, tuple):
element, key, parents, flag = node
else:
# HACK: We could return ``elt`` but None will stop the algorithm the same way
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals()

View File

@ -1,4 +1,4 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
depth = 0
ignore_until = None
previous = None
for event in NamespaceFlattener(prefixes={
'http://www.w3.org/1999/xhtml': ''
})(self.tree):
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
kind, data, pos = event
if kind == START:
tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(tag, list(attrib), \
not next or next[0] != END or next[1] != tag):
for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(tag, list(attrib))
yield self.startTag(namespace, name, list(attrib))
elif kind == END:
if data not in voidElements:
yield self.endTag(data)
name = data.localname
namespace = data.namespace
if (namespace, name) not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI):
pass

View File

@ -0,0 +1,175 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
    """Wrapper presenting an lxml ElementTree as a walkable document root.

    lxml has no first-class document node, so this collects the internal DTD
    (if present) plus the root element and its root-level siblings
    (comments/PIs) into self.children, in document order.
    """
    def __init__(self, et):
        self.elementtree = et
        self.children = []
        if et.docinfo.internalDTD:
            # The Doctype wrapper is always children[0] when a DTD exists.
            self.children.append(Doctype(self, et.docinfo.root_name,
                                         et.docinfo.public_id,
                                         et.docinfo.system_url))
        root = et.getroot()
        node = root
        # Rewind past any siblings that precede the root element, then
        # collect every root-level node left to right.
        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()
        # A document root has no text or tail of its own.
        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        # The document root never has a following sibling.
        return None

    def __len__(self):
        # NOTE(review): always reports 1 regardless of len(self.children);
        # presumably the walker only needs truthiness here — confirm.
        return 1
class Doctype(object):
    """Stand-in node for the document type declaration of a wrapped tree."""

    def __init__(self, root_node, name, public_id, system_id):
        (self.root_node, self.name,
         self.public_id, self.system_id) = root_node, name, public_id, system_id
        # Doctype nodes never carry character data.
        self.text = self.tail = None

    def getnext(self):
        # The doctype is always children[0] of its Root, so its following
        # sibling is children[1].
        return self.root_node.children[1]
class FragmentRoot(Root):
    """Synthetic root for walking a flat list of fragment nodes."""

    def __init__(self, children):
        # Deliberately skip Root.__init__: there is no backing ElementTree.
        self.children = []
        for child in children:
            self.children.append(FragmentWrapper(self, child))
        self.text = None
        self.tail = None

    def getnext(self):
        return None
class FragmentWrapper(object):
    """Adapter giving fragment items (elements, strings, comments) the
    element-like surface (text/tail/getnext/getparent/...) the walker expects.
    """

    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        # Elements carry .text/.tail; bare strings and other objects do not.
        self.text = self.obj.text if hasattr(self.obj, 'text') else None
        self.tail = self.obj.tail if hasattr(self.obj, 'tail') else None
        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        # Anything not defined here is delegated to the wrapped object.
        return getattr(self.obj, name)

    def getnext(self):
        # Return the wrapper following this one in the fragment, if any.
        siblings = self.root_node.children
        idx = siblings.index(self)
        return siblings[idx + 1] if idx < len(siblings) - 1 else None

    def __getitem__(self, key):
        return self.obj[key]

    def __nonzero__(self):
        return bool(self.obj)

    def getparent(self):
        # Fragment items have no parent element.
        return None

    def __str__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over lxml trees, whole documents or fragments.

    Text nodes are represented as (element, "text"|"tail") tuples, since lxml
    stores character data on the adjacent element rather than as nodes.
    """
    def __init__(self, tree):
        # Normalize the input: a full ElementTree is wrapped in Root, a list
        # of fragment nodes in FragmentRoot; a bare element passes through.
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        # Maps XML-coerced names back to their original forms.
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)
        elif isinstance(node, Root):
            return (_base.DOCUMENT,)
        elif isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id
        elif isinstance(node, FragmentWrapper) and node.isstring:
            # A bare string item in a fragment is a text node.
            return _base.TEXT, node
        elif node.tag == etree.Comment:
            return _base.COMMENT, node.text
        else:
            #This is assumed to be an ordinary element
            # Split "{namespace}localname" tags; un-namespaced tags pass
            # through with namespace None.
            match = tag_regexp.match(node.tag)
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = node.tag
            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    [(self.filter.fromXmlName(name), value) for
                     name,value in node.attrib.iteritems()],
                    len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), _("Text nodes have no children")
        assert len(node) or node.text, "Node has no children"
        # Leading text comes before the first child element.
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else: # tail
                return node.getnext()
        # An element's tail text is walked before its real next sibling.
        return node.tail and (node, "tail") or node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # The element owning .text is the text node's parent.
                return node
            # else: fallback to "normal" processing
        return node.getparent()

View File

@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
type, node = event
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name in voidElements:
for token in self.emptyTag(name, \
node.attributes.items(), not next or next[1] is not node):
for token in self.emptyTag(namespace,
name,
node.attributes.items(),
not next or next[1] is not node):
yield token
else:
yield self.startTag(name, node.attributes.items())
yield self.startTag(namespace, name, node.attributes.items())
elif type == END_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements:
yield self.endTag(name)
yield self.endTag(namespace, name)
elif type == COMMENT:
yield self.comment(node.nodeValue)

View File

@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.value
elif node.type == 5: # Element
return _base.ELEMENT, node.name, \
node.attributes.items(), node.hasContent()
return (_base.ELEMENT, node.namespace, node.name,
node.attributes.items(), node.hasContent())
elif node.type == 6: # CommentNode
return _base.COMMENT, node.data

View File

@ -1,3 +1,4 @@
import re
import gettext
_ = gettext.gettext
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType
#Slice needed to remove markup added during unicode conversion
return _base.DOCTYPE, unicode(node.string)[2:-1]
string = unicode(node.string)
#Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the separate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment):
return _base.COMMENT, unicode(node.string)[4:-3]
string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode
return _base.TEXT, node

View File

@ -34,3 +34,123 @@ class MethodDispatcher(dict):
def __getitem__(self, key):
return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
    """Pure-python double-ended queue (fallback for pre-2.4 pythons).

    Items live in the ``data`` dict, keyed by a sliding window of integer
    indices from ``left`` (inclusive) to ``right`` (exclusive), so both
    ends support O(1) append and pop.  An optional ``maxsize`` (-1 means
    unbounded) silently evicts items from the opposite end once the queue
    grows past that size.
    """
    def __init__(self, iterable=(), maxsize=-1):
        # Guarded so that re-initialisation (e.g. via __setstate__ or
        # __deepcopy__) extends the existing contents rather than
        # clobbering the index window.
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)
    def append(self, x):
        """Add x to the right side, evicting from the left when full."""
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()
    def appendleft(self, x):
        """Add x to the left side, evicting from the right when full."""
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()
    def pop(self):
        """Remove and return the rightmost item."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem
    def popleft(self):
        """Remove and return the leftmost item."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem
    def clear(self):
        """Discard all items."""
        self.data.clear()
        self.left = self.right = 0
    def extend(self, iterable):
        """Append every item of iterable on the right."""
        for elem in iterable:
            self.append(elem)
    def extendleft(self, iterable):
        """Append every item of iterable on the left; this reverses the
        items' order, matching collections.deque."""
        for elem in iterable:
            self.appendleft(elem)
    def rotate(self, n=1):
        """Rotate n steps to the right (negative n effectively rotates
        left, via the modulo below)."""
        if self:
            n %= len(self)
            for i in xrange(n):
                self.appendleft(self.pop())
    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            # Translate the dict miss into the sequence-protocol error.
            raise IndexError
    def __setitem__(self, i, value):
        # Bounds-check explicitly (mirrors __delitem__): dict assignment
        # never raises KeyError, so the old try/except silently planted
        # stray keys outside the window for out-of-range indices instead
        # of raising IndexError.
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        if i < 0:
            i += size
        self.data[i + self.left] = value
    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        # Shift everything right of i one slot left, then drop the end.
        for j in xrange(self.left+i, self.right-1):
            data[j] = data[j+1]
        self.pop()
    def __len__(self):
        return self.right - self.left
    def __cmp__(self, other):
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))
    def __repr__(self, _track=[]):
        # _track is intentionally a *shared* mutable default: it records
        # the deques currently being repr'd, so self-referential contents
        # print as '...' instead of recursing forever.
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        r = 'deque(%r)' % (list(self),)
        _track.remove(id(self))
        return r
    def __getstate__(self):
        # NOTE(review): maxsize is not pickled (nor deep-copied below), so
        # a restored deque reverts to unbounded -- kept for compatibility
        # with the existing pickle format.
        return (tuple(self),)
    def __setstate__(self, s):
        self.__init__(s[0])
    def __hash__(self):
        # Mutable, therefore unhashable (mirrors collections.deque/list).
        raise TypeError
    def __copy__(self):
        return self.__class__(self)
    def __deepcopy__(self, memo=None):
        from copy import deepcopy
        # Use a fresh memo per call; the old mutable-default dict leaked
        # memo state between unrelated deepcopy invocations.
        if memo is None:
            memo = {}
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result

View File

@ -1,6 +1,6 @@
<!--
Description: illegal control character
Expect: content[0].value == u'Page 1\ufffdPage 2'
Expect: content[0].value == u'Page 1 Page 2'
-->
<feed xmns="http://www.w3.org/2005/Atom">