Switch from Beautiful Soup to html5lib
This commit is contained in:
parent
04ca707443
commit
3024af031f
@ -33,8 +33,9 @@
|
|||||||
<ul>
|
<ul>
|
||||||
<li><a href="http://www.planetplanet.org/">Planet</a></li>
|
<li><a href="http://www.planetplanet.org/">Planet</a></li>
|
||||||
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
|
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
|
||||||
<li><a href="http://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a></li>
|
<li><a href="http://code.google.com/p/html5lib/">html5lib</a></li>
|
||||||
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
|
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
|
||||||
|
<li><a href="http://bitworking.org/projects/httplib2/">httplib2</a></li>
|
||||||
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
|
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
|
||||||
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
|
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
<h2>Normalization</h2>
|
<h2>Normalization</h2>
|
||||||
<p>Venus builds on, and extends, the <a
|
<p>Venus builds on, and extends, the <a
|
||||||
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
|
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
|
||||||
href="http://www.crummy.com/software/BeautifulSoup/">BeautifulSoup</a> to
|
href="http://code.google.com/p/html5lib/">html5lib</a> to
|
||||||
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
|
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
|
||||||
meaning that you don't have to worry about funky feeds, tag soup, or character
|
meaning that you don't have to worry about funky feeds, tag soup, or character
|
||||||
encoding.</p>
|
encoding.</p>
|
||||||
@ -48,7 +48,7 @@ other security risks are removed.</p>
|
|||||||
links are resolved</a> within the HTML. This is also done for links
|
links are resolved</a> within the HTML. This is also done for links
|
||||||
in other areas in the feed too.</p>
|
in other areas in the feed too.</p>
|
||||||
<p>Finally, unmatched tags are closed. This is done with a
|
<p>Finally, unmatched tags are closed. This is done with a
|
||||||
<a href="http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20HTML">knowledge of the semantics of HTML</a>. Additionally, a
|
<a href="http://code.google.com/p/html5lib/">knowledge of the semantics of HTML</a>. Additionally, a
|
||||||
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
|
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
|
||||||
subset of MathML</a>, as well as a
|
subset of MathML</a>, as well as a
|
||||||
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
|
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
|
||||||
|
@ -69,7 +69,7 @@
|
|||||||
<g font-size="32" fill="#FFF" text-anchor="middle">
|
<g font-size="32" fill="#FFF" text-anchor="middle">
|
||||||
<text x="350" y="380" fill="#F00">Spider</text>
|
<text x="350" y="380" fill="#F00">Spider</text>
|
||||||
<text x="350" y="460">Universal Feed Parser</text>
|
<text x="350" y="460">Universal Feed Parser</text>
|
||||||
<text x="350" y="530">BeautifulSoup</text>
|
<text x="350" y="530">html5lib</text>
|
||||||
<text x="350" y="600">Reconstitute</text>
|
<text x="350" y="600">Reconstitute</text>
|
||||||
<text x="350" y="750">Filter(s)</text>
|
<text x="350" y="750">Filter(s)</text>
|
||||||
<text x="850" y="250" fill="#F00">Splice</text>
|
<text x="850" y="250" fill="#F00">Splice</text>
|
||||||
|
Before Width: | Height: | Size: 4.3 KiB After Width: | Height: | Size: 4.3 KiB |
File diff suppressed because it is too large
Load Diff
34
planet/html5lib/__init__.py
Normal file
34
planet/html5lib/__init__.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""
|
||||||
|
HTML parsing library based on the WHATWG "HTML5"
|
||||||
|
specification. The parser is designed to be compatible with existing
|
||||||
|
HTML found in the wild and implements well-defined error recovery that
|
||||||
|
is largely compatible with modern desktop web browsers.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
f = open("my_document.html")
|
||||||
|
p = html5lib.HTMLParser()
|
||||||
|
tree = p.parse(f)
|
||||||
|
|
||||||
|
By default the returned treeformat is a custom "simpletree", similar
|
||||||
|
to a DOM tree; each element has attributes childNodes and parent
|
||||||
|
holding the parents and children respectively, a name attribute
|
||||||
|
holding the Element name, a data attribute holding the element data
|
||||||
|
(for text and comment nodes) and an attributes dictionary holding the
|
||||||
|
element's attributes (for Element nodes).
|
||||||
|
|
||||||
|
To get output in ElementTree format:
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
from html5lib.treebuilders import etree
|
||||||
|
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
|
||||||
|
elementtree = p.parse(f)
|
||||||
|
|
||||||
|
Note: Because HTML documents support various features not in the
|
||||||
|
default ElementTree (e.g. doctypes), we suppy our own simple
|
||||||
|
serializer; html5lib.treebuilders.etree.tostring At present this does not
|
||||||
|
have the encoding support offered by the elementtree serializer.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from html5parser import HTMLParser
|
456
planet/html5lib/constants.py
Normal file
456
planet/html5lib/constants.py
Normal file
@ -0,0 +1,456 @@
|
|||||||
|
import string
|
||||||
|
|
||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
EOF = None
|
||||||
|
|
||||||
|
contentModelFlags = {
|
||||||
|
"PCDATA":0,
|
||||||
|
"RCDATA":1,
|
||||||
|
"CDATA":2,
|
||||||
|
"PLAINTEXT":3
|
||||||
|
}
|
||||||
|
|
||||||
|
scopingElements = frozenset((
|
||||||
|
"button",
|
||||||
|
"caption",
|
||||||
|
"html",
|
||||||
|
"marquee",
|
||||||
|
"object",
|
||||||
|
"table",
|
||||||
|
"td",
|
||||||
|
"th"
|
||||||
|
))
|
||||||
|
|
||||||
|
formattingElements = frozenset((
|
||||||
|
"a",
|
||||||
|
"b",
|
||||||
|
"big",
|
||||||
|
"em",
|
||||||
|
"font",
|
||||||
|
"i",
|
||||||
|
"nobr",
|
||||||
|
"s",
|
||||||
|
"small",
|
||||||
|
"strike",
|
||||||
|
"strong",
|
||||||
|
"tt",
|
||||||
|
"u"
|
||||||
|
))
|
||||||
|
|
||||||
|
specialElements = frozenset((
|
||||||
|
"address",
|
||||||
|
"area",
|
||||||
|
"base",
|
||||||
|
"basefont",
|
||||||
|
"bgsound",
|
||||||
|
"blockquote",
|
||||||
|
"body",
|
||||||
|
"br",
|
||||||
|
"center",
|
||||||
|
"col",
|
||||||
|
"colgroup",
|
||||||
|
"dd",
|
||||||
|
"dir",
|
||||||
|
"div",
|
||||||
|
"dl",
|
||||||
|
"dt",
|
||||||
|
"embed",
|
||||||
|
"fieldset",
|
||||||
|
"form",
|
||||||
|
"frame",
|
||||||
|
"frameset",
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6",
|
||||||
|
"head",
|
||||||
|
"hr",
|
||||||
|
"iframe",
|
||||||
|
"image",
|
||||||
|
"img",
|
||||||
|
"input",
|
||||||
|
"isindex",
|
||||||
|
"li",
|
||||||
|
"link",
|
||||||
|
"listing",
|
||||||
|
"menu",
|
||||||
|
"meta",
|
||||||
|
"noembed",
|
||||||
|
"noframes",
|
||||||
|
"noscript",
|
||||||
|
"ol",
|
||||||
|
"optgroup",
|
||||||
|
"option",
|
||||||
|
"p",
|
||||||
|
"param",
|
||||||
|
"plaintext",
|
||||||
|
"pre",
|
||||||
|
"script",
|
||||||
|
"select",
|
||||||
|
"spacer",
|
||||||
|
"style",
|
||||||
|
"tbody",
|
||||||
|
"textarea",
|
||||||
|
"tfoot",
|
||||||
|
"thead",
|
||||||
|
"title",
|
||||||
|
"tr",
|
||||||
|
"ul",
|
||||||
|
"wbr"
|
||||||
|
))
|
||||||
|
|
||||||
|
spaceCharacters = frozenset((
|
||||||
|
u"\t",
|
||||||
|
u"\n",
|
||||||
|
u"\u000B",
|
||||||
|
u"\u000C",
|
||||||
|
u" "
|
||||||
|
))
|
||||||
|
|
||||||
|
tableInsertModeElements = frozenset((
|
||||||
|
"table",
|
||||||
|
"tbody",
|
||||||
|
"tfoot",
|
||||||
|
"thead",
|
||||||
|
"tr"
|
||||||
|
))
|
||||||
|
|
||||||
|
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||||
|
asciiLetters = frozenset(string.ascii_letters)
|
||||||
|
digits = frozenset(string.digits)
|
||||||
|
hexDigits = frozenset(string.hexdigits)
|
||||||
|
|
||||||
|
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
|
||||||
|
for c in string.ascii_uppercase])
|
||||||
|
|
||||||
|
# Heading elements need to be ordered
|
||||||
|
headingElements = (
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6"
|
||||||
|
)
|
||||||
|
|
||||||
|
# XXX What about event-source and command?
|
||||||
|
voidElements = frozenset((
|
||||||
|
"base",
|
||||||
|
"link",
|
||||||
|
"meta",
|
||||||
|
"hr",
|
||||||
|
"br",
|
||||||
|
"img",
|
||||||
|
"embed",
|
||||||
|
"param",
|
||||||
|
"area",
|
||||||
|
"col",
|
||||||
|
"input"
|
||||||
|
))
|
||||||
|
|
||||||
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||||
|
# therefore can't be a frozenset.
|
||||||
|
entitiesWindows1252 = (
|
||||||
|
8364, # 0x80 0x20AC EURO SIGN
|
||||||
|
65533, # 0x81 UNDEFINED
|
||||||
|
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||||
|
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||||
|
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||||
|
8224, # 0x86 0x2020 DAGGER
|
||||||
|
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||||
|
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||||
|
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
65533, # 0x8D UNDEFINED
|
||||||
|
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
65533, # 0x8F UNDEFINED
|
||||||
|
65533, # 0x90 UNDEFINED
|
||||||
|
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||||
|
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||||
|
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||||
|
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||||
|
8226, # 0x95 0x2022 BULLET
|
||||||
|
8211, # 0x96 0x2013 EN DASH
|
||||||
|
8212, # 0x97 0x2014 EM DASH
|
||||||
|
732, # 0x98 0x02DC SMALL TILDE
|
||||||
|
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||||
|
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||||
|
65533, # 0x9D UNDEFINED
|
||||||
|
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
)
|
||||||
|
|
||||||
|
entities = {
|
||||||
|
"AElig": u"\u00C6",
|
||||||
|
"Aacute": u"\u00C1",
|
||||||
|
"Acirc": u"\u00C2",
|
||||||
|
"Agrave": u"\u00C0",
|
||||||
|
"Alpha": u"\u0391",
|
||||||
|
"Aring": u"\u00C5",
|
||||||
|
"Atilde": u"\u00C3",
|
||||||
|
"Auml": u"\u00C4",
|
||||||
|
"Beta": u"\u0392",
|
||||||
|
"Ccedil": u"\u00C7",
|
||||||
|
"Chi": u"\u03A7",
|
||||||
|
"Dagger": u"\u2021",
|
||||||
|
"Delta": u"\u0394",
|
||||||
|
"ETH": u"\u00D0",
|
||||||
|
"Eacute": u"\u00C9",
|
||||||
|
"Ecirc": u"\u00CA",
|
||||||
|
"Egrave": u"\u00C8",
|
||||||
|
"Epsilon": u"\u0395",
|
||||||
|
"Eta": u"\u0397",
|
||||||
|
"Euml": u"\u00CB",
|
||||||
|
"Gamma": u"\u0393",
|
||||||
|
"Iacute": u"\u00CD",
|
||||||
|
"Icirc": u"\u00CE",
|
||||||
|
"Igrave": u"\u00CC",
|
||||||
|
"Iota": u"\u0399",
|
||||||
|
"Iuml": u"\u00CF",
|
||||||
|
"Kappa": u"\u039A",
|
||||||
|
"Lambda": u"\u039B",
|
||||||
|
"Mu": u"\u039C",
|
||||||
|
"Ntilde": u"\u00D1",
|
||||||
|
"Nu": u"\u039D",
|
||||||
|
"OElig": u"\u0152",
|
||||||
|
"Oacute": u"\u00D3",
|
||||||
|
"Ocirc": u"\u00D4",
|
||||||
|
"Ograve": u"\u00D2",
|
||||||
|
"Omega": u"\u03A9",
|
||||||
|
"Omicron": u"\u039F",
|
||||||
|
"Oslash": u"\u00D8",
|
||||||
|
"Otilde": u"\u00D5",
|
||||||
|
"Ouml": u"\u00D6",
|
||||||
|
"Phi": u"\u03A6",
|
||||||
|
"Pi": u"\u03A0",
|
||||||
|
"Prime": u"\u2033",
|
||||||
|
"Psi": u"\u03A8",
|
||||||
|
"Rho": u"\u03A1",
|
||||||
|
"Scaron": u"\u0160",
|
||||||
|
"Sigma": u"\u03A3",
|
||||||
|
"THORN": u"\u00DE",
|
||||||
|
"Tau": u"\u03A4",
|
||||||
|
"Theta": u"\u0398",
|
||||||
|
"Uacute": u"\u00DA",
|
||||||
|
"Ucirc": u"\u00DB",
|
||||||
|
"Ugrave": u"\u00D9",
|
||||||
|
"Upsilon": u"\u03A5",
|
||||||
|
"Uuml": u"\u00DC",
|
||||||
|
"Xi": u"\u039E",
|
||||||
|
"Yacute": u"\u00DD",
|
||||||
|
"Yuml": u"\u0178",
|
||||||
|
"Zeta": u"\u0396",
|
||||||
|
"aacute": u"\u00E1",
|
||||||
|
"acirc": u"\u00E2",
|
||||||
|
"acute": u"\u00B4",
|
||||||
|
"aelig": u"\u00E6",
|
||||||
|
"agrave": u"\u00E0",
|
||||||
|
"alefsym": u"\u2135",
|
||||||
|
"alpha": u"\u03B1",
|
||||||
|
"amp": u"\u0026",
|
||||||
|
"AMP": u"\u0026",
|
||||||
|
"and": u"\u2227",
|
||||||
|
"ang": u"\u2220",
|
||||||
|
"apos": u"\u0027",
|
||||||
|
"aring": u"\u00E5",
|
||||||
|
"asymp": u"\u2248",
|
||||||
|
"atilde": u"\u00E3",
|
||||||
|
"auml": u"\u00E4",
|
||||||
|
"bdquo": u"\u201E",
|
||||||
|
"beta": u"\u03B2",
|
||||||
|
"brvbar": u"\u00A6",
|
||||||
|
"bull": u"\u2022",
|
||||||
|
"cap": u"\u2229",
|
||||||
|
"ccedil": u"\u00E7",
|
||||||
|
"cedil": u"\u00B8",
|
||||||
|
"cent": u"\u00A2",
|
||||||
|
"chi": u"\u03C7",
|
||||||
|
"circ": u"\u02C6",
|
||||||
|
"clubs": u"\u2663",
|
||||||
|
"cong": u"\u2245",
|
||||||
|
"copy": u"\u00A9",
|
||||||
|
"COPY": u"\u00A9",
|
||||||
|
"crarr": u"\u21B5",
|
||||||
|
"cup": u"\u222A",
|
||||||
|
"curren": u"\u00A4",
|
||||||
|
"dArr": u"\u21D3",
|
||||||
|
"dagger": u"\u2020",
|
||||||
|
"darr": u"\u2193",
|
||||||
|
"deg": u"\u00B0",
|
||||||
|
"delta": u"\u03B4",
|
||||||
|
"diams": u"\u2666",
|
||||||
|
"divide": u"\u00F7",
|
||||||
|
"eacute": u"\u00E9",
|
||||||
|
"ecirc": u"\u00EA",
|
||||||
|
"egrave": u"\u00E8",
|
||||||
|
"empty": u"\u2205",
|
||||||
|
"emsp": u"\u2003",
|
||||||
|
"ensp": u"\u2002",
|
||||||
|
"epsilon": u"\u03B5",
|
||||||
|
"equiv": u"\u2261",
|
||||||
|
"eta": u"\u03B7",
|
||||||
|
"eth": u"\u00F0",
|
||||||
|
"euml": u"\u00EB",
|
||||||
|
"euro": u"\u20AC",
|
||||||
|
"exist": u"\u2203",
|
||||||
|
"fnof": u"\u0192",
|
||||||
|
"forall": u"\u2200",
|
||||||
|
"frac12": u"\u00BD",
|
||||||
|
"frac14": u"\u00BC",
|
||||||
|
"frac34": u"\u00BE",
|
||||||
|
"frasl": u"\u2044",
|
||||||
|
"gamma": u"\u03B3",
|
||||||
|
"ge": u"\u2265",
|
||||||
|
"gt": u"\u003E",
|
||||||
|
"GT": u"\u003E",
|
||||||
|
"hArr": u"\u21D4",
|
||||||
|
"harr": u"\u2194",
|
||||||
|
"hearts": u"\u2665",
|
||||||
|
"hellip": u"\u2026",
|
||||||
|
"iacute": u"\u00ED",
|
||||||
|
"icirc": u"\u00EE",
|
||||||
|
"iexcl": u"\u00A1",
|
||||||
|
"igrave": u"\u00EC",
|
||||||
|
"image": u"\u2111",
|
||||||
|
"infin": u"\u221E",
|
||||||
|
"int": u"\u222B",
|
||||||
|
"iota": u"\u03B9",
|
||||||
|
"iquest": u"\u00BF",
|
||||||
|
"isin": u"\u2208",
|
||||||
|
"iuml": u"\u00EF",
|
||||||
|
"kappa": u"\u03BA",
|
||||||
|
"lArr": u"\u21D0",
|
||||||
|
"lambda": u"\u03BB",
|
||||||
|
"lang": u"\u2329",
|
||||||
|
"laquo": u"\u00AB",
|
||||||
|
"larr": u"\u2190",
|
||||||
|
"lceil": u"\u2308",
|
||||||
|
"ldquo": u"\u201C",
|
||||||
|
"le": u"\u2264",
|
||||||
|
"lfloor": u"\u230A",
|
||||||
|
"lowast": u"\u2217",
|
||||||
|
"loz": u"\u25CA",
|
||||||
|
"lrm": u"\u200E",
|
||||||
|
"lsaquo": u"\u2039",
|
||||||
|
"lsquo": u"\u2018",
|
||||||
|
"lt": u"\u003C",
|
||||||
|
"LT": u"\u003C",
|
||||||
|
"macr": u"\u00AF",
|
||||||
|
"mdash": u"\u2014",
|
||||||
|
"micro": u"\u00B5",
|
||||||
|
"middot": u"\u00B7",
|
||||||
|
"minus": u"\u2212",
|
||||||
|
"mu": u"\u03BC",
|
||||||
|
"nabla": u"\u2207",
|
||||||
|
"nbsp": u"\u00A0",
|
||||||
|
"ndash": u"\u2013",
|
||||||
|
"ne": u"\u2260",
|
||||||
|
"ni": u"\u220B",
|
||||||
|
"not": u"\u00AC",
|
||||||
|
"notin": u"\u2209",
|
||||||
|
"nsub": u"\u2284",
|
||||||
|
"ntilde": u"\u00F1",
|
||||||
|
"nu": u"\u03BD",
|
||||||
|
"oacute": u"\u00F3",
|
||||||
|
"ocirc": u"\u00F4",
|
||||||
|
"oelig": u"\u0153",
|
||||||
|
"ograve": u"\u00F2",
|
||||||
|
"oline": u"\u203E",
|
||||||
|
"omega": u"\u03C9",
|
||||||
|
"omicron": u"\u03BF",
|
||||||
|
"oplus": u"\u2295",
|
||||||
|
"or": u"\u2228",
|
||||||
|
"ordf": u"\u00AA",
|
||||||
|
"ordm": u"\u00BA",
|
||||||
|
"oslash": u"\u00F8",
|
||||||
|
"otilde": u"\u00F5",
|
||||||
|
"otimes": u"\u2297",
|
||||||
|
"ouml": u"\u00F6",
|
||||||
|
"para": u"\u00B6",
|
||||||
|
"part": u"\u2202",
|
||||||
|
"permil": u"\u2030",
|
||||||
|
"perp": u"\u22A5",
|
||||||
|
"phi": u"\u03C6",
|
||||||
|
"pi": u"\u03C0",
|
||||||
|
"piv": u"\u03D6",
|
||||||
|
"plusmn": u"\u00B1",
|
||||||
|
"pound": u"\u00A3",
|
||||||
|
"prime": u"\u2032",
|
||||||
|
"prod": u"\u220F",
|
||||||
|
"prop": u"\u221D",
|
||||||
|
"psi": u"\u03C8",
|
||||||
|
"quot": u"\u0022",
|
||||||
|
"QUOT": u"\u0022",
|
||||||
|
"rArr": u"\u21D2",
|
||||||
|
"radic": u"\u221A",
|
||||||
|
"rang": u"\u232A",
|
||||||
|
"raquo": u"\u00BB",
|
||||||
|
"rarr": u"\u2192",
|
||||||
|
"rceil": u"\u2309",
|
||||||
|
"rdquo": u"\u201D",
|
||||||
|
"real": u"\u211C",
|
||||||
|
"reg": u"\u00AE",
|
||||||
|
"REG": u"\u00AE",
|
||||||
|
"rfloor": u"\u230B",
|
||||||
|
"rho": u"\u03C1",
|
||||||
|
"rlm": u"\u200F",
|
||||||
|
"rsaquo": u"\u203A",
|
||||||
|
"rsquo": u"\u2019",
|
||||||
|
"sbquo": u"\u201A",
|
||||||
|
"scaron": u"\u0161",
|
||||||
|
"sdot": u"\u22C5",
|
||||||
|
"sect": u"\u00A7",
|
||||||
|
"shy": u"\u00AD",
|
||||||
|
"sigma": u"\u03C3",
|
||||||
|
"sigmaf": u"\u03C2",
|
||||||
|
"sim": u"\u223C",
|
||||||
|
"spades": u"\u2660",
|
||||||
|
"sub": u"\u2282",
|
||||||
|
"sube": u"\u2286",
|
||||||
|
"sum": u"\u2211",
|
||||||
|
"sup": u"\u2283",
|
||||||
|
"sup1": u"\u00B9",
|
||||||
|
"sup2": u"\u00B2",
|
||||||
|
"sup3": u"\u00B3",
|
||||||
|
"supe": u"\u2287",
|
||||||
|
"szlig": u"\u00DF",
|
||||||
|
"tau": u"\u03C4",
|
||||||
|
"there4": u"\u2234",
|
||||||
|
"theta": u"\u03B8",
|
||||||
|
"thetasym": u"\u03D1",
|
||||||
|
"thinsp": u"\u2009",
|
||||||
|
"thorn": u"\u00FE",
|
||||||
|
"tilde": u"\u02DC",
|
||||||
|
"times": u"\u00D7",
|
||||||
|
"trade": u"\u2122",
|
||||||
|
"uArr": u"\u21D1",
|
||||||
|
"uacute": u"\u00FA",
|
||||||
|
"uarr": u"\u2191",
|
||||||
|
"ucirc": u"\u00FB",
|
||||||
|
"ugrave": u"\u00F9",
|
||||||
|
"uml": u"\u00A8",
|
||||||
|
"upsih": u"\u03D2",
|
||||||
|
"upsilon": u"\u03C5",
|
||||||
|
"uuml": u"\u00FC",
|
||||||
|
"weierp": u"\u2118",
|
||||||
|
"xi": u"\u03BE",
|
||||||
|
"yacute": u"\u00FD",
|
||||||
|
"yen": u"\u00A5",
|
||||||
|
"yuml": u"\u00FF",
|
||||||
|
"zeta": u"\u03B6",
|
||||||
|
"zwj": u"\u200D",
|
||||||
|
"zwnj": u"\u200C"
|
||||||
|
}
|
1719
planet/html5lib/html5parser.py
Normal file
1719
planet/html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
Load Diff
202
planet/html5lib/inputstream.py
Normal file
202
planet/html5lib/inputstream.py
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
import codecs
|
||||||
|
import re
|
||||||
|
|
||||||
|
from constants import EOF
|
||||||
|
|
||||||
|
class HTMLInputStream(object):
|
||||||
|
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
|
This class takes care of character encoding and removing or replacing
|
||||||
|
incorrect byte-sequences and also provides column and line tracking.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, source, encoding=None):
|
||||||
|
"""Initialises the HTMLInputStream.
|
||||||
|
|
||||||
|
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
|
for use by the HTML5Lib.
|
||||||
|
|
||||||
|
source can be either a file-object, local filename or a string.
|
||||||
|
|
||||||
|
The optional encoding parameter must be a string that indicates
|
||||||
|
the encoding. If specified, that encoding will be used,
|
||||||
|
regardless of any BOM or later declaration (such as in a meta
|
||||||
|
element)
|
||||||
|
|
||||||
|
"""
|
||||||
|
# List of where new lines occur
|
||||||
|
self.newLines = []
|
||||||
|
|
||||||
|
# Encoding Information
|
||||||
|
self.charEncoding = encoding
|
||||||
|
|
||||||
|
# Raw Stream
|
||||||
|
self.rawStream = self.openStream(source)
|
||||||
|
|
||||||
|
# Try to detect the encoding of the stream by looking for a BOM
|
||||||
|
detectedEncoding = self.detectEncoding()
|
||||||
|
|
||||||
|
# If an encoding was specified or detected from the BOM don't allow
|
||||||
|
# the encoding to be changed futher into the stream
|
||||||
|
if self.charEncoding or detectedEncoding:
|
||||||
|
self.allowEncodingOverride = False
|
||||||
|
else:
|
||||||
|
self.allowEncodingOverride = True
|
||||||
|
|
||||||
|
# If an encoding wasn't specified, use the encoding detected from the
|
||||||
|
# BOM, if present, otherwise use the default encoding
|
||||||
|
if not self.charEncoding:
|
||||||
|
self.charEncoding = detectedEncoding or "cp1252"
|
||||||
|
|
||||||
|
# Read bytes from stream decoding them into Unicode
|
||||||
|
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||||
|
|
||||||
|
# Normalize new lines and null characters
|
||||||
|
uString = re.sub('\r\n?', '\n', uString)
|
||||||
|
uString = re.sub('\x00', '\xFFFD', uString)
|
||||||
|
|
||||||
|
# Convert the unicode string into a list to be used as the data stream
|
||||||
|
self.dataStream = uString
|
||||||
|
|
||||||
|
self.queue = []
|
||||||
|
|
||||||
|
# Reset position in the list to read from
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def openStream(self, source):
|
||||||
|
"""Produces a file object from source.
|
||||||
|
|
||||||
|
source can be either a file object, local filename or a string.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Already a file object
|
||||||
|
if hasattr(source, 'read'):
|
||||||
|
stream = source
|
||||||
|
else:
|
||||||
|
# Otherwise treat source as a string and convert to a file object
|
||||||
|
import cStringIO
|
||||||
|
stream = cStringIO.StringIO(str(source))
|
||||||
|
return stream
|
||||||
|
|
||||||
|
def detectEncoding(self):
|
||||||
|
# Attempts to detect the character encoding of the stream. If
|
||||||
|
# an encoding can be determined from the BOM return the name of the
|
||||||
|
# encoding otherwise return None
|
||||||
|
bomDict = {
|
||||||
|
codecs.BOM_UTF8: 'utf-8',
|
||||||
|
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||||
|
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Go to beginning of file and read in 4 bytes
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
string = self.rawStream.read(4)
|
||||||
|
|
||||||
|
# Try detecting the BOM using bytes from the string
|
||||||
|
encoding = bomDict.get(string[:3]) # UTF-8
|
||||||
|
seek = 3
|
||||||
|
if not encoding:
|
||||||
|
encoding = bomDict.get(string[:2]) # UTF-16
|
||||||
|
seek = 2
|
||||||
|
if not encoding:
|
||||||
|
encoding = bomDict.get(string) # UTF-32
|
||||||
|
seek = 4
|
||||||
|
|
||||||
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
|
# set it to the start of the stream
|
||||||
|
self.rawStream.seek(encoding and seek or 0)
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def declareEncoding(self, encoding):
|
||||||
|
"""Report the encoding declared by the meta element
|
||||||
|
|
||||||
|
If the encoding is currently only guessed, then this
|
||||||
|
will read subsequent characters in that encoding.
|
||||||
|
|
||||||
|
If the encoding is not compatible with the guessed encoding
|
||||||
|
and non-US-ASCII characters have been seen, return True indicating
|
||||||
|
parsing will have to begin again.
|
||||||
|
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def determineNewLines(self):
|
||||||
|
# Looks through the stream to find where new lines occur so
|
||||||
|
# the position method can tell where it is.
|
||||||
|
self.newLines.append(0)
|
||||||
|
for i in xrange(len(self.dataStream)):
|
||||||
|
if self.dataStream[i] == u"\n":
|
||||||
|
self.newLines.append(i)
|
||||||
|
|
||||||
|
def position(self):
|
||||||
|
"""Returns (line, col) of the current position in the stream."""
|
||||||
|
# Generate list of new lines first time around
|
||||||
|
if not self.newLines:
|
||||||
|
self.determineNewLines()
|
||||||
|
|
||||||
|
line = 0
|
||||||
|
tell = self.tell
|
||||||
|
for pos in self.newLines:
|
||||||
|
if pos < tell:
|
||||||
|
line += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
col = tell - self.newLines[line-1] - 1
|
||||||
|
return (line, col)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets the position in the stream back to the start."""
|
||||||
|
self.tell = 0
|
||||||
|
|
||||||
|
def char(self):
|
||||||
|
""" Read one character from the stream or queue if available. Return
|
||||||
|
EOF when EOF is reached.
|
||||||
|
"""
|
||||||
|
if self.queue:
|
||||||
|
return self.queue.pop(0)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.tell += 1
|
||||||
|
return self.dataStream[self.tell - 1]
|
||||||
|
except:
|
||||||
|
return EOF
|
||||||
|
|
||||||
|
def charsUntil(self, characters, opposite = False):
|
||||||
|
""" Returns a string of characters from the stream up to but not
|
||||||
|
including any character in characters or EOF. characters can be
|
||||||
|
any container that supports the in method being called on it.
|
||||||
|
"""
|
||||||
|
charStack = [self.char()]
|
||||||
|
|
||||||
|
# First from the queue
|
||||||
|
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
||||||
|
and self.queue:
|
||||||
|
charStack.append(self.queue.pop(0))
|
||||||
|
|
||||||
|
# Then the rest
|
||||||
|
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||||
|
try:
|
||||||
|
self.tell += 1
|
||||||
|
charStack.append(self.dataStream[self.tell - 1])
|
||||||
|
except:
|
||||||
|
charStack.append(EOF)
|
||||||
|
|
||||||
|
# Put the character stopped on back to the front of the queue
|
||||||
|
# from where it came.
|
||||||
|
self.queue.insert(0, charStack.pop())
|
||||||
|
return "".join(charStack)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
stream = HTMLInputStream("../tests/utf-8-bom.html")
|
||||||
|
|
||||||
|
c = stream.char()
|
||||||
|
while c:
|
||||||
|
line, col = stream.position()
|
||||||
|
if c == u"\n":
|
||||||
|
print "Line %s, Column %s: Line Feed" % (line, col)
|
||||||
|
else:
|
||||||
|
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
|
||||||
|
c = stream.char()
|
||||||
|
print "EOF"
|
106
planet/html5lib/liberalxmlparser.py
Normal file
106
planet/html5lib/liberalxmlparser.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
"""
|
||||||
|
Warning: this module is experimental and subject to change and even removal
|
||||||
|
at any time.
|
||||||
|
|
||||||
|
For background/rationale, see:
|
||||||
|
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||||
|
* http://tinyurl.com/ylfj8k (and follow-ups)
|
||||||
|
|
||||||
|
References:
|
||||||
|
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||||
|
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||||
|
|
||||||
|
@@TODO:
|
||||||
|
* Build a Treebuilder that produces Python DOM objects:
|
||||||
|
http://docs.python.org/lib/module-xml.dom.html
|
||||||
|
* Produce SAX events based on the produced DOM. This is intended not to
|
||||||
|
support streaming, but rather to support application level compatibility.
|
||||||
|
* Optional namespace support
|
||||||
|
* Special case the output of XHTML <script> elements so that the empty
|
||||||
|
element syntax is never used, even when the src attribute is provided.
|
||||||
|
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
|
||||||
|
indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
|
||||||
|
* Map illegal XML characters to U+FFFD, possibly with additional markup in
|
||||||
|
the case of XHTML
|
||||||
|
* Selectively lowercase only XHTML, but not foreign markup
|
||||||
|
"""
|
||||||
|
|
||||||
|
import html5parser
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
class XHTMLParser(html5parser.HTMLParser):
|
||||||
|
""" liberal XMTHML parser """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||||
|
|
||||||
|
def normalizeToken(self, token):
|
||||||
|
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||||
|
# We need to remove the duplicate attributes and convert attributes
|
||||||
|
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||||
|
|
||||||
|
# AT When Python 2.4 is widespread we should use
|
||||||
|
# dict(reversed(token.data))
|
||||||
|
token["data"] = dict(token["data"][::-1])
|
||||||
|
|
||||||
|
# For EmptyTags, process both a Start and an End tag
|
||||||
|
if token["type"] == "EmptyTag":
|
||||||
|
self.phase.processStartTag(token["name"], token["data"])
|
||||||
|
token["data"] = {}
|
||||||
|
token["type"] = "EndTag"
|
||||||
|
|
||||||
|
return token
|
||||||
|
|
||||||
|
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||||
|
def insertHtmlElement(self):
|
||||||
|
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
||||||
|
self.tree.openElements.append(element)
|
||||||
|
self.tree.document.appendChild(element)
|
||||||
|
self.parser.phase = self.parser.phases["beforeHead"]
|
||||||
|
|
||||||
|
class XMLParser(XHTMLParser):
|
||||||
|
""" liberal XML parser """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
XHTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||||
|
|
||||||
|
class XmlRootPhase(html5parser.Phase):
|
||||||
|
""" Prime the Xml parser """
|
||||||
|
def __getattr__(self, name):
|
||||||
|
self.tree.openElements.append(self.tree.document)
|
||||||
|
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
||||||
|
return getattr(self.parser.phase, name)
|
||||||
|
|
||||||
|
class XmlElementPhase(html5parser.Phase):
|
||||||
|
""" Generic handling for all XML elements """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
html5parser.Phase.__init__(self, *args, **kwargs)
|
||||||
|
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
||||||
|
self.startTagHandler.default = self.startTagOther
|
||||||
|
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
||||||
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
||||||
|
def startTagOther(self, name, attributes):
|
||||||
|
element = self.tree.createElement(name, attributes)
|
||||||
|
self.tree.openElements[-1].appendChild(element)
|
||||||
|
self.tree.openElements.append(element)
|
||||||
|
|
||||||
|
def endTagOther(self, name):
|
||||||
|
for node in self.tree.openElements[::-1]:
|
||||||
|
if node.name == name:
|
||||||
|
self.tree.generateImpliedEndTags()
|
||||||
|
if self.tree.openElements[-1].name != name:
|
||||||
|
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||||
|
"."))
|
||||||
|
while self.tree.openElements.pop() != node:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
self.parser.parseError()
|
||||||
|
|
||||||
|
def processCharacters(self, data):
|
||||||
|
self.tree.insertText(data)
|
745
planet/html5lib/tokenizer.py
Normal file
745
planet/html5lib/tokenizer.py
Normal file
@ -0,0 +1,745 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from constants import contentModelFlags, spaceCharacters
|
||||||
|
from constants import entitiesWindows1252, entities
|
||||||
|
from constants import asciiLowercase, asciiLetters
|
||||||
|
from constants import digits, hexDigits, EOF
|
||||||
|
|
||||||
|
from inputstream import HTMLInputStream
|
||||||
|
|
||||||
|
class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.states
      Holds a mapping between states and methods that implement the state.

    * self.stream
      Points to HTMLInputStream object.
    """

    # XXX need to fix documentation

    def __init__(self, stream, encoding=None):
        """Wrap *stream* in an HTMLInputStream (which handles encoding
        detection and character push-back) and set up the state table."""
        self.stream = HTMLInputStream(stream, encoding)

        # Dispatch table: state name -> bound state method.  Each state
        # method consumes character(s) and returns True, or False once
        # tokenization has finished (EOF in the data state).
        self.states = {
            "data":self.dataState,
            "entityData":self.entityDataState,
            "tagOpen":self.tagOpenState,
            "closeTagOpen":self.closeTagOpenState,
            "tagName":self.tagNameState,
            "beforeAttributeName":self.beforeAttributeNameState,
            "attributeName":self.attributeNameState,
            "afterAttributeName":self.afterAttributeNameState,
            "beforeAttributeValue":self.beforeAttributeValueState,
            "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
            "bogusComment":self.bogusCommentState,
            "markupDeclarationOpen":self.markupDeclarationOpenState,
            "comment":self.commentState,
            "commentDash":self.commentDashState,
            "commentEnd":self.commentEndState,
            "doctype":self.doctypeState,
            "beforeDoctypeName":self.beforeDoctypeNameState,
            "doctypeName":self.doctypeNameState,
            "afterDoctypeName":self.afterDoctypeNameState,
            "bogusDoctype":self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

        # Tokens to be processed.
        self.tokenQueue = []
|
||||||
|
|
||||||
|
def __iter__(self):
    """ This is where the magic happens.

    We do our usually processing through the states and when we have a token
    to return we yield the token which pauses processing until the next token
    is requested.
    """
    self.stream.reset()
    self.tokenQueue = []
    # Start processing. When EOF is reached self.state will return False
    # instead of True and the loop will terminate.
    while self.state():
        # A single state call may have queued several tokens; drain
        # them all before advancing the state machine again.
        while self.tokenQueue:
            yield self.tokenQueue.pop(0)
|
||||||
|
|
||||||
|
# Below are various helper functions the tokenizer states use worked out.
|
||||||
|
def processSolidusInTag(self):
    """If the next character is a '>', convert the currentToken into
    an EmptyTag
    """

    # We need to consume another character to make sure it's a ">"
    data = self.stream.char()

    if self.currentToken["type"] == "StartTag" and data == u">":
        self.currentToken["type"] = "EmptyTag"
    else:
        # A "/" anywhere else in a tag (or inside an end tag) is an error.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Solidus (/) incorrectly placed in tag.")})

    # The character we just consumed need to be put back on the stack so it
    # doesn't get lost...
    self.stream.queue.append(data)
|
||||||
|
|
||||||
|
def consumeNumberEntity(self, isHex):
    """This function returns either U+FFFD or the character based on the
    decimal or hexadecimal representation. It also discards ";" if present.
    If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
    """

    allowed = digits
    radix = 10
    if isHex:
        allowed = hexDigits
        radix = 16

    # Fallback replacement character if conversion fails below.
    char = u"\uFFFD"
    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    # NOTE(review): the caller guarantees at least one digit was queued
    # before this method is invoked, so the join is never empty.
    charAsInt = int("".join(charStack), radix)

    # If the integer is between 127 and 160 (so 128 and bigger and 159 and
    # smaller) we need to do the "windows trick".
    if 127 < charAsInt < 160:
        #XXX - removed parse error from windows 1252 entity for now
        #we may want to reenable this later
        #self.tokenQueue.append({"type": "ParseError", "data":
        #  _("Entity used with illegal number (windows-1252 reference).")})

        charAsInt = entitiesWindows1252[charAsInt - 128]

    # 0 is not a good number.
    if charAsInt == 0:
        charAsInt = 65533

    try:
        # XXX We should have a separate function that does "int" to
        # "unicodestring" conversion since this doesn't always work
        # according to hsivonen. Also, unichr has a limitation of 65535
        char = unichr(charAsInt)
    except:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity couldn't be converted to character.")})

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != u";":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity didn't end with ';'.")})
        self.stream.queue.append(c)

    return char
|
||||||
|
|
||||||
|
def consumeEntity(self):
    """Consume a character reference after an "&" has been seen.

    Returns the replacement character (a unicode string) on success, or
    None when no entity could be consumed; in the latter case every
    consumed character is pushed back onto the stream queue so the
    caller can emit a literal "&".  Parse errors are appended to
    self.tokenQueue as they are found.
    """
    char = None
    charStack = [self.stream.char()]
    if charStack[0] == u"#":
        # We might have a number entity here.
        charStack.extend([self.stream.char(), self.stream.char()])
        if EOF in charStack:
            # If we reach the end of the file put everything up to EOF
            # back in the queue
            charStack = charStack[:charStack.index(EOF)]
            self.stream.queue.extend(charStack)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity expected. Got end of file instead.")})
        else:
            if charStack[1].lower() == u"x" \
              and charStack[2] in hexDigits:
                # Hexadecimal entity detected.  Push the first digit
                # back so consumeNumberEntity sees it.
                self.stream.queue.append(charStack[2])
                char = self.consumeNumberEntity(True)
            elif charStack[1] in digits:
                # Decimal entity detected.
                self.stream.queue.extend(charStack[1:])
                char = self.consumeNumberEntity(False)
            else:
                # No number entity detected.
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected but none found.")})
    # Break out if we reach the end of the file
    elif charStack[0] == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Entity expected. Got end of file instead.")})
    else:
        # At this point in the process might have named entity. Entities
        # are stored in the global variable "entities".
        #
        # Consume characters and compare to these to a substring of the
        # entity names in the list until the substring no longer matches.
        filteredEntityList = [e for e in entities if \
          e.startswith(charStack[0])]

        def entitiesStartingWith(name):
            return [e for e in filteredEntityList if e.startswith(name)]

        while charStack[-1] != EOF and\
          entitiesStartingWith("".join(charStack)):
            charStack.append(self.stream.char())

        # At this point we have a string that starts with some characters
        # that may match an entity
        entityName = None

        # Try to find the longest entity the string will match
        for entityLength in xrange(len(charStack)-1,1,-1):
            possibleEntityName = "".join(charStack[:entityLength])
            if possibleEntityName in entities:
                entityName = possibleEntityName
                break

        if entityName is not None:
            char = entities[entityName]

            # Check whether or not the last character returned can be
            # discarded or needs to be put back.
            if not charStack[-1] == ";":
                # Fixed garbled message (was "Named entity did not ';'.").
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity didn't end with ';'.")})
                # Everything beyond the matched entity goes back on the
                # stream for normal tokenization.
                self.stream.queue.extend(charStack[entityLength:])
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Named entity expected. Got none.")})
            self.stream.queue.extend(charStack)
    return char
|
||||||
|
|
||||||
|
def processEntityInAttribute(self):
    """Consume an entity inside an attribute value and append its
    replacement text — or a literal "&" when nothing was consumed — to
    the value of the attribute currently being built.

    This method replaces the need for "entityInAttributeValueState".
    """
    # consumeEntity returns None when no entity matched; fall back to "&".
    self.currentToken["data"][-1][1] += self.consumeEntity() or u"&"
|
||||||
|
|
||||||
|
def emitCurrentToken(self):
    """This method is a generic handler for emitting the StartTag,
    EndTag, Comment and Doctype. It also sets the state to
    "data" because that's what's needed after a token has been emitted.
    """

    # Although isinstance() is http://www.canonical.org/~kragen/isinstance/
    # considered harmful it should be ok here given that the classes are for
    # internal usage.

    token = self.currentToken

    # If an end tag has attributes it's a parse error and they should
    # be removed
    if token["type"] == "EndTag" and token["data"]:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("End tag contains unexpected attributes.")})
        token["data"] = {}

    # Add token to the queue to be yielded
    self.tokenQueue.append(token)
    self.state = self.states["data"]
|
||||||
|
|
||||||
|
def emitCurrentTokenWithParseError(self, data=None):
    # XXX if we want useful error messages we need to inline this method
    """This method is equivalent to emitCurrentToken (well, it invokes it)
    except that it also puts "data" back on the characters queue if a data
    argument is provided and it throws a parse error."""
    if data:
        # Push the offending character back so it is re-tokenized in
        # the data state after the current token is emitted.
        self.stream.queue.append(data)
    self.tokenQueue.append({"type": "ParseError", "data":
      _("XXX Something is wrong with the emitted token.")})
    self.emitCurrentToken()
|
||||||
|
|
||||||
|
def attributeValueQuotedStateHandler(self, quoteType):
    """Shared implementation for the single- and double-quoted attribute
    value states; quoteType is the terminating quote character."""
    data = self.stream.char()
    if data == quoteType:
        # Closing quote: the value is complete.
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Bulk-consume ordinary characters up to the next quote or "&".
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
          (quoteType, u"&"))
|
||||||
|
|
||||||
|
# Below are the various tokenizer states worked out.
|
||||||
|
|
||||||
|
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||||
|
# documents to figure out what the order of the various if and elif
|
||||||
|
# statements should be.
|
||||||
|
|
||||||
|
def dataState(self):
    """The main "data" state: emit character/space tokens and dispatch
    to the entity or tag-open states.  Returns False at EOF to stop
    the tokenizer loop, True otherwise."""
    data = self.stream.char()
    # Entities are only recognised in PCDATA and RCDATA content.
    if data == u"&" and self.contentModelFlag in\
      (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
        self.state = self.states["entityData"]
    # "<" opens a tag everywhere except in PLAINTEXT content.
    elif data == u"<" and self.contentModelFlag !=\
      contentModelFlags["PLAINTEXT"]:
        self.state = self.states["tagOpen"]
    elif data == EOF:
        # Tokenization ends.
        return False
    elif data in spaceCharacters:
        # Directly after emitting a token you switch back to the "data
        # state". At that point spaceCharacters are important so they are
        # emitted separately.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        self.tokenQueue.append({"type": "SpaceCharacters", "data":
          data + self.stream.charsUntil(spaceCharacters, True)})
    else:
        self.tokenQueue.append({"type": "Characters", "data":
          data + self.stream.charsUntil((u"&", u"<"))})
    return True
|
||||||
|
|
||||||
|
def entityDataState(self):
    """Handle an "&" seen in the data state: emit the entity's
    replacement text as a Characters token, or a literal "&" when no
    entity could be consumed, then return to the data state."""
    consumed = self.consumeEntity()
    self.tokenQueue.append(
        {"type": "Characters", "data": consumed or u"&"})
    self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def tagOpenState(self):
    """Handle the character following "<".  In PCDATA this can start a
    tag, comment, doctype or bogus comment; in RCDATA/CDATA only "</"
    is special and everything else is literal text."""
    data = self.stream.char()
    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        if data == u"!":
            self.state = self.states["markupDeclarationOpen"]
        elif data == u"/":
            self.state = self.states["closeTagOpen"]
        elif data in asciiLetters:
            self.currentToken =\
              {"type": "StartTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '>' instead.")})
            self.tokenQueue.append({"type": "Characters", "data": u"<>"})
            self.state = self.states["data"]
        elif data == u"?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got something else instead")})
            # XXX can't we do "<" + data here?
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    else:
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == u"/":
            self.state = self.states["closeTagOpen"]
        else:
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def closeTagOpenState(self):
    """Handle "</".  In RCDATA/CDATA a lookahead decides whether this
    really closes the current element (switching back to PCDATA) or is
    literal text; in PCDATA an EndTag token is started."""
    if self.contentModelFlag in (contentModelFlags["RCDATA"],\
      contentModelFlags["CDATA"]):
        charStack = []

        # So far we know that "</" has been consumed. We now need to know
        # whether the next few characters match the name of last emitted
        # start tag which also happens to be the currentToken. We also need
        # to have the character directly after the characters that could
        # match the start tag name.
        for x in xrange(len(self.currentToken["name"]) + 1):
            charStack.append(self.stream.char())
            # Make sure we don't get hit by EOF
            if charStack[-1] == EOF:
                break

        # Since this is just for checking. We put the characters back on
        # the stack.
        self.stream.queue.extend(charStack)

        if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
          and charStack[-1] in (spaceCharacters |
          frozenset((u">", u"/", u"<", EOF))):
            # Because the characters are correct we can safely switch to
            # PCDATA mode now. This also means we don't have to do it when
            # emitting the end tag token.
            self.contentModelFlag = contentModelFlags["PCDATA"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag after seeing '</'. None found.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]

            # Need to return here since we don't want the rest of the
            # method to be walked through.
            return True

    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken =\
              {"type": "EndTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected end of file.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be '...
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected character '" + data + "' found.")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
    return True
|
||||||
|
|
||||||
|
def tagNameState(self):
    """Accumulate the tag name of the current Start/End tag token."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data in asciiLetters:
        # Bulk-consume consecutive letters for speed.
        self.currentToken["name"] += data +\
          self.stream.charsUntil(asciiLetters, True)
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    else:
        # Any other character is simply part of the name.
        self.currentToken["name"] += data
    return True
|
||||||
|
|
||||||
|
def beforeAttributeNameState(self):
    """Skip whitespace inside a tag and start a new attribute when a
    name character arrives."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data in asciiLetters:
        # New attribute: [name, value] pair, value filled in later.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"/":
        self.processSolidusInTag()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||||
|
|
||||||
|
def attributeNameState(self):
    """Accumulate the current attribute's name; on leaving the state,
    report duplicate attribute names as parse errors."""
    data = self.stream.char()
    leavingThisState = True
    if data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data in asciiLetters:
        self.currentToken["data"][-1][0] += data +\
          self.stream.charsUntil(asciiLetters, True)
        leavingThisState = False
    elif data == u">":
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
        pass
    elif data in spaceCharacters:
        self.state = self.states["afterAttributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
        leavingThisState = False
    else:
        self.currentToken["data"][-1][0] += data
        leavingThisState = False

    if leavingThisState:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        for name, value in self.currentToken["data"][:-1]:
            if self.currentToken["data"][-1][0] == name:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Dropped duplicate attribute on tag.")})
        # XXX Fix for above XXX
        if data == u">":
            self.emitCurrentToken()
    return True
|
||||||
|
|
||||||
|
def afterAttributeNameState(self):
    """After an attribute name and whitespace: either a value follows
    ("="), the tag ends, or a new attribute starts."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data == u">":
        self.emitCurrentToken()
    elif data in asciiLetters:
        # Previous attribute had no value; start a new one.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||||
|
|
||||||
|
def beforeAttributeValueState(self):
    """Determine how the attribute value is delimited (double-quoted,
    single-quoted, or unquoted)."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"\"":
        self.state = self.states["attributeValueDoubleQuoted"]
    elif data == u"&":
        # Entity belongs to an unquoted value; push it back so that
        # state processes it.
        self.state = self.states["attributeValueUnQuoted"]
        self.stream.queue.append(data);
    elif data == u"'":
        self.state = self.states["attributeValueSingleQuoted"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"][-1][1] += data
        self.state = self.states["attributeValueUnQuoted"]
    return True
|
||||||
|
|
||||||
|
def attributeValueDoubleQuotedState(self):
    """Tokenize a double-quoted attribute value."""
    # AT We could also let self.attributeValueQuotedStateHandler always
    # return true and then return that directly here. Not sure what is
    # faster or better...
    self.attributeValueQuotedStateHandler(u"\"")
    return True
|
||||||
|
|
||||||
|
def attributeValueSingleQuotedState(self):
    """Tokenize a single-quoted attribute value."""
    self.attributeValueQuotedStateHandler(u"'")
    return True
|
||||||
|
|
||||||
|
def attributeValueUnQuotedState(self):
    """Tokenize an unquoted attribute value, terminated by whitespace
    or ">"."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Bulk-consume everything up to the next delimiter.
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
          frozenset(("&", ">","<")) | spaceCharacters)
    return True
|
||||||
|
|
||||||
|
def bogusCommentState(self):
    """Emit everything up to the next ">" (or EOF) as a Comment token."""
    # Make a new comment token and give it as value all the characters
    # until the first > or EOF (charsUntil checks for EOF automatically)
    # and emit it.
    self.tokenQueue.append(
      {"type": "Comment", "data": self.stream.charsUntil((u">"))})

    # Eat the character directly after the bogus comment which is either a
    # ">" or an EOF.
    self.stream.char()
    self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def markupDeclarationOpenState(self):
    """After "<!": decide between a comment ("--"), a DOCTYPE, or a
    bogus comment."""
    charStack = [self.stream.char(), self.stream.char()]
    if charStack == [u"-", u"-"]:
        self.currentToken = {"type": "Comment", "data": ""}
        self.state = self.states["comment"]
    else:
        # Read five more characters to check for "DOCTYPE" (7 total).
        for x in xrange(5):
            charStack.append(self.stream.char())
        # Put in explicit EOF check
        if (not EOF in charStack and
          "".join(charStack).upper() == u"DOCTYPE"):
            # "data" is the doctype's error flag; True until the name
            # proves to be "HTML" (see doctypeNameState).
            self.currentToken =\
              {"type": "Doctype", "name": "", "data": True}
            self.state = self.states["doctype"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected '--' or 'DOCTYPE'. Not found.")})
            self.stream.queue.extend(charStack)
            self.state = self.states["bogusComment"]
    return True
|
||||||
|
|
||||||
|
def commentState(self):
    """Accumulate comment text until a "-" might end the comment."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentDash"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["data"] += data + self.stream.charsUntil(u"-")
    return True
|
||||||
|
|
||||||
|
def commentDashState(self):
    """One "-" seen inside a comment; a second moves to comment-end."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentEnd"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # The lone "-" was comment text after all.
        self.currentToken["data"] += u"-" + data +\
          self.stream.charsUntil(u"-")
        # Consume the next character which is either a "-" or an EOF as
        # well so if there's a "-" directly after the "-" we go nicely to
        # the "comment end state" without emitting a ParseError() there.
        self.stream.char()
    return True
|
||||||
|
|
||||||
|
def commentEndState(self):
    """After "--" inside a comment: ">" closes it, anything else is a
    parse error and comment text."""
    data = self.stream.char()
    if data == u">":
        # XXX EMIT
        self.emitCurrentToken()
    elif data == u"-":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected '-' after '--' found in comment.")})
        self.currentToken["data"] += data
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # XXX
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in comment found.")})
        # The "--" turned out to be comment text; keep accumulating.
        self.currentToken["data"] += u"--" + data
        self.state = self.states["comment"]
    return True
|
||||||
|
|
||||||
|
def doctypeState(self):
    """Handle the character after the literal string "DOCTYPE": a space
    is expected; anything else is a parse error and is pushed back so
    the name state can consume it."""
    char = self.stream.char()
    if char not in spaceCharacters:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("No space after literal string 'DOCTYPE'.")})
        # The character belongs to the doctype name; re-queue it.
        self.stream.queue.append(char)
    # Either way, the next state reads the doctype name.
    self.state = self.states["beforeDoctypeName"]
    return True
|
||||||
|
|
||||||
|
def beforeDoctypeNameState(self):
    """Skip whitespace before the doctype name, then start it."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data in asciiLowercase:
        # Doctype names are compared uppercase (see doctypeNameState).
        self.currentToken["name"] = data.upper()
        self.state = self.states["doctypeName"]
    elif data == u">":
        # Character needs to be consumed per the specification so don't
        # invoke emitCurrentTokenWithParseError with "data" as argument.
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["name"] = data
        self.state = self.states["doctypeName"]
    return True
|
||||||
|
|
||||||
|
def doctypeNameState(self):
    """Accumulate the doctype name; clear the token's error flag once
    the name spells "HTML"."""
    data = self.stream.char()
    needsDoctypeCheck = False
    if data in spaceCharacters:
        self.state = self.states["afterDoctypeName"]
        needsDoctypeCheck = True
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if data in asciiLowercase:
            data = data.upper()
        self.currentToken["name"] += data
        needsDoctypeCheck = True

    # After some iterations through this state it should eventually say
    # "HTML". Otherwise there's an error.
    if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
        self.currentToken["data"] = False
    return True
|
||||||
|
|
||||||
|
def afterDoctypeNameState(self):
    """After the doctype name: only whitespace or ">" is valid;
    anything else marks the doctype in error and goes bogus."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # Mark the doctype as being in error before emitting.
        self.currentToken["data"] = True
        # XXX EMIT
        self.emitCurrentTokenWithParseError(data)
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Expected space or '>'. Got '" + data + "'")})
        self.currentToken["data"] = True
        self.state = self.states["bogusDoctype"]
    return True
|
||||||
|
|
||||||
|
def bogusDoctypeState(self):
    """Discard characters until ">" closes the malformed doctype."""
    data = self.stream.char()
    if data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError(data)
    else:
        pass
    return True
|
36
planet/html5lib/treebuilders/__init__.py
Executable file
36
planet/html5lib/treebuilders/__init__.py
Executable file
@ -0,0 +1,36 @@
|
|||||||
|
"""A collection of modules for building different kinds of tree from
|
||||||
|
HTML documents.
|
||||||
|
|
||||||
|
To create a treebuilder for a new type of tree, you need to do
|
||||||
|
implement several things:
|
||||||
|
|
||||||
|
1) A set of classes for various types of elements: Document, Doctype,
|
||||||
|
Comment, Element. These must implement the interface of
|
||||||
|
_base.treebuilders.Node (although comment nodes have a different
|
||||||
|
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||||
|
Textual content may also be implemented as another node type, or not, as
|
||||||
|
your tree implementation requires.
|
||||||
|
|
||||||
|
2) A treebuilder object (called TreeBuilder by convention) that
|
||||||
|
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||||
|
documentClass - the class to use for the bottommost node of a document
|
||||||
|
elementClass - the class to use for HTML Elements
|
||||||
|
commentClass - the class to use for comments
|
||||||
|
doctypeClass - the class to use for doctypes
|
||||||
|
It also has one required method:
|
||||||
|
getDocument - Returns the root node of the complete document tree
|
||||||
|
|
||||||
|
3) If you wish to run the unit tests, you must also create a
|
||||||
|
testSerializer method on your treebuilder which accepts a node and
|
||||||
|
returns a string containing Node and its children serialized according
|
||||||
|
to the format used in the unittests
|
||||||
|
|
||||||
|
The supplied simpletree module provides a python-only implementation
|
||||||
|
of a full treebuilder and is a useful reference for the semantics of
|
||||||
|
the various methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os.path
|
||||||
|
__path__.append(os.path.dirname(__path__[0]))
|
||||||
|
|
||||||
|
import dom, etree, simpletree
|
312
planet/html5lib/treebuilders/_base.py
Executable file
312
planet/html5lib/treebuilders/_base.py
Executable file
@ -0,0 +1,312 @@
|
|||||||
|
from constants import scopingElements, tableInsertModeElements
|
||||||
|
|
||||||
|
# The scope markers are inserted when entering buttons, object elements,
|
||||||
|
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||||
|
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||||
|
Marker = None
|
||||||
|
|
||||||
|
#XXX - TODO; make the default interface more ElementTree-like
|
||||||
|
# rather than DOM-like
|
||||||
|
|
||||||
|
class Node(object):
|
||||||
|
def __init__(self, name):
|
||||||
|
"""Node representing an item in the tree.
|
||||||
|
name - The tag name associated with the node
|
||||||
|
parent - The parent of the current node (or None for the document node)
|
||||||
|
value - The value of the current node (applies to text nodes and
|
||||||
|
comments
|
||||||
|
attributes - a dict holding name, value pairs for attributes of the node
|
||||||
|
childNodes - a list of child nodes of the current node. This must
|
||||||
|
include all elements but not necessarily other node types
|
||||||
|
_flags - A list of miscellaneous flags that can be set on the node
|
||||||
|
"""
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self.value = None
|
||||||
|
self.attributes = {}
|
||||||
|
self.childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
attributesStr = " ".join(["%s=\"%s\""%(name, value)
|
||||||
|
for name, value in
|
||||||
|
self.attributes.iteritems()])
|
||||||
|
if attributesStr:
|
||||||
|
return "<%s %s>"%(self.name,attributesStr)
|
||||||
|
else:
|
||||||
|
return "<%s>"%(self.name)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<%s %s>" % (self.__class__, self.name)
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
"""Insert node as a child of the current node
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
"""Insert data as text in the current node, positioned before the
|
||||||
|
start of node insertBefore or to the end of the node's text.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
"""Insert node as a child of the current node, before refNode in the
|
||||||
|
list of child nodes. Raises ValueError if refNode is not a child of
|
||||||
|
the current node"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
"""Remove node from the children of the current node
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
"""Move all the children of the current node to newParent.
|
||||||
|
This is needed so that trees that don't store text as nodes move the
|
||||||
|
text in the correct way
|
||||||
|
"""
|
||||||
|
#XXX - should this method be made more general?
|
||||||
|
for child in self.childNodes:
|
||||||
|
newParent.appendChild(child)
|
||||||
|
self.childNodes = []
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
"""Return a shallow copy of the current node i.e. a node with the same
|
||||||
|
name and attributes but with no parent or child nodes
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text, false otherwise
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
class TreeBuilder(object):
|
||||||
|
"""Base treebuilder implementation
|
||||||
|
documentClass - the class to use for the bottommost node of a document
|
||||||
|
elementClass - the class to use for HTML Elements
|
||||||
|
commentClass - the class to use for comments
|
||||||
|
doctypeClass - the class to use for doctypes
|
||||||
|
"""
|
||||||
|
|
||||||
|
#Document class
|
||||||
|
documentClass = None
|
||||||
|
|
||||||
|
#The class to use for creating a node
|
||||||
|
elementClass = None
|
||||||
|
|
||||||
|
#The class to use for creating comments
|
||||||
|
commentClass = None
|
||||||
|
|
||||||
|
#The class to use for creating doctypes
|
||||||
|
doctypeClass = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.openElements = []
|
||||||
|
self.activeFormattingElements = []
|
||||||
|
|
||||||
|
#XXX - rename these to headElement, formElement
|
||||||
|
self.headPointer = None
|
||||||
|
self.formPointer = None
|
||||||
|
|
||||||
|
self.insertFromTable = False
|
||||||
|
|
||||||
|
self.document = self.documentClass()
|
||||||
|
|
||||||
|
def elementInScope(self, target, tableVariant=False):
|
||||||
|
# Exit early when possible.
|
||||||
|
if self.openElements[-1].name == target:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# AT Use reverse instead of [::-1] when we can rely on Python 2.4
|
||||||
|
# AT How about while True and simply set node to [-1] and set it to
|
||||||
|
# [-2] at the end...
|
||||||
|
for node in self.openElements[::-1]:
|
||||||
|
if node.name == target:
|
||||||
|
return True
|
||||||
|
elif node.name == "table":
|
||||||
|
return False
|
||||||
|
elif not tableVariant and node.name in scopingElements:
|
||||||
|
return False
|
||||||
|
elif node.name == "html":
|
||||||
|
return False
|
||||||
|
assert False # We should never reach this point
|
||||||
|
|
||||||
|
def reconstructActiveFormattingElements(self):
|
||||||
|
# Within this algorithm the order of steps described in the
|
||||||
|
# specification is not quite the same as the order of steps in the
|
||||||
|
# code. It should still do the same though.
|
||||||
|
|
||||||
|
# Step 1: stop the algorithm when there's nothing to do.
|
||||||
|
if not self.activeFormattingElements:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||||
|
i = -1
|
||||||
|
entry = self.activeFormattingElements[i]
|
||||||
|
if entry == Marker or entry in self.openElements:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Step 6
|
||||||
|
while entry != Marker and entry not in self.openElements:
|
||||||
|
# Step 5: let entry be one earlier in the list.
|
||||||
|
i -= 1
|
||||||
|
try:
|
||||||
|
entry = self.activeFormattingElements[i]
|
||||||
|
except:
|
||||||
|
# Step 4: at this point we need to jump to step 8. By not doing
|
||||||
|
# i += 1 which is also done in step 7 we achieve that.
|
||||||
|
break
|
||||||
|
while True:
|
||||||
|
# Step 7
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# Step 8
|
||||||
|
clone = self.activeFormattingElements[i].cloneNode()
|
||||||
|
|
||||||
|
# Step 9
|
||||||
|
element = self.insertElement(clone.name, clone.attributes)
|
||||||
|
|
||||||
|
# Step 10
|
||||||
|
self.activeFormattingElements[i] = element
|
||||||
|
|
||||||
|
# Step 11
|
||||||
|
if element == self.activeFormattingElements[-1]:
|
||||||
|
break
|
||||||
|
|
||||||
|
def clearActiveFormattingElements(self):
|
||||||
|
entry = self.activeFormattingElements.pop()
|
||||||
|
while self.activeFormattingElements and entry != Marker:
|
||||||
|
entry = self.activeFormattingElements.pop()
|
||||||
|
|
||||||
|
def elementInActiveFormattingElements(self, name):
|
||||||
|
"""Check if an element exists between the end of the active
|
||||||
|
formatting elements and the last marker. If it does, return it, else
|
||||||
|
return false"""
|
||||||
|
|
||||||
|
for item in self.activeFormattingElements[::-1]:
|
||||||
|
# Check for Marker first because if it's a Marker it doesn't have a
|
||||||
|
# name attribute.
|
||||||
|
if item == Marker:
|
||||||
|
break
|
||||||
|
elif item.name == name:
|
||||||
|
return item
|
||||||
|
return False
|
||||||
|
|
||||||
|
def insertDoctype(self, name):
|
||||||
|
self.document.appendChild(self.doctypeClass(name))
|
||||||
|
|
||||||
|
def insertComment(self, data, parent=None):
|
||||||
|
if parent is None:
|
||||||
|
parent = self.openElements[-1]
|
||||||
|
parent.appendChild(self.commentClass(data))
|
||||||
|
|
||||||
|
def createElement(self, name, attributes):
|
||||||
|
"""Create an element but don't insert it anywhere"""
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
return element
|
||||||
|
|
||||||
|
def _getInsertFromTable(self):
|
||||||
|
return self._insertFromTable
|
||||||
|
|
||||||
|
def _setInsertFromTable(self, value):
|
||||||
|
"""Switch the function used to insert an element from the
|
||||||
|
normal one to the misnested table one and back again"""
|
||||||
|
self._insertFromTable = value
|
||||||
|
if value:
|
||||||
|
self.insertElement = self.insertElementTable
|
||||||
|
else:
|
||||||
|
self.insertElement = self.insertElementNormal
|
||||||
|
|
||||||
|
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||||
|
|
||||||
|
def insertElementNormal(self, name, attributes):
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
self.openElements[-1].appendChild(element)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def insertElementTable(self, name, attributes):
|
||||||
|
"""Create an element and insert it into the tree"""
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
if self.openElements[-1].name not in tableInsertModeElements:
|
||||||
|
return self.insertElementNormal(name, attributes)
|
||||||
|
else:
|
||||||
|
#We should be in the InTable mode. This means we want to do
|
||||||
|
#special magic element rearranging
|
||||||
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
|
if insertBefore is None:
|
||||||
|
parent.appendChild(element)
|
||||||
|
else:
|
||||||
|
parent.insertBefore(element, insertBefore)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def insertText(self, data, parent=None):
|
||||||
|
"""Insert text data."""
|
||||||
|
if parent is None:
|
||||||
|
parent = self.openElements[-1]
|
||||||
|
|
||||||
|
if (not(self.insertFromTable) or (self.insertFromTable and
|
||||||
|
self.openElements[-1].name not in
|
||||||
|
tableInsertModeElements)):
|
||||||
|
parent.insertText(data)
|
||||||
|
else:
|
||||||
|
#We should be in the InTable mode. This means we want to do
|
||||||
|
#special magic element rearranging
|
||||||
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
|
parent.insertText(data, insertBefore)
|
||||||
|
|
||||||
|
def getTableMisnestedNodePosition(self):
|
||||||
|
"""Get the foster parent element, and sibling to insert before
|
||||||
|
(or None) when inserting a misnested table node"""
|
||||||
|
#The foster parent element is the one which comes before the most
|
||||||
|
#recently opened table element
|
||||||
|
#XXX - this is really inelegant
|
||||||
|
lastTable=None
|
||||||
|
fosterParent = None
|
||||||
|
insertBefore = None
|
||||||
|
for elm in self.openElements[::-1]:
|
||||||
|
if elm.name == u"table":
|
||||||
|
lastTable = elm
|
||||||
|
break
|
||||||
|
if lastTable:
|
||||||
|
#XXX - we should really check that this parent is actually a
|
||||||
|
#node here
|
||||||
|
if lastTable.parent:
|
||||||
|
fosterParent = lastTable.parent
|
||||||
|
insertBefore = lastTable
|
||||||
|
else:
|
||||||
|
fosterParent = self.openElements[
|
||||||
|
self.openElements.index(lastTable) - 1]
|
||||||
|
else:
|
||||||
|
assert self.innerHTML
|
||||||
|
fosterParent = self.openElements[0]
|
||||||
|
return fosterParent, insertBefore
|
||||||
|
|
||||||
|
def generateImpliedEndTags(self, exclude=None):
|
||||||
|
name = self.openElements[-1].name
|
||||||
|
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
||||||
|
and name != exclude):
|
||||||
|
self.openElements.pop()
|
||||||
|
# XXX Until someone has broven that the above breaks stuff I think
|
||||||
|
# we should keep it in.
|
||||||
|
# self.processEndTag(name)
|
||||||
|
self.generateImpliedEndTags(exclude)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
"Return the final tree"
|
||||||
|
return self.document
|
||||||
|
|
||||||
|
def testSerializer(self, node):
|
||||||
|
"""Serialize the subtree of node in the format required by unit tests
|
||||||
|
node - the node from which to start serializing"""
|
||||||
|
raise NotImplementedError
|
127
planet/html5lib/treebuilders/dom.py
Executable file
127
planet/html5lib/treebuilders/dom.py
Executable file
@ -0,0 +1,127 @@
|
|||||||
|
import _base
|
||||||
|
from xml.dom import minidom, Node
|
||||||
|
|
||||||
|
import re
|
||||||
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
|
|
||||||
|
class AttrList:
|
||||||
|
def __init__(self, element):
|
||||||
|
self.element = element
|
||||||
|
def __iter__(self):
|
||||||
|
return self.element.attributes.items().__iter__()
|
||||||
|
def __setitem__(self, name, value):
|
||||||
|
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||||
|
self.element.setAttribute(name, value)
|
||||||
|
def items(self):
|
||||||
|
return self.element.attributes.items()
|
||||||
|
|
||||||
|
class NodeBuilder(_base.Node):
|
||||||
|
def __init__(self, element):
|
||||||
|
_base.Node.__init__(self, element.nodeName)
|
||||||
|
self.element = element
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
node.parent = self
|
||||||
|
self.element.appendChild(node.element)
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||||
|
text = self.element.ownerDocument.createTextNode(data)
|
||||||
|
if insertBefore:
|
||||||
|
self.element.insertBefore(text, insertBefore.element)
|
||||||
|
else:
|
||||||
|
self.element.appendChild(text)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
self.element.insertBefore(node.element, refNode.element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
self.element.removeChild(node.element)
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
while self.element.hasChildNodes():
|
||||||
|
child = self.element.firstChild
|
||||||
|
self.element.removeChild(child)
|
||||||
|
newParent.element.appendChild(child)
|
||||||
|
self.childNodes = []
|
||||||
|
|
||||||
|
def getAttributes(self):
|
||||||
|
return AttrList(self.element)
|
||||||
|
|
||||||
|
def setAttributes(self, attributes):
|
||||||
|
if attributes:
|
||||||
|
for name, value in attributes.items():
|
||||||
|
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||||
|
self.element.setAttribute(name, value)
|
||||||
|
|
||||||
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
return NodeBuilder(self.element.cloneNode(False))
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
return self.element.hasChildNodes()
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
def documentClass(self):
|
||||||
|
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def doctypeClass(self,name):
|
||||||
|
domimpl = minidom.getDOMImplementation()
|
||||||
|
return NodeBuilder(domimpl.createDocumentType(name,None,None))
|
||||||
|
|
||||||
|
def elementClass(self, name):
|
||||||
|
return NodeBuilder(self.dom.createElement(name))
|
||||||
|
|
||||||
|
def commentClass(self, data):
|
||||||
|
return NodeBuilder(self.dom.createComment(data))
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self.dom.appendChild(node.element)
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.dom
|
||||||
|
|
||||||
|
def insertText(self, data, parent=None):
|
||||||
|
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||||
|
if parent <> self:
|
||||||
|
_base.TreeBuilder.insertText(self, data, parent)
|
||||||
|
else:
|
||||||
|
# HACK: allow text nodes as children of the document node
|
||||||
|
if hasattr(self.dom, '_child_node_types'):
|
||||||
|
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||||
|
self.dom._child_node_types=list(self.dom._child_node_types)
|
||||||
|
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||||
|
self.dom.appendChild(self.dom.createTextNode(data))
|
||||||
|
|
||||||
|
name = None
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
element.normalize()
|
||||||
|
rv = []
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||||
|
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||||
|
rv.append("#document")
|
||||||
|
elif element.nodeType == Node.COMMENT_NODE:
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||||
|
elif element.nodeType == Node.TEXT_NODE:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
||||||
|
if element.hasAttributes():
|
||||||
|
for name, value in element.attributes.items():
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.childNodes:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
208
planet/html5lib/treebuilders/etree.py
Executable file
208
planet/html5lib/treebuilders/etree.py
Executable file
@ -0,0 +1,208 @@
|
|||||||
|
try:
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
except ImportError:
|
||||||
|
from elementtree import ElementTree
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
class Element(_base.Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
self._element = ElementTree.Element(name)
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self._childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
#Set the element text and tail to the empty string rather than None
|
||||||
|
#XXX - is this desirable or should we do it on a case by case basis?
|
||||||
|
self._element.text = ""
|
||||||
|
self._element.tail = ""
|
||||||
|
|
||||||
|
def _setName(self, name):
|
||||||
|
self._element.tag = name
|
||||||
|
|
||||||
|
def _getName(self):
|
||||||
|
return self._element.tag
|
||||||
|
|
||||||
|
name = property(_getName, _setName)
|
||||||
|
|
||||||
|
def _getAttributes(self):
|
||||||
|
return self._element.attrib
|
||||||
|
|
||||||
|
def _setAttributes(self, attributes):
|
||||||
|
#Delete existing attributes first
|
||||||
|
#XXX - there may be a better way to do this...
|
||||||
|
for key in self._element.attrib.keys():
|
||||||
|
del self._element.attrib[key]
|
||||||
|
for key, value in attributes.iteritems():
|
||||||
|
self._element.set(key, value)
|
||||||
|
|
||||||
|
attributes = property(_getAttributes, _setAttributes)
|
||||||
|
|
||||||
|
def _getChildNodes(self):
|
||||||
|
return self._childNodes
|
||||||
|
|
||||||
|
def _setChildNodes(self, value):
|
||||||
|
del self._element[:]
|
||||||
|
self._childNodes = []
|
||||||
|
for element in value:
|
||||||
|
self.insertChild(element)
|
||||||
|
|
||||||
|
childNodes = property(_getChildNodes, _setChildNodes)
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text"""
|
||||||
|
return bool(self._element.text or self._element.getchildren())
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self._childNodes.append(node)
|
||||||
|
self._element.append(node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self._element.getchildren().index(refNode._element)
|
||||||
|
self._element.insert(index, node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
self._element.remove(node._element)
|
||||||
|
node.parent=None
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
if not(len(self._element)):
|
||||||
|
self._element.text += data
|
||||||
|
elif insertBefore is None:
|
||||||
|
#Insert the text as the tail of the last child element
|
||||||
|
self._element[-1].tail += data
|
||||||
|
else:
|
||||||
|
#Insert the text before the specified node
|
||||||
|
children = self._element.getchildren()
|
||||||
|
index = children.index(insertBefore._element)
|
||||||
|
if index > 0:
|
||||||
|
self._element[index-1].tail += data
|
||||||
|
else:
|
||||||
|
self._element.text += data
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
element = Element(self.name)
|
||||||
|
element.attributes = self.attributes
|
||||||
|
return element
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
if newParent.childNodes:
|
||||||
|
newParent.childNodes[-1]._element.tail += self._element.text
|
||||||
|
else:
|
||||||
|
newParent._element.text += self._element.text
|
||||||
|
self._element.text = ""
|
||||||
|
_base.Node.reparentChildren(self, newParent)
|
||||||
|
|
||||||
|
class Comment(Element):
|
||||||
|
def __init__(self, data):
|
||||||
|
Element.__init__(self, Comment)
|
||||||
|
self._element.text = data
|
||||||
|
|
||||||
|
def _getData(self):
|
||||||
|
return self._element.text
|
||||||
|
|
||||||
|
def _setData(self, value):
|
||||||
|
self._element.text = value
|
||||||
|
|
||||||
|
data = property(_getData, _setData)
|
||||||
|
|
||||||
|
class DocumentType(Element):
|
||||||
|
def __init__(self, name):
|
||||||
|
Element.__init__(self, DocumentType)
|
||||||
|
self._element.text = name
|
||||||
|
|
||||||
|
class Document(Element):
|
||||||
|
def __init__(self):
|
||||||
|
Element.__init__(self, Document)
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if element.tag is DocumentType:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||||
|
elif element.tag is Document:
|
||||||
|
rv.append("#document")
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
elif element.tag is Comment:
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||||
|
if hasattr(element, "attrib"):
|
||||||
|
for name, value in element.attrib.iteritems():
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||||
|
indent += 2
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child, indent)
|
||||||
|
if element.tail:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
def tostring(element):
|
||||||
|
"""Serialize an element and its child nodes to a string"""
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element):
|
||||||
|
if element.tag is DocumentType:
|
||||||
|
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||||
|
elif element.tag is Document:
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
elif element.tag is Comment:
|
||||||
|
rv.append("<!--%s-->"%(element.text,))
|
||||||
|
else:
|
||||||
|
#This is assumed to be an ordinary element
|
||||||
|
if not element.attrib:
|
||||||
|
rv.append("<%s>"%(element.tag,))
|
||||||
|
else:
|
||||||
|
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||||
|
for name, value in element.attrib.iteritems()])
|
||||||
|
rv.append("<%s %s>"%(element.tag, attr))
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
rv.append("</%s>"%(element.tag,))
|
||||||
|
|
||||||
|
if element.tail:
|
||||||
|
rv.append(element.tail)
|
||||||
|
|
||||||
|
serializeElement(element)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "".join(rv)
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
documentClass = Document
|
||||||
|
doctypeClass = DocumentType
|
||||||
|
elementClass = Element
|
||||||
|
commentClass = Comment
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.document._element
|
153
planet/html5lib/treebuilders/simpletree.py
Executable file
153
planet/html5lib/treebuilders/simpletree.py
Executable file
@ -0,0 +1,153 @@
|
|||||||
|
import _base
|
||||||
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
|
# Really crappy basic implementation of a DOM-core like thing
|
||||||
|
class Node(_base.Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self.value = None
|
||||||
|
self.childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return self.name
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<%s %s>" % (self.__class__, self.name)
|
||||||
|
|
||||||
|
def printTree(self, indent=0):
|
||||||
|
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(indent + 2)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def appendChild(self, node, index=None):
|
||||||
|
if (isinstance(node, TextNode) and self.childNodes and
|
||||||
|
isinstance(self.childNodes[-1], TextNode)):
|
||||||
|
self.childNodes[-1].value += node.value
|
||||||
|
else:
|
||||||
|
self.childNodes.append(node)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
if insertBefore is None:
|
||||||
|
self.appendChild(TextNode(data))
|
||||||
|
else:
|
||||||
|
self.insertBefore(TextNode(data), insertBefore)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self.childNodes.index(refNode)
|
||||||
|
if (isinstance(node, TextNode) and index > 0 and
|
||||||
|
isinstance(self.childNodes[index - 1], TextNode)):
|
||||||
|
self.childNodes[index - 1].value += node.value
|
||||||
|
else:
|
||||||
|
self.childNodes.insert(index, node)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
try:
|
||||||
|
self.childNodes.remove(node)
|
||||||
|
except:
|
||||||
|
# XXX
|
||||||
|
raise
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
newNode = type(self)(self.name)
|
||||||
|
for attr, value in self.attributes.iteritems():
|
||||||
|
newNode.attributes[attr] = value
|
||||||
|
newNode.value = self.value
|
||||||
|
return newNode
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text"""
|
||||||
|
return bool(self.childNodes)
|
||||||
|
|
||||||
|
class Document(Node):
|
||||||
|
def __init__(self):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "#document"
|
||||||
|
|
||||||
|
def printTree(self):
|
||||||
|
tree = unicode(self)
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(2)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def toxml(self, encoding="utf=8"):
|
||||||
|
result = ''
|
||||||
|
for child in self.childNodes:
|
||||||
|
result += child.toxml()
|
||||||
|
return result.encode(encoding)
|
||||||
|
|
||||||
|
class DocumentType(Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
Node.__init__(self, name)
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<!DOCTYPE %s>" % self.name
|
||||||
|
|
||||||
|
class TextNode(Node):
|
||||||
|
def __init__(self, value):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
self.value = value
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "\"%s\"" % self.value
|
||||||
|
|
||||||
|
def toxml(self):
|
||||||
|
return escape(self.value)
|
||||||
|
|
||||||
|
class Element(Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
Node.__init__(self, name)
|
||||||
|
self.attributes = {}
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<%s>" % self.name
|
||||||
|
|
||||||
|
def printTree(self, indent):
|
||||||
|
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||||
|
indent += 2
|
||||||
|
if self.attributes:
|
||||||
|
for name, value in self.attributes.iteritems():
|
||||||
|
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(indent)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def toxml(self):
|
||||||
|
result = '<' + self.name
|
||||||
|
if self.attributes:
|
||||||
|
for name,value in self.attributes.iteritems():
|
||||||
|
result += ' %s="%s"' % (name, escape(value,{'"':'"'}))
|
||||||
|
if self.childNodes:
|
||||||
|
result += '>'
|
||||||
|
for child in self.childNodes:
|
||||||
|
result += child.toxml()
|
||||||
|
result += '</%s>' % self.name
|
||||||
|
else:
|
||||||
|
result += '/>'
|
||||||
|
return result
|
||||||
|
|
||||||
|
class CommentNode(Node):
|
||||||
|
def __init__(self, data):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
self.data = data
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<!-- %s -->" % self.data
|
||||||
|
|
||||||
|
toxml = __unicode__
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
documentClass = Document
|
||||||
|
doctypeClass = DocumentType
|
||||||
|
elementClass = Element
|
||||||
|
commentClass = CommentNode
|
||||||
|
|
||||||
|
def testSerializer(self, node):
|
||||||
|
return node.printTree()
|
36
planet/html5lib/utils.py
Normal file
36
planet/html5lib/utils.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
#Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
class MethodDispatcher(dict):
|
||||||
|
"""Dict with 2 special properties:
|
||||||
|
|
||||||
|
On initiation, keys that are lists, sets or tuples are converted to
|
||||||
|
multiple keys so accessing any one of the items in the original
|
||||||
|
list-like object returns the matching value
|
||||||
|
|
||||||
|
md = MethodDispatcher({("foo", "bar"):"baz"})
|
||||||
|
md["foo"] == "baz"
|
||||||
|
|
||||||
|
A default value which can be set through the default attribute.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, items=()):
|
||||||
|
# Using _dictEntries instead of directly assigning to self is about
|
||||||
|
# twice as fast. Please do careful performance testing before changing
|
||||||
|
# anything here.
|
||||||
|
_dictEntries = []
|
||||||
|
for name,value in items:
|
||||||
|
if type(name) in (list, tuple, frozenset, set):
|
||||||
|
for item in name:
|
||||||
|
_dictEntries.append((item, value))
|
||||||
|
else:
|
||||||
|
_dictEntries.append((name, value))
|
||||||
|
dict.__init__(self, _dictEntries)
|
||||||
|
self.default = None
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
return dict.get(self, key, self.default)
|
@ -15,9 +15,9 @@ Todo:
|
|||||||
"""
|
"""
|
||||||
import re, time, md5, sgmllib
|
import re, time, md5, sgmllib
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom, Node
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
from xml.parsers.expat import ExpatError
|
from planet.html5lib import liberalxmlparser, treebuilders
|
||||||
import planet, config
|
import planet, config
|
||||||
|
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
@ -59,22 +59,6 @@ def cssid(name):
|
|||||||
name = nonalpha.sub('-',name).lower()
|
name = nonalpha.sub('-',name).lower()
|
||||||
return name.strip('-')
|
return name.strip('-')
|
||||||
|
|
||||||
def normalize(text, bozo):
|
|
||||||
""" convert everything to well formed XML """
|
|
||||||
if text.has_key('type'):
|
|
||||||
if text.type.lower().find('html')<0:
|
|
||||||
text['value'] = escape(text.value)
|
|
||||||
text['type'] = 'text/html'
|
|
||||||
if text.type.lower() == 'text/html' or bozo:
|
|
||||||
dom=BeautifulSoup(text.value,convertEntities="html")
|
|
||||||
for tag in dom.findAll(True):
|
|
||||||
for attr,value in tag.attrs:
|
|
||||||
value=sgmllib.charref.sub(ncr2c,value)
|
|
||||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
|
||||||
tag[attr]=value
|
|
||||||
text['value'] = illegal_xml_chars.sub(invalidate, str(dom))
|
|
||||||
return text
|
|
||||||
|
|
||||||
def id(xentry, entry):
|
def id(xentry, entry):
|
||||||
""" copy or compute an id for the entry """
|
""" copy or compute an id for the entry """
|
||||||
|
|
||||||
@ -150,27 +134,32 @@ def author(xentry, name, detail):
|
|||||||
def content(xentry, name, detail, bozo):
|
def content(xentry, name, detail, bozo):
|
||||||
""" insert a content-like element into the entry """
|
""" insert a content-like element into the entry """
|
||||||
if not detail or not detail.value: return
|
if not detail or not detail.value: return
|
||||||
normalize(detail, bozo)
|
|
||||||
|
data = None
|
||||||
|
xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||||
xdoc = xentry.ownerDocument
|
xdoc = xentry.ownerDocument
|
||||||
xcontent = xdoc.createElement(name)
|
xcontent = xdoc.createElement(name)
|
||||||
|
|
||||||
try:
|
|
||||||
# see if the resulting text is a well-formed XML fragment
|
|
||||||
div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
|
||||||
if isinstance(detail.value,unicode):
|
if isinstance(detail.value,unicode):
|
||||||
detail.value=detail.value.encode('utf-8')
|
detail.value=detail.value.encode('utf-8')
|
||||||
data = minidom.parseString(div % detail.value).documentElement
|
|
||||||
|
|
||||||
if detail.value.find('<') < 0:
|
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
||||||
xcontent.appendChild(data.firstChild)
|
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||||
|
for body in html.documentElement.childNodes:
|
||||||
|
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
if body.nodeName != 'body': continue
|
||||||
|
for div in body.childNodes:
|
||||||
|
if div.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
if div.nodeName != 'div': continue
|
||||||
|
div.normalize()
|
||||||
|
if len(div.childNodes) == 1 and \
|
||||||
|
div.firstChild.nodeType == Node.TEXT_NODE:
|
||||||
|
data = div.firstChild
|
||||||
else:
|
else:
|
||||||
|
data = div
|
||||||
xcontent.setAttribute('type', 'xhtml')
|
xcontent.setAttribute('type', 'xhtml')
|
||||||
xcontent.appendChild(data)
|
break
|
||||||
|
|
||||||
except ExpatError:
|
if data: xcontent.appendChild(data)
|
||||||
# leave as html
|
|
||||||
xcontent.setAttribute('type', 'html')
|
|
||||||
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
|
|
||||||
|
|
||||||
if detail.get("language"):
|
if detail.get("language"):
|
||||||
xcontent.setAttribute('xml:lang', detail.language)
|
xcontent.setAttribute('xml:lang', detail.language)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
<!--
|
<!--
|
||||||
Description: illegal control character
|
Description: illegal control character
|
||||||
Expect: content[0].value == u'Page 1<acronym title="U+000c">\ufffd</acronym>Page 2'
|
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<feed xmns="http://www.w3.org/2005/Atom">
|
<feed xmns="http://www.w3.org/2005/Atom">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user