Switch from Beautiful Soup to html5lib
This commit is contained in:
parent
04ca707443
commit
3024af031f
@ -33,8 +33,9 @@
|
||||
<ul>
|
||||
<li><a href="http://www.planetplanet.org/">Planet</a></li>
|
||||
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
|
||||
<li><a href="http://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a></li>
|
||||
<li><a href="http://code.google.com/p/html5lib/">html5lib</a></li>
|
||||
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
|
||||
<li><a href="http://bitworking.org/projects/httplib2/">httplib2</a></li>
|
||||
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
|
||||
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
|
||||
</ul>
|
||||
|
@ -11,7 +11,7 @@
|
||||
<h2>Normalization</h2>
|
||||
<p>Venus builds on, and extends, the <a
|
||||
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
|
||||
href="http://www.crummy.com/software/BeautifulSoup/">BeautifulSoup</a> to
|
||||
href="http://code.google.com/p/html5lib/">html5lib</a> to
|
||||
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
|
||||
meaning that you don't have to worry about funky feeds, tag soup, or character
|
||||
encoding.</p>
|
||||
@ -48,7 +48,7 @@ other security risks are removed.</p>
|
||||
links are resolved</a> within the HTML. This is also done for links
|
||||
in other areas in the feed too.</p>
|
||||
<p>Finally, unmatched tags are closed. This is done with a
|
||||
<a href="http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20HTML">knowledge of the semantics of HTML</a>. Additionally, a
|
||||
<a href="http://code.google.com/p/html5lib/">knowledge of the semantics of HTML</a>. Additionally, a
|
||||
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
|
||||
subset of MathML</a>, as well as a
|
||||
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
|
||||
|
@ -69,7 +69,7 @@
|
||||
<g font-size="32" fill="#FFF" text-anchor="middle">
|
||||
<text x="350" y="380" fill="#F00">Spider</text>
|
||||
<text x="350" y="460">Universal Feed Parser</text>
|
||||
<text x="350" y="530">BeautifulSoup</text>
|
||||
<text x="350" y="530">html5lib</text>
|
||||
<text x="350" y="600">Reconstitute</text>
|
||||
<text x="350" y="750">Filter(s)</text>
|
||||
<text x="850" y="250" fill="#F00">Splice</text>
|
||||
|
Before Width: | Height: | Size: 4.3 KiB After Width: | Height: | Size: 4.3 KiB |
File diff suppressed because it is too large
Load Diff
34
planet/html5lib/__init__.py
Normal file
34
planet/html5lib/__init__.py
Normal file
@ -0,0 +1,34 @@
|
||||
"""
|
||||
HTML parsing library based on the WHATWG "HTML5"
|
||||
specification. The parser is designed to be compatible with existing
|
||||
HTML found in the wild and implements well-defined error recovery that
|
||||
is largely compatible with modern desktop web browsers.
|
||||
|
||||
Example usage:
|
||||
|
||||
import html5lib
|
||||
f = open("my_document.html")
|
||||
p = html5lib.HTMLParser()
|
||||
tree = p.parse(f)
|
||||
|
||||
By default the returned treeformat is a custom "simpletree", similar
|
||||
to a DOM tree; each element has attributes childNodes and parent
|
||||
holding the parents and children respectively, a name attribute
|
||||
holding the Element name, a data attribute holding the element data
|
||||
(for text and comment nodes) and an attributes dictionary holding the
|
||||
element's attributes (for Element nodes).
|
||||
|
||||
To get output in ElementTree format:
|
||||
|
||||
import html5lib
|
||||
from html5lib.treebuilders import etree
|
||||
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
|
||||
elementtree = p.parse(f)
|
||||
|
||||
Note: Because HTML documents support various features not in the
|
||||
default ElementTree (e.g. doctypes), we suppy our own simple
|
||||
serializer; html5lib.treebuilders.etree.tostring At present this does not
|
||||
have the encoding support offered by the elementtree serializer.
|
||||
|
||||
"""
|
||||
from html5parser import HTMLParser
|
456
planet/html5lib/constants.py
Normal file
456
planet/html5lib/constants.py
Normal file
@ -0,0 +1,456 @@
|
||||
import string
|
||||
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
EOF = None
|
||||
|
||||
contentModelFlags = {
|
||||
"PCDATA":0,
|
||||
"RCDATA":1,
|
||||
"CDATA":2,
|
||||
"PLAINTEXT":3
|
||||
}
|
||||
|
||||
scopingElements = frozenset((
|
||||
"button",
|
||||
"caption",
|
||||
"html",
|
||||
"marquee",
|
||||
"object",
|
||||
"table",
|
||||
"td",
|
||||
"th"
|
||||
))
|
||||
|
||||
formattingElements = frozenset((
|
||||
"a",
|
||||
"b",
|
||||
"big",
|
||||
"em",
|
||||
"font",
|
||||
"i",
|
||||
"nobr",
|
||||
"s",
|
||||
"small",
|
||||
"strike",
|
||||
"strong",
|
||||
"tt",
|
||||
"u"
|
||||
))
|
||||
|
||||
specialElements = frozenset((
|
||||
"address",
|
||||
"area",
|
||||
"base",
|
||||
"basefont",
|
||||
"bgsound",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"center",
|
||||
"col",
|
||||
"colgroup",
|
||||
"dd",
|
||||
"dir",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"form",
|
||||
"frame",
|
||||
"frameset",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"hr",
|
||||
"iframe",
|
||||
"image",
|
||||
"img",
|
||||
"input",
|
||||
"isindex",
|
||||
"li",
|
||||
"link",
|
||||
"listing",
|
||||
"menu",
|
||||
"meta",
|
||||
"noembed",
|
||||
"noframes",
|
||||
"noscript",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"p",
|
||||
"param",
|
||||
"plaintext",
|
||||
"pre",
|
||||
"script",
|
||||
"select",
|
||||
"spacer",
|
||||
"style",
|
||||
"tbody",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"title",
|
||||
"tr",
|
||||
"ul",
|
||||
"wbr"
|
||||
))
|
||||
|
||||
spaceCharacters = frozenset((
|
||||
u"\t",
|
||||
u"\n",
|
||||
u"\u000B",
|
||||
u"\u000C",
|
||||
u" "
|
||||
))
|
||||
|
||||
tableInsertModeElements = frozenset((
|
||||
"table",
|
||||
"tbody",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"tr"
|
||||
))
|
||||
|
||||
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||
asciiLetters = frozenset(string.ascii_letters)
|
||||
digits = frozenset(string.digits)
|
||||
hexDigits = frozenset(string.hexdigits)
|
||||
|
||||
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
|
||||
for c in string.ascii_uppercase])
|
||||
|
||||
# Heading elements need to be ordered
|
||||
headingElements = (
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6"
|
||||
)
|
||||
|
||||
# XXX What about event-source and command?
|
||||
voidElements = frozenset((
|
||||
"base",
|
||||
"link",
|
||||
"meta",
|
||||
"hr",
|
||||
"br",
|
||||
"img",
|
||||
"embed",
|
||||
"param",
|
||||
"area",
|
||||
"col",
|
||||
"input"
|
||||
))
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||
# therefore can't be a frozenset.
|
||||
entitiesWindows1252 = (
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
)
|
||||
|
||||
entities = {
|
||||
"AElig": u"\u00C6",
|
||||
"Aacute": u"\u00C1",
|
||||
"Acirc": u"\u00C2",
|
||||
"Agrave": u"\u00C0",
|
||||
"Alpha": u"\u0391",
|
||||
"Aring": u"\u00C5",
|
||||
"Atilde": u"\u00C3",
|
||||
"Auml": u"\u00C4",
|
||||
"Beta": u"\u0392",
|
||||
"Ccedil": u"\u00C7",
|
||||
"Chi": u"\u03A7",
|
||||
"Dagger": u"\u2021",
|
||||
"Delta": u"\u0394",
|
||||
"ETH": u"\u00D0",
|
||||
"Eacute": u"\u00C9",
|
||||
"Ecirc": u"\u00CA",
|
||||
"Egrave": u"\u00C8",
|
||||
"Epsilon": u"\u0395",
|
||||
"Eta": u"\u0397",
|
||||
"Euml": u"\u00CB",
|
||||
"Gamma": u"\u0393",
|
||||
"Iacute": u"\u00CD",
|
||||
"Icirc": u"\u00CE",
|
||||
"Igrave": u"\u00CC",
|
||||
"Iota": u"\u0399",
|
||||
"Iuml": u"\u00CF",
|
||||
"Kappa": u"\u039A",
|
||||
"Lambda": u"\u039B",
|
||||
"Mu": u"\u039C",
|
||||
"Ntilde": u"\u00D1",
|
||||
"Nu": u"\u039D",
|
||||
"OElig": u"\u0152",
|
||||
"Oacute": u"\u00D3",
|
||||
"Ocirc": u"\u00D4",
|
||||
"Ograve": u"\u00D2",
|
||||
"Omega": u"\u03A9",
|
||||
"Omicron": u"\u039F",
|
||||
"Oslash": u"\u00D8",
|
||||
"Otilde": u"\u00D5",
|
||||
"Ouml": u"\u00D6",
|
||||
"Phi": u"\u03A6",
|
||||
"Pi": u"\u03A0",
|
||||
"Prime": u"\u2033",
|
||||
"Psi": u"\u03A8",
|
||||
"Rho": u"\u03A1",
|
||||
"Scaron": u"\u0160",
|
||||
"Sigma": u"\u03A3",
|
||||
"THORN": u"\u00DE",
|
||||
"Tau": u"\u03A4",
|
||||
"Theta": u"\u0398",
|
||||
"Uacute": u"\u00DA",
|
||||
"Ucirc": u"\u00DB",
|
||||
"Ugrave": u"\u00D9",
|
||||
"Upsilon": u"\u03A5",
|
||||
"Uuml": u"\u00DC",
|
||||
"Xi": u"\u039E",
|
||||
"Yacute": u"\u00DD",
|
||||
"Yuml": u"\u0178",
|
||||
"Zeta": u"\u0396",
|
||||
"aacute": u"\u00E1",
|
||||
"acirc": u"\u00E2",
|
||||
"acute": u"\u00B4",
|
||||
"aelig": u"\u00E6",
|
||||
"agrave": u"\u00E0",
|
||||
"alefsym": u"\u2135",
|
||||
"alpha": u"\u03B1",
|
||||
"amp": u"\u0026",
|
||||
"AMP": u"\u0026",
|
||||
"and": u"\u2227",
|
||||
"ang": u"\u2220",
|
||||
"apos": u"\u0027",
|
||||
"aring": u"\u00E5",
|
||||
"asymp": u"\u2248",
|
||||
"atilde": u"\u00E3",
|
||||
"auml": u"\u00E4",
|
||||
"bdquo": u"\u201E",
|
||||
"beta": u"\u03B2",
|
||||
"brvbar": u"\u00A6",
|
||||
"bull": u"\u2022",
|
||||
"cap": u"\u2229",
|
||||
"ccedil": u"\u00E7",
|
||||
"cedil": u"\u00B8",
|
||||
"cent": u"\u00A2",
|
||||
"chi": u"\u03C7",
|
||||
"circ": u"\u02C6",
|
||||
"clubs": u"\u2663",
|
||||
"cong": u"\u2245",
|
||||
"copy": u"\u00A9",
|
||||
"COPY": u"\u00A9",
|
||||
"crarr": u"\u21B5",
|
||||
"cup": u"\u222A",
|
||||
"curren": u"\u00A4",
|
||||
"dArr": u"\u21D3",
|
||||
"dagger": u"\u2020",
|
||||
"darr": u"\u2193",
|
||||
"deg": u"\u00B0",
|
||||
"delta": u"\u03B4",
|
||||
"diams": u"\u2666",
|
||||
"divide": u"\u00F7",
|
||||
"eacute": u"\u00E9",
|
||||
"ecirc": u"\u00EA",
|
||||
"egrave": u"\u00E8",
|
||||
"empty": u"\u2205",
|
||||
"emsp": u"\u2003",
|
||||
"ensp": u"\u2002",
|
||||
"epsilon": u"\u03B5",
|
||||
"equiv": u"\u2261",
|
||||
"eta": u"\u03B7",
|
||||
"eth": u"\u00F0",
|
||||
"euml": u"\u00EB",
|
||||
"euro": u"\u20AC",
|
||||
"exist": u"\u2203",
|
||||
"fnof": u"\u0192",
|
||||
"forall": u"\u2200",
|
||||
"frac12": u"\u00BD",
|
||||
"frac14": u"\u00BC",
|
||||
"frac34": u"\u00BE",
|
||||
"frasl": u"\u2044",
|
||||
"gamma": u"\u03B3",
|
||||
"ge": u"\u2265",
|
||||
"gt": u"\u003E",
|
||||
"GT": u"\u003E",
|
||||
"hArr": u"\u21D4",
|
||||
"harr": u"\u2194",
|
||||
"hearts": u"\u2665",
|
||||
"hellip": u"\u2026",
|
||||
"iacute": u"\u00ED",
|
||||
"icirc": u"\u00EE",
|
||||
"iexcl": u"\u00A1",
|
||||
"igrave": u"\u00EC",
|
||||
"image": u"\u2111",
|
||||
"infin": u"\u221E",
|
||||
"int": u"\u222B",
|
||||
"iota": u"\u03B9",
|
||||
"iquest": u"\u00BF",
|
||||
"isin": u"\u2208",
|
||||
"iuml": u"\u00EF",
|
||||
"kappa": u"\u03BA",
|
||||
"lArr": u"\u21D0",
|
||||
"lambda": u"\u03BB",
|
||||
"lang": u"\u2329",
|
||||
"laquo": u"\u00AB",
|
||||
"larr": u"\u2190",
|
||||
"lceil": u"\u2308",
|
||||
"ldquo": u"\u201C",
|
||||
"le": u"\u2264",
|
||||
"lfloor": u"\u230A",
|
||||
"lowast": u"\u2217",
|
||||
"loz": u"\u25CA",
|
||||
"lrm": u"\u200E",
|
||||
"lsaquo": u"\u2039",
|
||||
"lsquo": u"\u2018",
|
||||
"lt": u"\u003C",
|
||||
"LT": u"\u003C",
|
||||
"macr": u"\u00AF",
|
||||
"mdash": u"\u2014",
|
||||
"micro": u"\u00B5",
|
||||
"middot": u"\u00B7",
|
||||
"minus": u"\u2212",
|
||||
"mu": u"\u03BC",
|
||||
"nabla": u"\u2207",
|
||||
"nbsp": u"\u00A0",
|
||||
"ndash": u"\u2013",
|
||||
"ne": u"\u2260",
|
||||
"ni": u"\u220B",
|
||||
"not": u"\u00AC",
|
||||
"notin": u"\u2209",
|
||||
"nsub": u"\u2284",
|
||||
"ntilde": u"\u00F1",
|
||||
"nu": u"\u03BD",
|
||||
"oacute": u"\u00F3",
|
||||
"ocirc": u"\u00F4",
|
||||
"oelig": u"\u0153",
|
||||
"ograve": u"\u00F2",
|
||||
"oline": u"\u203E",
|
||||
"omega": u"\u03C9",
|
||||
"omicron": u"\u03BF",
|
||||
"oplus": u"\u2295",
|
||||
"or": u"\u2228",
|
||||
"ordf": u"\u00AA",
|
||||
"ordm": u"\u00BA",
|
||||
"oslash": u"\u00F8",
|
||||
"otilde": u"\u00F5",
|
||||
"otimes": u"\u2297",
|
||||
"ouml": u"\u00F6",
|
||||
"para": u"\u00B6",
|
||||
"part": u"\u2202",
|
||||
"permil": u"\u2030",
|
||||
"perp": u"\u22A5",
|
||||
"phi": u"\u03C6",
|
||||
"pi": u"\u03C0",
|
||||
"piv": u"\u03D6",
|
||||
"plusmn": u"\u00B1",
|
||||
"pound": u"\u00A3",
|
||||
"prime": u"\u2032",
|
||||
"prod": u"\u220F",
|
||||
"prop": u"\u221D",
|
||||
"psi": u"\u03C8",
|
||||
"quot": u"\u0022",
|
||||
"QUOT": u"\u0022",
|
||||
"rArr": u"\u21D2",
|
||||
"radic": u"\u221A",
|
||||
"rang": u"\u232A",
|
||||
"raquo": u"\u00BB",
|
||||
"rarr": u"\u2192",
|
||||
"rceil": u"\u2309",
|
||||
"rdquo": u"\u201D",
|
||||
"real": u"\u211C",
|
||||
"reg": u"\u00AE",
|
||||
"REG": u"\u00AE",
|
||||
"rfloor": u"\u230B",
|
||||
"rho": u"\u03C1",
|
||||
"rlm": u"\u200F",
|
||||
"rsaquo": u"\u203A",
|
||||
"rsquo": u"\u2019",
|
||||
"sbquo": u"\u201A",
|
||||
"scaron": u"\u0161",
|
||||
"sdot": u"\u22C5",
|
||||
"sect": u"\u00A7",
|
||||
"shy": u"\u00AD",
|
||||
"sigma": u"\u03C3",
|
||||
"sigmaf": u"\u03C2",
|
||||
"sim": u"\u223C",
|
||||
"spades": u"\u2660",
|
||||
"sub": u"\u2282",
|
||||
"sube": u"\u2286",
|
||||
"sum": u"\u2211",
|
||||
"sup": u"\u2283",
|
||||
"sup1": u"\u00B9",
|
||||
"sup2": u"\u00B2",
|
||||
"sup3": u"\u00B3",
|
||||
"supe": u"\u2287",
|
||||
"szlig": u"\u00DF",
|
||||
"tau": u"\u03C4",
|
||||
"there4": u"\u2234",
|
||||
"theta": u"\u03B8",
|
||||
"thetasym": u"\u03D1",
|
||||
"thinsp": u"\u2009",
|
||||
"thorn": u"\u00FE",
|
||||
"tilde": u"\u02DC",
|
||||
"times": u"\u00D7",
|
||||
"trade": u"\u2122",
|
||||
"uArr": u"\u21D1",
|
||||
"uacute": u"\u00FA",
|
||||
"uarr": u"\u2191",
|
||||
"ucirc": u"\u00FB",
|
||||
"ugrave": u"\u00F9",
|
||||
"uml": u"\u00A8",
|
||||
"upsih": u"\u03D2",
|
||||
"upsilon": u"\u03C5",
|
||||
"uuml": u"\u00FC",
|
||||
"weierp": u"\u2118",
|
||||
"xi": u"\u03BE",
|
||||
"yacute": u"\u00FD",
|
||||
"yen": u"\u00A5",
|
||||
"yuml": u"\u00FF",
|
||||
"zeta": u"\u03B6",
|
||||
"zwj": u"\u200D",
|
||||
"zwnj": u"\u200C"
|
||||
}
|
1719
planet/html5lib/html5parser.py
Normal file
1719
planet/html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
Load Diff
202
planet/html5lib/inputstream.py
Normal file
202
planet/html5lib/inputstream.py
Normal file
@ -0,0 +1,202 @@
|
||||
import codecs
|
||||
import re
|
||||
|
||||
from constants import EOF
|
||||
|
||||
class HTMLInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, encoding=None):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by the HTML5Lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
# List of where new lines occur
|
||||
self.newLines = []
|
||||
|
||||
# Encoding Information
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Raw Stream
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Try to detect the encoding of the stream by looking for a BOM
|
||||
detectedEncoding = self.detectEncoding()
|
||||
|
||||
# If an encoding was specified or detected from the BOM don't allow
|
||||
# the encoding to be changed futher into the stream
|
||||
if self.charEncoding or detectedEncoding:
|
||||
self.allowEncodingOverride = False
|
||||
else:
|
||||
self.allowEncodingOverride = True
|
||||
|
||||
# If an encoding wasn't specified, use the encoding detected from the
|
||||
# BOM, if present, otherwise use the default encoding
|
||||
if not self.charEncoding:
|
||||
self.charEncoding = detectedEncoding or "cp1252"
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||
|
||||
# Normalize new lines and null characters
|
||||
uString = re.sub('\r\n?', '\n', uString)
|
||||
uString = re.sub('\x00', '\xFFFD', uString)
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
self.dataStream = uString
|
||||
|
||||
self.queue = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
self.reset()
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
import cStringIO
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
return stream
|
||||
|
||||
def detectEncoding(self):
|
||||
# Attempts to detect the character encoding of the stream. If
|
||||
# an encoding can be determined from the BOM return the name of the
|
||||
# encoding otherwise return None
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
self.rawStream.seek(0)
|
||||
string = self.rawStream.read(4)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def declareEncoding(self, encoding):
|
||||
"""Report the encoding declared by the meta element
|
||||
|
||||
If the encoding is currently only guessed, then this
|
||||
will read subsequent characters in that encoding.
|
||||
|
||||
If the encoding is not compatible with the guessed encoding
|
||||
and non-US-ASCII characters have been seen, return True indicating
|
||||
parsing will have to begin again.
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
def determineNewLines(self):
|
||||
# Looks through the stream to find where new lines occur so
|
||||
# the position method can tell where it is.
|
||||
self.newLines.append(0)
|
||||
for i in xrange(len(self.dataStream)):
|
||||
if self.dataStream[i] == u"\n":
|
||||
self.newLines.append(i)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
# Generate list of new lines first time around
|
||||
if not self.newLines:
|
||||
self.determineNewLines()
|
||||
|
||||
line = 0
|
||||
tell = self.tell
|
||||
for pos in self.newLines:
|
||||
if pos < tell:
|
||||
line += 1
|
||||
else:
|
||||
break
|
||||
col = tell - self.newLines[line-1] - 1
|
||||
return (line, col)
|
||||
|
||||
def reset(self):
|
||||
"""Resets the position in the stream back to the start."""
|
||||
self.tell = 0
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
if self.queue:
|
||||
return self.queue.pop(0)
|
||||
else:
|
||||
try:
|
||||
self.tell += 1
|
||||
return self.dataStream[self.tell - 1]
|
||||
except:
|
||||
return EOF
|
||||
|
||||
def charsUntil(self, characters, opposite = False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in characters or EOF. characters can be
|
||||
any container that supports the in method being called on it.
|
||||
"""
|
||||
charStack = [self.char()]
|
||||
|
||||
# First from the queue
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
||||
and self.queue:
|
||||
charStack.append(self.queue.pop(0))
|
||||
|
||||
# Then the rest
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||
try:
|
||||
self.tell += 1
|
||||
charStack.append(self.dataStream[self.tell - 1])
|
||||
except:
|
||||
charStack.append(EOF)
|
||||
|
||||
# Put the character stopped on back to the front of the queue
|
||||
# from where it came.
|
||||
self.queue.insert(0, charStack.pop())
|
||||
return "".join(charStack)
|
||||
|
||||
if __name__ == "__main__":
|
||||
stream = HTMLInputStream("../tests/utf-8-bom.html")
|
||||
|
||||
c = stream.char()
|
||||
while c:
|
||||
line, col = stream.position()
|
||||
if c == u"\n":
|
||||
print "Line %s, Column %s: Line Feed" % (line, col)
|
||||
else:
|
||||
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
|
||||
c = stream.char()
|
||||
print "EOF"
|
106
planet/html5lib/liberalxmlparser.py
Normal file
106
planet/html5lib/liberalxmlparser.py
Normal file
@ -0,0 +1,106 @@
|
||||
"""
|
||||
Warning: this module is experimental and subject to change and even removal
|
||||
at any time.
|
||||
|
||||
For background/rationale, see:
|
||||
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||
* http://tinyurl.com/ylfj8k (and follow-ups)
|
||||
|
||||
References:
|
||||
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
|
||||
@@TODO:
|
||||
* Build a Treebuilder that produces Python DOM objects:
|
||||
http://docs.python.org/lib/module-xml.dom.html
|
||||
* Produce SAX events based on the produced DOM. This is intended not to
|
||||
support streaming, but rather to support application level compatibility.
|
||||
* Optional namespace support
|
||||
* Special case the output of XHTML <script> elements so that the empty
|
||||
element syntax is never used, even when the src attribute is provided.
|
||||
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
|
||||
indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
|
||||
* Map illegal XML characters to U+FFFD, possibly with additional markup in
|
||||
the case of XHTML
|
||||
* Selectively lowercase only XHTML, but not foreign markup
|
||||
"""
|
||||
|
||||
import html5parser
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
class XHTMLParser(html5parser.HTMLParser):
|
||||
""" liberal XMTHML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
# AT When Python 2.4 is widespread we should use
|
||||
# dict(reversed(token.data))
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
return token
|
||||
|
||||
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
def insertHtmlElement(self):
|
||||
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
||||
self.tree.openElements.append(element)
|
||||
self.tree.document.appendChild(element)
|
||||
self.parser.phase = self.parser.phases["beforeHead"]
|
||||
|
||||
class XMLParser(XHTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
XHTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
class XmlRootPhase(html5parser.Phase):
|
||||
""" Prime the Xml parser """
|
||||
def __getattr__(self, name):
|
||||
self.tree.openElements.append(self.tree.document)
|
||||
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
||||
return getattr(self.parser.phase, name)
|
||||
|
||||
class XmlElementPhase(html5parser.Phase):
|
||||
""" Generic handling for all XML elements """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.Phase.__init__(self, *args, **kwargs)
|
||||
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
|
||||
def endTagOther(self, name):
|
||||
for node in self.tree.openElements[::-1]:
|
||||
if node.name == name:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
"."))
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
self.parser.parseError()
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.tree.insertText(data)
|
745
planet/html5lib/tokenizer.py
Normal file
745
planet/html5lib/tokenizer.py
Normal file
@ -0,0 +1,745 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from constants import contentModelFlags, spaceCharacters
|
||||
from constants import entitiesWindows1252, entities
|
||||
from constants import asciiLowercase, asciiLetters
|
||||
from constants import digits, hexDigits, EOF
|
||||
|
||||
from inputstream import HTMLInputStream
|
||||
|
||||
class HTMLTokenizer(object):
|
||||
""" This class takes care of tokenizing HTML.
|
||||
|
||||
* self.currentToken
|
||||
Holds the token that is currently being processed.
|
||||
|
||||
* self.state
|
||||
Holds a reference to the method to be invoked... XXX
|
||||
|
||||
* self.states
|
||||
Holds a mapping between states and methods that implement the state.
|
||||
|
||||
* self.stream
|
||||
Points to HTMLInputStream object.
|
||||
"""
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def __init__(self, stream, encoding=None):
|
||||
self.stream = HTMLInputStream(stream, encoding)
|
||||
|
||||
self.states = {
|
||||
"data":self.dataState,
|
||||
"entityData":self.entityDataState,
|
||||
"tagOpen":self.tagOpenState,
|
||||
"closeTagOpen":self.closeTagOpenState,
|
||||
"tagName":self.tagNameState,
|
||||
"beforeAttributeName":self.beforeAttributeNameState,
|
||||
"attributeName":self.attributeNameState,
|
||||
"afterAttributeName":self.afterAttributeNameState,
|
||||
"beforeAttributeValue":self.beforeAttributeValueState,
|
||||
"attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
|
||||
"attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
|
||||
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
|
||||
"bogusComment":self.bogusCommentState,
|
||||
"markupDeclarationOpen":self.markupDeclarationOpenState,
|
||||
"comment":self.commentState,
|
||||
"commentDash":self.commentDashState,
|
||||
"commentEnd":self.commentEndState,
|
||||
"doctype":self.doctypeState,
|
||||
"beforeDoctypeName":self.beforeDoctypeNameState,
|
||||
"doctypeName":self.doctypeNameState,
|
||||
"afterDoctypeName":self.afterDoctypeNameState,
|
||||
"bogusDoctype":self.bogusDoctypeState
|
||||
}
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
self.state = self.states["data"]
|
||||
|
||||
# The current token being created
|
||||
self.currentToken = None
|
||||
|
||||
# Tokens to be processed.
|
||||
self.tokenQueue = []
|
||||
|
||||
def __iter__(self):
|
||||
""" This is where the magic happens.
|
||||
|
||||
We do our usually processing through the states and when we have a token
|
||||
to return we yield the token which pauses processing until the next token
|
||||
is requested.
|
||||
"""
|
||||
self.stream.reset()
|
||||
self.tokenQueue = []
|
||||
# Start processing. When EOF is reached self.state will return False
|
||||
# instead of True and the loop will terminate.
|
||||
while self.state():
|
||||
while self.tokenQueue:
|
||||
yield self.tokenQueue.pop(0)
|
||||
|
||||
# Below are various helper functions the tokenizer states use worked out.
|
||||
def processSolidusInTag(self):
|
||||
"""If the next character is a '>', convert the currentToken into
|
||||
an EmptyTag
|
||||
"""
|
||||
|
||||
# We need to consume another character to make sure it's a ">"
|
||||
data = self.stream.char()
|
||||
|
||||
if self.currentToken["type"] == "StartTag" and data == u">":
|
||||
self.currentToken["type"] = "EmptyTag"
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Solidus (/) incorrectly placed in tag.")})
|
||||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
self.stream.queue.append(data)
|
||||
|
||||
def consumeNumberEntity(self, isHex):
    """This function returns either U+FFFD or the character based on the
    decimal or hexadecimal representation. It also discards ";" if present.
    If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.

    isHex -- True for a "&#x..." reference, False for a decimal "&#..." one.
    """

    allowed = digits
    radix = 10
    if isHex:
        allowed = hexDigits
        radix = 16

    # Fallback replacement character used when conversion fails below.
    char = u"\uFFFD"
    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    # NOTE(review): consumeEntity only calls this after checking that at
    # least one digit follows, so charStack is never empty here.
    charAsInt = int("".join(charStack), radix)

    # If the integer is between 127 and 160 (so 128 and bigger and 159 and
    # smaller) we need to do the "windows trick".
    if 127 < charAsInt < 160:
        #XXX - removed parse error from windows 1252 entity for now
        #we may want to reenable this later
        #self.tokenQueue.append({"type": "ParseError", "data":
        #  _("Entity used with illegal number (windows-1252 reference).")})

        charAsInt = entitiesWindows1252[charAsInt - 128]

    # 0 is not a good number.
    if charAsInt == 0:
        charAsInt = 65533

    try:
        # XXX We should have a separate function that does "int" to
        # "unicodestring" conversion since this doesn't always work
        # according to hsivonen. Also, unichr has a limitation of 65535
        char = unichr(charAsInt)
    except:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity couldn't be converted to character.")})

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != u";":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity didn't end with ';'.")})
        self.stream.queue.append(c)

    return char
|
||||
|
||||
def consumeEntity(self):
    """Consume an entity reference (the "&" has already been read) and
    return its replacement text, or None when no entity could be parsed.

    Numeric references ("&#10;", "&#x0a;") are delegated to
    consumeNumberEntity; named references are matched greedily against the
    global "entities" table, preferring the longest name. Characters that
    turn out not to belong to an entity are pushed back on the stream, and
    a ParseError token is queued for every malformed case.
    """
    char = None
    charStack = [self.stream.char()]
    if charStack[0] == u"#":
        # We might have a number entity here.
        charStack.extend([self.stream.char(), self.stream.char()])
        if EOF in charStack:
            # If we reach the end of the file put everything up to EOF
            # back in the queue
            charStack = charStack[:charStack.index(EOF)]
            self.stream.queue.extend(charStack)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity expected. Got end of file instead.")})
        else:
            if charStack[1].lower() == u"x" \
              and charStack[2] in hexDigits:
                # Hexadecimal entity detected. Push the first digit back so
                # consumeNumberEntity sees it.
                self.stream.queue.append(charStack[2])
                char = self.consumeNumberEntity(True)
            elif charStack[1] in digits:
                # Decimal entity detected.
                self.stream.queue.extend(charStack[1:])
                char = self.consumeNumberEntity(False)
            else:
                # No number entity detected.
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected but none found.")})
    # Break out if we reach the end of the file
    elif charStack[0] == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Entity expected. Got end of file instead.")})
    else:
        # At this point in the process might have named entity. Entities
        # are stored in the global variable "entities".
        #
        # Consume characters and compare these to a substring of the
        # entity names in the list until the substring no longer matches.
        filteredEntityList = [e for e in entities if \
          e.startswith(charStack[0])]

        def entitiesStartingWith(name):
            return [e for e in filteredEntityList if e.startswith(name)]

        while charStack[-1] != EOF and\
          entitiesStartingWith("".join(charStack)):
            charStack.append(self.stream.char())

        # At this point we have a string that starts with some characters
        # that may match an entity
        entityName = None

        # Try to find the longest entity the string will match
        for entityLength in xrange(len(charStack)-1,1,-1):
            possibleEntityName = "".join(charStack[:entityLength])
            if possibleEntityName in entities:
                entityName = possibleEntityName
                break

        if entityName is not None:
            char = entities[entityName]

            # Check whether or not the last character returned can be
            # discarded or needs to be put back.
            if not charStack[-1] == ";":
                # Fixed garbled message ("did not ';'"); wording now matches
                # the numeric-entity error above.
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity didn't end with ';'.")})
                self.stream.queue.extend(charStack[entityLength:])
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Named entity expected. Got none.")})
            self.stream.queue.extend(charStack)
    return char
|
||||
|
||||
def processEntityInAttribute(self):
    """Expand an entity reference inside an attribute value.

    Replaces the need for a separate "entityInAttributeValueState": the
    expanded entity (or a literal "&" when nothing matched) is appended to
    the value of the attribute currently being built.
    """
    entity = self.consumeEntity()
    # consumeEntity returns None on failure, so "or" falls back to "&".
    self.currentToken["data"][-1][1] += entity or u"&"
|
||||
|
||||
def emitCurrentToken(self):
    """This method is a generic handler for emitting the StartTag,
    EndTag, Comment and Doctype. It also sets the state to
    "data" because that's what's needed after a token has been emitted.
    """

    # Although isinstance() is http://www.canonical.org/~kragen/isinstance/
    # considered harmful it should be ok here given that the classes are for
    # internal usage.

    token = self.currentToken

    # If an end tag has attributes it's a parse error and they should
    # be removed
    if token["type"] == "EndTag" and token["data"]:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("End tag contains unexpected attributes.")})
        token["data"] = {}

    # Add token to the queue to be yielded
    self.tokenQueue.append(token)
    self.state = self.states["data"]
|
||||
|
||||
def emitCurrentTokenWithParseError(self, data=None):
    # XXX if we want useful error messages we need to inline this method
    """Emit the current token preceded by a generic parse error.

    Equivalent to emitCurrentToken (which it invokes), except that it also
    pushes "data" back on the character queue when an argument is given,
    and always queues a ParseError token first.
    """
    if data:
        self.stream.queue.append(data)
    self.tokenQueue.append(
        {"type": "ParseError",
         "data": _("XXX Something is wrong with the emitted token.")})
    self.emitCurrentToken()
|
||||
|
||||
def attributeValueQuotedStateHandler(self, quoteType):
    # Shared implementation for the single- and double-quoted attribute
    # value states; quoteType is the closing quote character to watch for.
    data = self.stream.char()
    if data == quoteType:
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Consume a whole run of plain characters in one call for speed.
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
          (quoteType, u"&"))
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||
# documents to figure out what the order of the various if and elif
|
||||
# statements should be.
|
||||
|
||||
def dataState(self):
    """The default state between markup constructs; dispatches on "&",
    "<", EOF, whitespace and plain character data."""
    data = self.stream.char()
    # "&" only starts an entity in PCDATA/RCDATA, never in CDATA/PLAINTEXT.
    if data == u"&" and self.contentModelFlag in\
      (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
        self.state = self.states["entityData"]
    elif data == u"<" and self.contentModelFlag !=\
      contentModelFlags["PLAINTEXT"]:
        self.state = self.states["tagOpen"]
    elif data == EOF:
        # Tokenization ends.
        return False
    elif data in spaceCharacters:
        # Directly after emitting a token you switch back to the "data
        # state". At that point spaceCharacters are important so they are
        # emitted separately.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        self.tokenQueue.append({"type": "SpaceCharacters", "data":
          data + self.stream.charsUntil(spaceCharacters, True)})
    else:
        self.tokenQueue.append({"type": "Characters", "data":
          data + self.stream.charsUntil((u"&", u"<"))})
    return True
|
||||
|
||||
def entityDataState(self):
    """Expand one entity reference in character data, then go back to the
    data state. Emits a literal "&" when no entity matched."""
    entity = self.consumeEntity()
    self.tokenQueue.append({"type": "Characters", "data": entity or u"&"})
    self.state = self.states["data"]
    return True
|
||||
|
||||
def tagOpenState(self):
    """Handle the character directly after a "<"."""
    data = self.stream.char()
    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        if data == u"!":
            self.state = self.states["markupDeclarationOpen"]
        elif data == u"/":
            self.state = self.states["closeTagOpen"]
        elif data in asciiLetters:
            self.currentToken =\
              {"type": "StartTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '>' instead.")})
            self.tokenQueue.append({"type": "Characters", "data": u"<>"})
            self.state = self.states["data"]
        elif data == u"?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got something else instead")})
            # XXX can't we do "<" + data here?
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    else:
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == u"/":
            self.state = self.states["closeTagOpen"]
        else:
            # A "<" that does not open an end tag is just character data.
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    return True
|
||||
|
||||
def closeTagOpenState(self):
    """Handle the character(s) after "</", including the RCDATA/CDATA
    lookahead that decides whether an end tag really ends the current
    element."""
    if self.contentModelFlag in (contentModelFlags["RCDATA"],\
      contentModelFlags["CDATA"]):
        charStack = []

        # So far we know that "</" has been consumed. We now need to know
        # whether the next few characters match the name of last emitted
        # start tag which also happens to be the currentToken. We also need
        # to have the character directly after the characters that could
        # match the start tag name.
        for x in xrange(len(self.currentToken["name"]) + 1):
            charStack.append(self.stream.char())
            # Make sure we don't get hit by EOF
            if charStack[-1] == EOF:
                break

        # Since this is just for checking. We put the characters back on
        # the stack.
        self.stream.queue.extend(charStack)

        if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
          and charStack[-1] in (spaceCharacters |
          frozenset((u">", u"/", u"<", EOF))):
            # Because the characters are correct we can safely switch to
            # PCDATA mode now. This also means we don't have to do it when
            # emitting the end tag token.
            self.contentModelFlag = contentModelFlags["PCDATA"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag after seeing '</'. None found.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]

            # Need to return here since we don't want the rest of the
            # method to be walked through.
            return True

    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken =\
              {"type": "EndTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected end of file.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be '...
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected character '" + data + "' found.")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
    return True
|
||||
|
||||
def tagNameState(self):
    """Accumulate the name of the current start or end tag."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data in asciiLetters:
        # Consume a whole run of letters in one call for speed.
        self.currentToken["name"] += data +\
          self.stream.charsUntil(asciiLetters, True)
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    else:
        self.currentToken["name"] += data
    return True
|
||||
|
||||
def beforeAttributeNameState(self):
    """Skip whitespace between a tag name (or attribute) and the next
    attribute name."""
    data = self.stream.char()
    if data in spaceCharacters:
        # Skip the whole run of whitespace at once.
        self.stream.charsUntil(spaceCharacters, True)
    elif data in asciiLetters:
        # Start a fresh [name, value] pair on the token's attribute list.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"/":
        self.processSolidusInTag()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||
|
||||
def attributeNameState(self):
    """Accumulate the current attribute's name; on leaving, report
    duplicate attribute names."""
    data = self.stream.char()
    leavingThisState = True
    if data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data in asciiLetters:
        self.currentToken["data"][-1][0] += data +\
          self.stream.charsUntil(asciiLetters, True)
        leavingThisState = False
    elif data == u">":
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
        pass
    elif data in spaceCharacters:
        self.state = self.states["afterAttributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
        leavingThisState = False
    else:
        self.currentToken["data"][-1][0] += data
        leavingThisState = False

    if leavingThisState:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        for name, value in self.currentToken["data"][:-1]:
            if self.currentToken["data"][-1][0] == name:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Dropped duplicate attribute on tag.")})
        # XXX Fix for above XXX
        if data == u">":
            self.emitCurrentToken()
    return True
|
||||
|
||||
def afterAttributeNameState(self):
    """After an attribute name and whitespace: decide between a value
    ("="), another attribute, or the end of the tag."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data == u">":
        self.emitCurrentToken()
    elif data in asciiLetters:
        # A new attribute starts without the previous one having a value.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||
|
||||
def beforeAttributeValueState(self):
    """After "=": decide how the attribute value is delimited."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"\"":
        self.state = self.states["attributeValueDoubleQuoted"]
    elif data == u"&":
        # Push the "&" back so the unquoted state expands the entity.
        self.state = self.states["attributeValueUnQuoted"]
        self.stream.queue.append(data);
    elif data == u"'":
        self.state = self.states["attributeValueSingleQuoted"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"][-1][1] += data
        self.state = self.states["attributeValueUnQuoted"]
    return True
|
||||
|
||||
def attributeValueDoubleQuotedState(self):
    """Tokenize an attribute value delimited by double quotes."""
    # AT We could also let self.attributeValueQuotedStateHandler always
    # return true and then return that directly here. Not sure what is
    # faster or better...
    # The shared handler only needs to know the closing quote character.
    self.attributeValueQuotedStateHandler(u"\"")
    return True
|
||||
|
||||
def attributeValueSingleQuotedState(self):
    """Tokenize an attribute value delimited by single quotes; delegates
    to the shared quoted-value handler."""
    self.attributeValueQuotedStateHandler(u"'")
    return True
|
||||
|
||||
def attributeValueUnQuotedState(self):
    """Tokenize an attribute value that was not opened with a quote."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Consume a whole run of ordinary characters in one call.
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
          frozenset(("&", ">","<")) | spaceCharacters)
    return True
|
||||
|
||||
def bogusCommentState(self):
    """Swallow malformed markup as a Comment token.

    Everything up to the first ">" (or EOF -- charsUntil handles that on
    its own) becomes the comment's data.
    """
    commentData = self.stream.charsUntil(u">")
    self.tokenQueue.append({"type": "Comment", "data": commentData})

    # Also swallow the terminating ">" (or the EOF marker) itself.
    self.stream.char()
    self.state = self.states["data"]
    return True
|
||||
|
||||
def markupDeclarationOpenState(self):
    """After "<!": distinguish a comment ("--") from a doctype
    ("DOCTYPE"); anything else becomes a bogus comment."""
    charStack = [self.stream.char(), self.stream.char()]
    if charStack == [u"-", u"-"]:
        self.currentToken = {"type": "Comment", "data": ""}
        self.state = self.states["comment"]
    else:
        # Read five more characters so seven in total can spell "DOCTYPE".
        for x in xrange(5):
            charStack.append(self.stream.char())
        # Put in explicit EOF check
        if (not EOF in charStack and
            "".join(charStack).upper() == u"DOCTYPE"):
            self.currentToken =\
              {"type": "Doctype", "name": "", "data": True}
            self.state = self.states["doctype"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected '--' or 'DOCTYPE'. Not found.")})
            self.stream.queue.extend(charStack)
            self.state = self.states["bogusComment"]
    return True
|
||||
|
||||
def commentState(self):
    """Accumulate comment text until a "-" (possible comment end) or
    EOF is seen."""
    ch = self.stream.char()
    if ch == u"-":
        self.state = self.states["commentDash"]
    elif ch == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # Grab the whole run up to the next "-" in a single call.
        self.currentToken["data"] += ch + self.stream.charsUntil(u"-")
    return True
|
||||
|
||||
def commentDashState(self):
    """Seen one "-" inside a comment; a second "-" starts the comment
    end, anything else is comment text."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentEnd"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["data"] += u"-" + data +\
          self.stream.charsUntil(u"-")
        # Consume the next character which is either a "-" or an EOF as
        # well so if there's a "-" directly after the "-" we go nicely to
        # the "comment end state" without emitting a ParseError() there.
        self.stream.char()
    return True
|
||||
|
||||
def commentEndState(self):
    """Seen "--" inside a comment; ">" closes it, anything else is an
    error and the text is kept."""
    data = self.stream.char()
    if data == u">":
        # XXX EMIT
        self.emitCurrentToken()
    elif data == u"-":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected '-' after '--' found in comment.")})
        # Stay in this state: "---" keeps one "-" in the data.
        self.currentToken["data"] += data
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # XXX
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in comment found.")})
        self.currentToken["data"] += u"--" + data
        self.state = self.states["comment"]
    return True
|
||||
|
||||
def doctypeState(self):
    """Expect whitespace between the literal "DOCTYPE" and the name."""
    ch = self.stream.char()
    if ch not in spaceCharacters:
        # Recoverable: report the missing space, push the character back,
        # then continue exactly as if the space had been present.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("No space after literal string 'DOCTYPE'.")})
        self.stream.queue.append(ch)
    self.state = self.states["beforeDoctypeName"]
    return True
|
||||
|
||||
def beforeDoctypeNameState(self):
    """Skip whitespace before the doctype name and start collecting it."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data in asciiLowercase:
        # Doctype names are normalised to upper case.
        self.currentToken["name"] = data.upper()
        self.state = self.states["doctypeName"]
    elif data == u">":
        # Character needs to be consumed per the specification so don't
        # invoke emitCurrentTokenWithParseError with "data" as argument.
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["name"] = data
        self.state = self.states["doctypeName"]
    return True
|
||||
|
||||
def doctypeNameState(self):
    """Accumulate the doctype name; a name of "HTML" marks the doctype
    as correct (token "data" False means no error)."""
    data = self.stream.char()
    needsDoctypeCheck = False
    if data in spaceCharacters:
        self.state = self.states["afterDoctypeName"]
        needsDoctypeCheck = True
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if data in asciiLowercase:
            data = data.upper()
        self.currentToken["name"] += data
        needsDoctypeCheck = True

    # After some iterations through this state it should eventually say
    # "HTML". Otherwise there's an error.
    if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
        self.currentToken["data"] = False
    return True
|
||||
|
||||
def afterDoctypeNameState(self):
    """After the doctype name: only whitespace or ">" are valid; anything
    else marks the doctype in error and skips to bogusDoctype."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # Token "data" True flags the doctype as being in error.
        self.currentToken["data"] = True
        # XXX EMIT
        self.emitCurrentTokenWithParseError(data)
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Expected space or '>'. Got '" + data + "'")})
        self.currentToken["data"] = True
        self.state = self.states["bogusDoctype"]
    return True
|
||||
|
||||
def bogusDoctypeState(self):
    """Discard characters of a malformed doctype until ">" closes it."""
    ch = self.stream.char()
    if ch == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif ch == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError(ch)
    # Every other character is silently dropped.
    return True
|
36
planet/html5lib/treebuilders/__init__.py
Executable file
36
planet/html5lib/treebuilders/__init__.py
Executable file
@ -0,0 +1,36 @@
|
||||
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
_base.treebuilders.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing Node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
|
||||
import os.path
|
||||
__path__.append(os.path.dirname(__path__[0]))
|
||||
|
||||
import dom, etree, simpletree
|
312
planet/html5lib/treebuilders/_base.py
Executable file
312
planet/html5lib/treebuilders/_base.py
Executable file
@ -0,0 +1,312 @@
|
||||
from constants import scopingElements, tableInsertModeElements
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = None
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like
|
||||
# rather than DOM-like
|
||||
|
||||
class Node(object):
    """Abstract base class for items in a document tree.

    Instances carry:
      name       - the tag name associated with the node
      parent     - the parent node, or None for the document node
      value      - textual value (applies to text nodes and comments)
      attributes - dict mapping attribute names to values
      childNodes - list of children; must include all elements but not
                   necessarily other node types
      _flags     - list of miscellaneous flags that can be set on the node
    """
    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.attributes = {}
        self.childNodes = []
        self._flags = []

    def __unicode__(self):
        pairs = ["%s=\"%s\"" % (name, value)
                 for name, value in self.attributes.iteritems()]
        if pairs:
            return "<%s %s>" % (self.name, " ".join(pairs))
        else:
            return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s %s>" % (self.__class__, self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node"""
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text.
        """
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in
        the list of child nodes. Raises ValueError if refNode is not a
        child of the current node"""
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node"""
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.

        Needed so that trees that don't store text as nodes still move
        the text in the correct way.
        """
        #XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the
        same name and attributes but with no parent or child nodes"""
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text, false otherwise"""
        raise NotImplementedError
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Base treebuilder implementation
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
"""
|
||||
|
||||
#Document class
|
||||
documentClass = None
|
||||
|
||||
#The class to use for creating a node
|
||||
elementClass = None
|
||||
|
||||
#The class to use for creating comments
|
||||
commentClass = None
|
||||
|
||||
#The class to use for creating doctypes
|
||||
doctypeClass = None
|
||||
|
||||
def __init__(self):
    # All mutable builder state lives in reset() so a builder instance
    # can be reused across documents.
    self.reset()
|
||||
|
||||
def reset(self):
    """Return the builder to its initial, empty state."""
    self.openElements = []
    self.activeFormattingElements = []

    #XXX - rename these to headElement, formElement
    self.headPointer = None
    self.formPointer = None

    # Property: also selects which insertElement implementation is used.
    self.insertFromTable = False

    self.document = self.documentClass()
|
||||
|
||||
def elementInScope(self, target, tableVariant=False):
    """Return True if an element named target is "in scope" on the stack
    of open elements.

    tableVariant - when True, use "in table scope": only "table" and
                   "html" end the search; otherwise any scoping element
                   ends it.
    """
    # Exit early when possible.
    if self.openElements[-1].name == target:
        return True

    # AT Use reverse instead of [::-1] when we can rely on Python 2.4
    # AT How about while True and simply set node to [-1] and set it to
    # [-2] at the end...
    for node in self.openElements[::-1]:
        if node.name == target:
            return True
        elif node.name == "table":
            return False
        elif not tableVariant and node.name in scopingElements:
            return False
        elif node.name == "html":
            return False
    assert False # We should never reach this point
|
||||
|
||||
def reconstructActiveFormattingElements(self):
    """Reopen formatting elements that were implicitly closed, per the
    "reconstruct the active formatting elements" algorithm."""
    # Within this algorithm the order of steps described in the
    # specification is not quite the same as the order of steps in the
    # code. It should still do the same though.

    # Step 1: stop the algorithm when there's nothing to do.
    if not self.activeFormattingElements:
        return

    # Step 2 and step 3: we start with the last element. So i is -1.
    i = -1
    entry = self.activeFormattingElements[i]
    if entry == Marker or entry in self.openElements:
        return

    # Step 6
    while entry != Marker and entry not in self.openElements:
        # Step 5: let entry be one earlier in the list.
        i -= 1
        try:
            entry = self.activeFormattingElements[i]
        except:
            # Step 4: at this point we need to jump to step 8. By not doing
            # i += 1 which is also done in step 7 we achieve that.
            break
    while True:
        # Step 7
        i += 1

        # Step 8
        clone = self.activeFormattingElements[i].cloneNode()

        # Step 9
        element = self.insertElement(clone.name, clone.attributes)

        # Step 10
        self.activeFormattingElements[i] = element

        # Step 11
        if element == self.activeFormattingElements[-1]:
            break
|
||||
|
||||
def clearActiveFormattingElements(self):
    """Pop active-formatting entries up to and including the most recent
    Marker (or the whole list when no Marker is present)."""
    stack = self.activeFormattingElements
    entry = stack.pop()
    while stack and entry != Marker:
        entry = stack.pop()
|
||||
|
||||
def elementInActiveFormattingElements(self, name):
    """Check if an element exists between the end of the active
    formatting elements and the last marker. If it does, return it, else
    return false"""
    for entry in self.activeFormattingElements[::-1]:
        # A Marker ends the searchable region; test it first since a
        # Marker carries no name attribute.
        if entry == Marker:
            break
        if entry.name == name:
            return entry
    return False
|
||||
|
||||
def insertDoctype(self, name):
    """Create a doctype node for `name` and attach it to the document."""
    doctype = self.doctypeClass(name)
    self.document.appendChild(doctype)
|
||||
|
||||
def insertComment(self, data, parent=None):
    """Append a comment node carrying `data`; the target defaults to the
    current (innermost) open element."""
    if parent is None:
        parent = self.openElements[-1]
    comment = self.commentClass(data)
    parent.appendChild(comment)
|
||||
|
||||
def createElement(self, name, attributes):
    """Create an element but don't insert it anywhere."""
    node = self.elementClass(name)
    node.attributes = attributes
    return node
|
||||
|
||||
def _getInsertFromTable(self):
|
||||
return self._insertFromTable
|
||||
|
||||
def _setInsertFromTable(self, value):
|
||||
"""Switch the function used to insert an element from the
|
||||
normal one to the misnested table one and back again"""
|
||||
self._insertFromTable = value
|
||||
if value:
|
||||
self.insertElement = self.insertElementTable
|
||||
else:
|
||||
self.insertElement = self.insertElementNormal
|
||||
|
||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||
|
||||
def insertElementNormal(self, name, attributes):
    """Create an element, append it to the current open element, push it
    on the stack of open elements, and return it."""
    node = self.elementClass(name)
    node.attributes = attributes
    current = self.openElements[-1]
    current.appendChild(node)
    self.openElements.append(node)
    return node
|
||||
|
||||
def insertElementTable(self, name, attributes):
    """Create an element and insert it into the tree, foster-parenting it
    when the current node cannot legally contain it (misnested table
    content per the InTable insertion mode).

    Fix: the original constructed an element unconditionally and then
    discarded it when delegating to insertElementNormal (which builds its
    own); the element is now created only on the table path.
    """
    if self.openElements[-1].name not in tableInsertModeElements:
        return self.insertElementNormal(name, attributes)

    element = self.elementClass(name)
    element.attributes = attributes
    # We should be in the InTable mode. This means we want to do
    # special magic element rearranging (foster parenting).
    parent, insertBefore = self.getTableMisnestedNodePosition()
    if insertBefore is None:
        parent.appendChild(element)
    else:
        parent.insertBefore(element, insertBefore)
    self.openElements.append(element)
    return element
|
||||
|
||||
def insertText(self, data, parent=None):
    """Insert character data at the current insertion point.

    parent defaults to the current open element; when the table
    insertion mode is active and the current node is a table-ish element,
    the text is foster-parented instead.

    Fix: the original condition was the redundant
    `not A or (A and B)`, simplified here to the equivalent `not A or B`.
    """
    if parent is None:
        parent = self.openElements[-1]

    if (not self.insertFromTable or
        self.openElements[-1].name not in tableInsertModeElements):
        parent.insertText(data)
    else:
        # We should be in the InTable mode. This means we want to do
        # special magic element rearranging (foster parenting).
        parent, insertBefore = self.getTableMisnestedNodePosition()
        parent.insertText(data, insertBefore)
|
||||
|
||||
def getTableMisnestedNodePosition(self):
    """Return (fosterParent, insertBefore) for inserting a misnested
    table child; insertBefore is None when the node should simply be
    appended to the foster parent."""
    # The foster parent is determined relative to the most recently
    # opened table element on the stack.
    lastTable = None
    for candidate in self.openElements[::-1]:
        if candidate.name == u"table":
            lastTable = candidate
            break

    if not lastTable:
        # No open table: only legal when parsing an innerHTML fragment,
        # in which case the fragment root fosters the node.
        assert self.innerHTML
        return self.openElements[0], None

    if lastTable.parent:
        # XXX - we should really check that this parent is actually a
        # node here.
        return lastTable.parent, lastTable

    # The table has no parent yet: foster into the element just below it
    # on the stack of open elements.
    position = self.openElements.index(lastTable)
    return self.openElements[position - 1], None
|
||||
|
||||
def generateImpliedEndTags(self, exclude=None):
    """Pop open elements whose end tags are implied (dd, dt, li, p, td,
    th, tr), stopping at the first element outside that set or whose name
    equals `exclude`.

    Fix: iterative rewrite of the original self-recursive version — same
    pop order and stop condition, but constant stack depth.
    """
    impliedEndTags = frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
    while True:
        name = self.openElements[-1].name
        if name not in impliedEndTags or name == exclude:
            break
        self.openElements.pop()
        # XXX Until someone has proven that the plain pop above breaks
        # stuff I think we should keep it (rather than
        # self.processEndTag(name)).
|
||||
|
||||
def getDocument(self):
    """Return the final tree built by this tree builder."""
    return self.document
|
||||
|
||||
def testSerializer(self, node):
|
||||
"""Serialize the subtree of node in the format required by unit tests
|
||||
node - the node from which to start serializing"""
|
||||
raise NotImplementedError
|
127
planet/html5lib/treebuilders/dom.py
Executable file
127
planet/html5lib/treebuilders/dom.py
Executable file
@ -0,0 +1,127 @@
|
||||
import _base
|
||||
from xml.dom import minidom, Node
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
|
||||
class AttrList:
    """Dict-like view over a minidom element's attributes that sanitizes
    values on assignment (illegal XML characters become U+FFFD)."""

    def __init__(self, element):
        self.element = element

    def __iter__(self):
        return iter(self.element.attributes.items())

    def __setitem__(self, name, value):
        # Strip characters that are illegal in XML before storing.
        cleaned = illegal_xml_chars.sub(u'\uFFFD', value)
        self.element.setAttribute(name, cleaned)

    def items(self):
        return self.element.attributes.items()
|
||||
|
||||
class NodeBuilder(_base.Node):
    """Treebuilder node wrapping an xml.dom.minidom node; keeps the
    wrapper's parent pointer in sync with DOM mutations."""

    def __init__(self, element):
        _base.Node.__init__(self, element.nodeName)
        self.element = element

    def appendChild(self, node):
        node.parent = self
        self.element.appendChild(node.element)

    def insertText(self, data, insertBefore=None):
        # Sanitize, then create the text node in the owning document.
        cleaned = illegal_xml_chars.sub(u'\uFFFD', data)
        text = self.element.ownerDocument.createTextNode(cleaned)
        if insertBefore:
            self.element.insertBefore(text, insertBefore.element)
        else:
            self.element.appendChild(text)

    def insertBefore(self, node, refNode):
        self.element.insertBefore(node.element, refNode.element)
        node.parent = self

    def removeChild(self, node):
        self.element.removeChild(node.element)
        node.parent = None

    def reparentChildren(self, newParent):
        # Move every DOM child across, then clear our own child list.
        dom = self.element
        while dom.hasChildNodes():
            first = dom.firstChild
            dom.removeChild(first)
            newParent.element.appendChild(first)
        self.childNodes = []

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                cleaned = illegal_xml_chars.sub(u'\uFFFD', value)
                self.element.setAttribute(name, cleaned)

    attributes = property(getAttributes, setAttributes)

    def cloneNode(self):
        # Shallow clone: the False argument means no children are copied.
        return NodeBuilder(self.element.cloneNode(False))

    def hasContent(self):
        return self.element.hasChildNodes()
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing an xml.dom.minidom document.

    Fix: replaced the deprecated ``<>`` inequality operator with ``!=``
    (``<>`` is a syntax error in Python 3 and deprecated in Python 2).
    """

    def documentClass(self):
        # The builder object itself stands in for the document node; the
        # real minidom document is created here and kept on self.dom.
        self.dom = minidom.getDOMImplementation().createDocument(None, None, None)
        return self

    def doctypeClass(self, name):
        domimpl = minidom.getDOMImplementation()
        return NodeBuilder(domimpl.createDocumentType(name, None, None))

    def elementClass(self, name):
        return NodeBuilder(self.dom.createElement(name))

    def commentClass(self, data):
        return NodeBuilder(self.dom.createComment(data))

    def appendChild(self, node):
        self.dom.appendChild(node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.dom

    def insertText(self, data, parent=None):
        # Replace characters that are illegal in XML.
        data = illegal_xml_chars.sub(u'\uFFFD', data)
        if parent != self:
            _base.TreeBuilder.insertText(self, data, parent)
        else:
            # HACK: allow text nodes as children of the document node
            if hasattr(self.dom, '_child_node_types'):
                if not Node.TEXT_NODE in self.dom._child_node_types:
                    self.dom._child_node_types = list(self.dom._child_node_types)
                    self.dom._child_node_types.append(Node.TEXT_NODE)
            self.dom.appendChild(self.dom.createTextNode(data))
|
||||
|
||||
name = None
|
||||
|
||||
def testSerializer(element):
|
||||
element.normalize()
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
||||
if element.hasAttributes():
|
||||
for name, value in element.attributes.items():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
for child in element.childNodes:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
208
planet/html5lib/treebuilders/etree.py
Executable file
208
planet/html5lib/treebuilders/etree.py
Executable file
@ -0,0 +1,208 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
from elementtree import ElementTree
|
||||
|
||||
import _base
|
||||
|
||||
class Element(_base.Node):
    """Treebuilder node wrapping an ElementTree element.

    ElementTree stores character data as .text (before the first child)
    and .tail (after each child), which is why the text-insertion logic
    below is position-sensitive.
    """

    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []

        #Set the element text and tail to the empty string rather than None
        #XXX - is this desirable or should we do it on a case by case basis?
        self._element.text = ""
        self._element.tail = ""

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    # name proxies the underlying element's tag.
    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first
        #XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        # Replace all children: clear the underlying element, then re-add.
        # NOTE(review): insertChild is not defined in this module —
        # presumably inherited from _base.Node; confirm.
        del self._element[:]
        self._childNodes = []
        for element in value:
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent=None

    def insertText(self, data, insertBefore=None):
        if not(len(self._element)):
            # No child elements yet: text accumulates on .text.
            self._element.text += data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            self._element[-1].tail += data
        else:
            #Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                # Text goes on the tail of the preceding sibling.
                self._element[index-1].tail += data
            else:
                # Inserting before the first child: text goes on .text.
                self._element.text += data

    def cloneNode(self):
        # Shallow clone: name and attributes only, no children.
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        # Move our leading text across first, since the base
        # reparentChildren only moves child nodes, not .text.
        if newParent.childNodes:
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
    """Comment node: an Element whose tag is the Comment class itself and
    whose .text carries the comment data."""

    def __init__(self, data):
        Element.__init__(self, Comment)
        self._element.text = data

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    # data proxies the underlying element's text.
    data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
    """Doctype node: tag is the DocumentType class, .text holds the
    doctype name."""

    def __init__(self, name):
        Element.__init__(self, DocumentType)
        self._element.text = name
|
||||
|
||||
class Document(Element):
    """Document root node, marked by using the Document class as its tag."""

    def __init__(self):
        Element.__init__(self, Document)
|
||||
|
||||
def testSerializer(element):
    """Render an etree node (or one of this module's wrapper roots) in
    the "|"-prefixed format used by the unit tests.

    Fix: the original assigned ``finalText = element.tail`` inside the
    nested function; without ``nonlocal`` (Python 2) that rebinding never
    reached the outer variable, so the document's trailing text was never
    emitted.  A one-element list is used as a mutable cell instead.
    Also uses ``items()``/direct iteration instead of the Python-2-only
    ``iteritems()`` and the deprecated ``getchildren()``.
    """
    rv = []
    finalText = [None]  # mutable cell writable from the nested function

    def serializeElement(element, indent=0):
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            if element.tail:
                finalText[0] = element.tail
        elif element.tag is Comment:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
        else:
            rv.append("|%s<%s>" % (' ' * indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
        indent += 2
        for child in element:
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))

    serializeElement(element, 0)

    if finalText[0] is not None:
        rv.append("|%s\"%s\"" % (' ' * 2, finalText[0]))

    return "\n".join(rv)
|
||||
|
||||
def tostring(element):
    """Serialize an element and its child nodes to a string.

    Fixes two defects in the original:
    - ``finalText`` was rebound inside the nested function, so the
      assignment never reached the outer scope (no ``nonlocal`` in
      Python 2); a one-element list is used as a mutable cell.
    - the trailing append was ``"%s\"" % (' '*2, finalText)`` — a format
      string with one placeholder and two arguments, raising TypeError
      whenever it ran; the trailing text is now appended directly.
    Also iterates elements directly instead of the deprecated
    ``getchildren()`` and uses ``items()`` instead of ``iteritems()``.
    """
    rv = []
    finalText = [None]  # mutable cell writable from the nested function

    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>" % (element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText[0] = element.tail

            for child in element:
                serializeElement(child)

        elif element.tag is Comment:
            rv.append("<!--%s-->" % (element.text,))
        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                    for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

            if element.tail:
                rv.append(element.tail)

    serializeElement(element)

    if finalText[0] is not None:
        rv.append(finalText[0])

    return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing ElementTree trees."""

    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # Unwrap the Document node to hand back the raw ElementTree element.
        return self.document._element
|
153
planet/html5lib/treebuilders/simpletree.py
Executable file
153
planet/html5lib/treebuilders/simpletree.py
Executable file
@ -0,0 +1,153 @@
|
||||
import _base
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
    """Minimal DOM-core-like node; base class for the simpletree types."""

    def __init__(self, name):
        self.name = name          # tag name (None for non-element nodes)
        self.parent = None        # parent Node, set when attached
        self.value = None         # text payload (used by TextNode)
        self.childNodes = []
        self._flags = []

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return "<%s %s>" % (self.__class__, self.name)

    def printTree(self, indent=0):
        # Debug rendering: one "|"-prefixed line per node, indented by two
        # extra spaces per tree level.
        tree = '\n|%s%s' % (' '* indent, unicode(self))
        for child in self.childNodes:
            tree += child.printTree(indent + 2)
        return tree

    def appendChild(self, node, index=None):
        # Adjacent text nodes are merged rather than stored separately.
        if (isinstance(node, TextNode) and self.childNodes and
          isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
        else:
            self.childNodes.append(node)
            node.parent = self

    def insertText(self, data, insertBefore=None):
        if insertBefore is None:
            self.appendChild(TextNode(data))
        else:
            self.insertBefore(TextNode(data), insertBefore)

    def insertBefore(self, node, refNode):
        index = self.childNodes.index(refNode)
        # Merge with the preceding text node where possible, mirroring
        # appendChild.
        if (isinstance(node, TextNode) and index > 0 and
          isinstance(self.childNodes[index - 1], TextNode)):
            self.childNodes[index - 1].value += node.value
        else:
            self.childNodes.insert(index, node)
            node.parent = self

    def removeChild(self, node):
        try:
            self.childNodes.remove(node)
        except:
            # XXX
            raise
        node.parent = None

    def cloneNode(self):
        # NOTE(review): reads self.attributes, which only Element defines —
        # presumably only element nodes are ever cloned; confirm.
        newNode = type(self)(self.name)
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        newNode.value = self.value
        return newNode

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self.childNodes)
|
||||
|
||||
class Document(Node):
    """Root node of a simpletree document."""

    def __init__(self):
        Node.__init__(self, None)

    def __unicode__(self):
        return "#document"

    def printTree(self):
        tree = unicode(self)
        for child in self.childNodes:
            tree += child.printTree(2)
        return tree

    def toxml(self, encoding="utf-8"):
        """Serialize the document to a byte string.

        Fix: the default encoding was the typo "utf=8" — it only worked
        because Python's codec lookup normalizes punctuation to "utf_8".
        """
        result = ''
        for child in self.childNodes:
            result += child.toxml()
        return result.encode(encoding)
|
||||
|
||||
class DocumentType(Node):
    """Doctype node; .name holds the doctype name."""

    def __init__(self, name):
        Node.__init__(self, name)

    def __unicode__(self):
        return "<!DOCTYPE %s>" % self.name
|
||||
|
||||
class TextNode(Node):
    """Character-data node; the payload lives in .value."""

    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        return "\"%s\"" % self.value

    def toxml(self):
        # Escape markup-significant characters on serialization.
        return escape(self.value)
|
||||
|
||||
class Element(Node):
    """Element node with a tag name and an attribute dictionary."""

    def __init__(self, name):
        Node.__init__(self, name)
        self.attributes = {}

    def __unicode__(self):
        return "<%s>" % self.name

    def printTree(self, indent):
        tree = '\n|%s%s' % (' '*indent, unicode(self))
        indent += 2
        if self.attributes:
            for name, value in self.attributes.iteritems():
                tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def toxml(self):
        """Serialize this element and its subtree to XML.

        Fix: attribute values are emitted inside double quotes, so '"'
        must be escaped; the original mapped '"' to itself (a no-op,
        almost certainly an entity-mangled '&quot;'), producing broken
        markup for any attribute containing a double quote.
        """
        result = '<' + self.name
        if self.attributes:
            for name, value in self.attributes.iteritems():
                # escape() handles &, <, >; the extra map covers '"'.
                result += ' %s="%s"' % (name, escape(value, {'"': '&quot;'}))
        if self.childNodes:
            result += '>'
            for child in self.childNodes:
                result += child.toxml()
            result += '</%s>' % self.name
        else:
            result += '/>'
        return result
|
||||
|
||||
class CommentNode(Node):
    """Comment node; .data is the comment text (not escaped on output)."""

    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data

    def __unicode__(self):
        return "<!-- %s -->" % self.data

    toxml = __unicode__
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing simpletree documents."""

    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode

    def testSerializer(self, node):
        return node.printTree()
|
36
planet/html5lib/utils.py
Normal file
36
planet/html5lib/utils.py
Normal file
@ -0,0 +1,36 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
#Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        # Build the flattened entry list first and hand it to
        # dict.__init__ in one go: measured at roughly twice the speed of
        # assigning keys one at a time. Please do careful performance
        # testing before changing anything here.
        entries = []
        for key, value in items:
            if type(key) in (list, tuple, frozenset, set):
                # Multi-key entry: register every member separately.
                for alias in key:
                    entries.append((alias, value))
            else:
                entries.append((key, value))
        dict.__init__(self, entries)
        self.default = None

    def __getitem__(self, key):
        # Missing keys fall back to self.default instead of raising.
        return dict.get(self, key, self.default)
|
@ -15,9 +15,9 @@ Todo:
|
||||
"""
|
||||
import re, time, md5, sgmllib
|
||||
from xml.sax.saxutils import escape
|
||||
from xml.dom import minidom
|
||||
from xml.dom import minidom, Node
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from xml.parsers.expat import ExpatError
|
||||
from planet.html5lib import liberalxmlparser, treebuilders
|
||||
import planet, config
|
||||
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -59,22 +59,6 @@ def cssid(name):
|
||||
name = nonalpha.sub('-',name).lower()
|
||||
return name.strip('-')
|
||||
|
||||
def normalize(text, bozo):
    """ convert everything to well formed XML """
    # NOTE(review): this is the pre-html5lib implementation built on
    # BeautifulSoup, shown being removed by the surrounding commit.
    if text.has_key('type'):
        if text.type.lower().find('html')<0:
            # Non-HTML content: escape it and re-label as HTML.
            text['value'] = escape(text.value)
            text['type'] = 'text/html'
        if text.type.lower() == 'text/html' or bozo:
            # Tag soup (or content from a bozo feed): run it through
            # BeautifulSoup and scrub attribute values of numeric
            # character references and XML-illegal characters.
            dom=BeautifulSoup(text.value,convertEntities="html")
            for tag in dom.findAll(True):
                for attr,value in tag.attrs:
                    value=sgmllib.charref.sub(ncr2c,value)
                    value=illegal_xml_chars.sub(u'\uFFFD',value)
                    tag[attr]=value
            text['value'] = illegal_xml_chars.sub(invalidate, str(dom))
    return text
|
||||
|
||||
def id(xentry, entry):
|
||||
""" copy or compute an id for the entry """
|
||||
|
||||
@ -150,27 +134,32 @@ def author(xentry, name, detail):
|
||||
def content(xentry, name, detail, bozo):
|
||||
""" insert a content-like element into the entry """
|
||||
if not detail or not detail.value: return
|
||||
normalize(detail, bozo)
|
||||
|
||||
data = None
|
||||
xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||
xdoc = xentry.ownerDocument
|
||||
xcontent = xdoc.createElement(name)
|
||||
if isinstance(detail.value,unicode):
|
||||
detail.value=detail.value.encode('utf-8')
|
||||
|
||||
try:
|
||||
# see if the resulting text is a well-formed XML fragment
|
||||
div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||
if isinstance(detail.value,unicode):
|
||||
detail.value=detail.value.encode('utf-8')
|
||||
data = minidom.parseString(div % detail.value).documentElement
|
||||
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||
for body in html.documentElement.childNodes:
|
||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||
if body.nodeName != 'body': continue
|
||||
for div in body.childNodes:
|
||||
if div.nodeType != Node.ELEMENT_NODE: continue
|
||||
if div.nodeName != 'div': continue
|
||||
div.normalize()
|
||||
if len(div.childNodes) == 1 and \
|
||||
div.firstChild.nodeType == Node.TEXT_NODE:
|
||||
data = div.firstChild
|
||||
else:
|
||||
data = div
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
break
|
||||
|
||||
if detail.value.find('<') < 0:
|
||||
xcontent.appendChild(data.firstChild)
|
||||
else:
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
xcontent.appendChild(data)
|
||||
|
||||
except ExpatError:
|
||||
# leave as html
|
||||
xcontent.setAttribute('type', 'html')
|
||||
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
|
||||
if data: xcontent.appendChild(data)
|
||||
|
||||
if detail.get("language"):
|
||||
xcontent.setAttribute('xml:lang', detail.language)
|
||||
|
@ -1,6 +1,6 @@
|
||||
<!--
|
||||
Description: illegal control character
|
||||
Expect: content[0].value == u'Page 1<acronym title="U+000c">\ufffd</acronym>Page 2'
|
||||
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
||||
-->
|
||||
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
|
Loading…
x
Reference in New Issue
Block a user