Update to latest html5lib; move packaged dependencies to vendor directory

This commit is contained in:
Sam Ruby 2007-06-25 10:49:51 -04:00
parent 65e41f7b22
commit fc90da7fc0
49 changed files with 2883 additions and 800 deletions

View File

@ -1,5 +1,5 @@
import sys
from planet import html5lib
import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)

View File

@ -23,8 +23,9 @@ from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
from planet import config, feedparser
from planet import config
from planet.spider import filename
import feedparser
log = planet.logger
options = config.filter_options(sys.argv[0])

View File

@ -32,7 +32,9 @@ def getLogger(level, format):
loggerParms = (level,format)
return logger
sys.path.append(os.path.join(os.path.dirname(__file__),'vendor'))
# Configure feed parser
from planet import feedparser
import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0

View File

@ -1,42 +0,0 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to do
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
import os.path
__path__.append(os.path.dirname(__path__[0]))
import dom
import simpletree
try:
import etree
except:
pass

View File

@ -1,5 +0,0 @@
import etreefull
class TreeBuilder(etreefull.TreeBuilder):
def getDocument(self):
return self.document._element.find("html")

View File

@ -1,227 +0,0 @@
try:
from xml.etree import ElementTree
except ImportError:
try:
from elementtree import ElementTree
except:
pass
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
Element.__init__(self, None)
self._element = ElementTree.Comment(data)
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, DocumentFragment)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is ElementTree.Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element

View File

@ -16,7 +16,8 @@ Todo:
import re, time, md5, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
from planet.html5lib import liberalxmlparser, treebuilders
from html5lib import liberalxmlparser
from html5lib.treebuilders import dom
import planet, config
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -154,7 +155,7 @@ def content(xentry, name, detail, bozo):
data = minidom.parseString(xdiv % detail.value).documentElement
xcontent.setAttribute('type', 'xhtml')
else:
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue

View File

@ -1,6 +1,7 @@
from xml.sax.saxutils import escape
import sgmllib, time, os, sys, new, urlparse, re
from planet import config, feedparser, htmltmpl
from planet import config, feedparser
import htmltmpl
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))

View File

@ -340,7 +340,7 @@ def spiderPlanet(only_if_new = False):
log.info("Socket timeout set to %d seconds", timeout)
except:
try:
from planet import timeoutsocket
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(float(timeout))
log.info("Socket timeout set to %d seconds", timeout)
except:

View File

@ -119,8 +119,8 @@ spaceCharacters = frozenset((
tableInsertModeElements = frozenset((
"table",
"tbody",
"tfoot",
"thead",
"tfoot",
"thead",
"tr"
))
@ -133,7 +133,7 @@ hexDigits = frozenset(string.hexdigits)
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
for c in string.ascii_uppercase])
# Heading elements need to be ordered
# Heading elements need to be ordered
headingElements = (
"h1",
"h2",
@ -158,6 +158,38 @@ voidElements = frozenset((
"input"
))
cdataElements = frozenset(('title', 'textarea'))
rcdataElements = frozenset((
'style',
'script',
'xmp',
'iframe',
'noembed',
'noframes',
'noscript'
))
booleanAttributes = {
"": frozenset(("irrelevant",)),
"style": frozenset(("scoped",)),
"img": frozenset(("ismap",)),
"audio": frozenset(("autoplay","controls")),
"video": frozenset(("autoplay","controls")),
"script": frozenset(("defer", "async")),
"details": frozenset(("open",)),
"datagrid": frozenset(("multiple", "disabled")),
"command": frozenset(("hidden", "disabled", "checked", "default")),
"menu": frozenset(("autosubmit",)),
"fieldset": frozenset(("disabled", "readonly")),
"option": frozenset(("disabled", "readonly", "selected")),
"optgroup": frozenset(("disabled", "readonly")),
"button": frozenset(("disabled", "autofocus")),
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
"output": frozenset(("disabled", "readonly")),
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
entitiesWindows1252 = (
@ -196,265 +228,372 @@ entitiesWindows1252 = (
)
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
"Aacute": u"\u00C1",
"Acirc": u"\u00C2",
"Agrave": u"\u00C0",
"Alpha": u"\u0391",
"Aring": u"\u00C5",
"Atilde": u"\u00C3",
"Auml": u"\u00C4",
"Beta": u"\u0392",
"Ccedil": u"\u00C7",
"Chi": u"\u03A7",
"Dagger": u"\u2021",
"Delta": u"\u0394",
"ETH": u"\u00D0",
"Eacute": u"\u00C9",
"Ecirc": u"\u00CA",
"Egrave": u"\u00C8",
"Epsilon": u"\u0395",
"Eta": u"\u0397",
"Euml": u"\u00CB",
"Gamma": u"\u0393",
"Iacute": u"\u00CD",
"Icirc": u"\u00CE",
"Igrave": u"\u00CC",
"Iota": u"\u0399",
"Iuml": u"\u00CF",
"Kappa": u"\u039A",
"Lambda": u"\u039B",
"Mu": u"\u039C",
"Ntilde": u"\u00D1",
"Nu": u"\u039D",
"OElig": u"\u0152",
"Oacute": u"\u00D3",
"Ocirc": u"\u00D4",
"Ograve": u"\u00D2",
"Omega": u"\u03A9",
"Omicron": u"\u039F",
"Oslash": u"\u00D8",
"Otilde": u"\u00D5",
"Ouml": u"\u00D6",
"Phi": u"\u03A6",
"Pi": u"\u03A0",
"Prime": u"\u2033",
"Psi": u"\u03A8",
"Rho": u"\u03A1",
"Scaron": u"\u0160",
"Sigma": u"\u03A3",
"THORN": u"\u00DE",
"Tau": u"\u03A4",
"Theta": u"\u0398",
"Uacute": u"\u00DA",
"Ucirc": u"\u00DB",
"Ugrave": u"\u00D9",
"Upsilon": u"\u03A5",
"Uuml": u"\u00DC",
"Xi": u"\u039E",
"Yacute": u"\u00DD",
"Yuml": u"\u0178",
"Zeta": u"\u0396",
"aacute": u"\u00E1",
"acirc": u"\u00E2",
"acute": u"\u00B4",
"aelig": u"\u00E6",
"agrave": u"\u00E0",
"alefsym": u"\u2135",
"alpha": u"\u03B1",
"amp": u"\u0026",
"AMP;": u"\u0026",
"AMP": u"\u0026",
"and": u"\u2227",
"ang": u"\u2220",
"apos": u"\u0027",
"aring": u"\u00E5",
"asymp": u"\u2248",
"atilde": u"\u00E3",
"auml": u"\u00E4",
"bdquo": u"\u201E",
"beta": u"\u03B2",
"brvbar": u"\u00A6",
"bull": u"\u2022",
"cap": u"\u2229",
"ccedil": u"\u00E7",
"cedil": u"\u00B8",
"cent": u"\u00A2",
"chi": u"\u03C7",
"circ": u"\u02C6",
"clubs": u"\u2663",
"cong": u"\u2245",
"copy": u"\u00A9",
"Aacute;": u"\u00C1",
"Aacute": u"\u00C1",
"Acirc;": u"\u00C2",
"Acirc": u"\u00C2",
"Agrave;": u"\u00C0",
"Agrave": u"\u00C0",
"Alpha;": u"\u0391",
"Aring;": u"\u00C5",
"Aring": u"\u00C5",
"Atilde;": u"\u00C3",
"Atilde": u"\u00C3",
"Auml;": u"\u00C4",
"Auml": u"\u00C4",
"Beta;": u"\u0392",
"COPY;": u"\u00A9",
"COPY": u"\u00A9",
"crarr": u"\u21B5",
"cup": u"\u222A",
"curren": u"\u00A4",
"dArr": u"\u21D3",
"dagger": u"\u2020",
"darr": u"\u2193",
"deg": u"\u00B0",
"delta": u"\u03B4",
"diams": u"\u2666",
"divide": u"\u00F7",
"eacute": u"\u00E9",
"ecirc": u"\u00EA",
"egrave": u"\u00E8",
"empty": u"\u2205",
"emsp": u"\u2003",
"ensp": u"\u2002",
"epsilon": u"\u03B5",
"equiv": u"\u2261",
"eta": u"\u03B7",
"eth": u"\u00F0",
"euml": u"\u00EB",
"euro": u"\u20AC",
"exist": u"\u2203",
"fnof": u"\u0192",
"forall": u"\u2200",
"frac12": u"\u00BD",
"frac14": u"\u00BC",
"frac34": u"\u00BE",
"frasl": u"\u2044",
"gamma": u"\u03B3",
"ge": u"\u2265",
"gt": u"\u003E",
"Ccedil;": u"\u00C7",
"Ccedil": u"\u00C7",
"Chi;": u"\u03A7",
"Dagger;": u"\u2021",
"Delta;": u"\u0394",
"ETH;": u"\u00D0",
"ETH": u"\u00D0",
"Eacute;": u"\u00C9",
"Eacute": u"\u00C9",
"Ecirc;": u"\u00CA",
"Ecirc": u"\u00CA",
"Egrave;": u"\u00C8",
"Egrave": u"\u00C8",
"Epsilon;": u"\u0395",
"Eta;": u"\u0397",
"Euml;": u"\u00CB",
"Euml": u"\u00CB",
"GT;": u"\u003E",
"GT": u"\u003E",
"hArr": u"\u21D4",
"harr": u"\u2194",
"hearts": u"\u2665",
"hellip": u"\u2026",
"iacute": u"\u00ED",
"icirc": u"\u00EE",
"iexcl": u"\u00A1",
"igrave": u"\u00EC",
"image": u"\u2111",
"infin": u"\u221E",
"int": u"\u222B",
"iota": u"\u03B9",
"iquest": u"\u00BF",
"isin": u"\u2208",
"iuml": u"\u00EF",
"kappa": u"\u03BA",
"lArr": u"\u21D0",
"lambda": u"\u03BB",
"lang": u"\u2329",
"laquo": u"\u00AB",
"larr": u"\u2190",
"lceil": u"\u2308",
"ldquo": u"\u201C",
"le": u"\u2264",
"lfloor": u"\u230A",
"lowast": u"\u2217",
"loz": u"\u25CA",
"lrm": u"\u200E",
"lsaquo": u"\u2039",
"lsquo": u"\u2018",
"lt": u"\u003C",
"Gamma;": u"\u0393",
"Iacute;": u"\u00CD",
"Iacute": u"\u00CD",
"Icirc;": u"\u00CE",
"Icirc": u"\u00CE",
"Igrave;": u"\u00CC",
"Igrave": u"\u00CC",
"Iota;": u"\u0399",
"Iuml;": u"\u00CF",
"Iuml": u"\u00CF",
"Kappa;": u"\u039A",
"LT;": u"\u003C",
"LT": u"\u003C",
"macr": u"\u00AF",
"mdash": u"\u2014",
"micro": u"\u00B5",
"middot": u"\u00B7",
"minus": u"\u2212",
"mu": u"\u03BC",
"nabla": u"\u2207",
"nbsp": u"\u00A0",
"ndash": u"\u2013",
"ne": u"\u2260",
"ni": u"\u220B",
"not": u"\u00AC",
"notin": u"\u2209",
"nsub": u"\u2284",
"ntilde": u"\u00F1",
"nu": u"\u03BD",
"oacute": u"\u00F3",
"ocirc": u"\u00F4",
"oelig": u"\u0153",
"ograve": u"\u00F2",
"oline": u"\u203E",
"omega": u"\u03C9",
"omicron": u"\u03BF",
"oplus": u"\u2295",
"or": u"\u2228",
"ordf": u"\u00AA",
"ordm": u"\u00BA",
"oslash": u"\u00F8",
"otilde": u"\u00F5",
"otimes": u"\u2297",
"ouml": u"\u00F6",
"para": u"\u00B6",
"part": u"\u2202",
"permil": u"\u2030",
"perp": u"\u22A5",
"phi": u"\u03C6",
"pi": u"\u03C0",
"piv": u"\u03D6",
"plusmn": u"\u00B1",
"pound": u"\u00A3",
"prime": u"\u2032",
"prod": u"\u220F",
"prop": u"\u221D",
"psi": u"\u03C8",
"quot": u"\u0022",
"Lambda;": u"\u039B",
"Mu;": u"\u039C",
"Ntilde;": u"\u00D1",
"Ntilde": u"\u00D1",
"Nu;": u"\u039D",
"OElig;": u"\u0152",
"Oacute;": u"\u00D3",
"Oacute": u"\u00D3",
"Ocirc;": u"\u00D4",
"Ocirc": u"\u00D4",
"Ograve;": u"\u00D2",
"Ograve": u"\u00D2",
"Omega;": u"\u03A9",
"Omicron;": u"\u039F",
"Oslash;": u"\u00D8",
"Oslash": u"\u00D8",
"Otilde;": u"\u00D5",
"Otilde": u"\u00D5",
"Ouml;": u"\u00D6",
"Ouml": u"\u00D6",
"Phi;": u"\u03A6",
"Pi;": u"\u03A0",
"Prime;": u"\u2033",
"Psi;": u"\u03A8",
"QUOT;": u"\u0022",
"QUOT": u"\u0022",
"rArr": u"\u21D2",
"radic": u"\u221A",
"rang": u"\u232A",
"raquo": u"\u00BB",
"rarr": u"\u2192",
"rceil": u"\u2309",
"rdquo": u"\u201D",
"real": u"\u211C",
"reg": u"\u00AE",
"REG;": u"\u00AE",
"REG": u"\u00AE",
"rfloor": u"\u230B",
"rho": u"\u03C1",
"rlm": u"\u200F",
"rsaquo": u"\u203A",
"rsquo": u"\u2019",
"sbquo": u"\u201A",
"scaron": u"\u0161",
"sdot": u"\u22C5",
"Rho;": u"\u03A1",
"Scaron;": u"\u0160",
"Sigma;": u"\u03A3",
"THORN;": u"\u00DE",
"THORN": u"\u00DE",
"TRADE;": u"\u2122",
"Tau;": u"\u03A4",
"Theta;": u"\u0398",
"Uacute;": u"\u00DA",
"Uacute": u"\u00DA",
"Ucirc;": u"\u00DB",
"Ucirc": u"\u00DB",
"Ugrave;": u"\u00D9",
"Ugrave": u"\u00D9",
"Upsilon;": u"\u03A5",
"Uuml;": u"\u00DC",
"Uuml": u"\u00DC",
"Xi;": u"\u039E",
"Yacute;": u"\u00DD",
"Yacute": u"\u00DD",
"Yuml;": u"\u0178",
"Zeta;": u"\u0396",
"aacute;": u"\u00E1",
"aacute": u"\u00E1",
"acirc;": u"\u00E2",
"acirc": u"\u00E2",
"acute;": u"\u00B4",
"acute": u"\u00B4",
"aelig;": u"\u00E6",
"aelig": u"\u00E6",
"agrave;": u"\u00E0",
"agrave": u"\u00E0",
"alefsym;": u"\u2135",
"alpha;": u"\u03B1",
"amp;": u"\u0026",
"amp": u"\u0026",
"and;": u"\u2227",
"ang;": u"\u2220",
"apos;": u"\u0027",
"aring;": u"\u00E5",
"aring": u"\u00E5",
"asymp;": u"\u2248",
"atilde;": u"\u00E3",
"atilde": u"\u00E3",
"auml;": u"\u00E4",
"auml": u"\u00E4",
"bdquo;": u"\u201E",
"beta;": u"\u03B2",
"brvbar;": u"\u00A6",
"brvbar": u"\u00A6",
"bull;": u"\u2022",
"cap;": u"\u2229",
"ccedil;": u"\u00E7",
"ccedil": u"\u00E7",
"cedil;": u"\u00B8",
"cedil": u"\u00B8",
"cent;": u"\u00A2",
"cent": u"\u00A2",
"chi;": u"\u03C7",
"circ;": u"\u02C6",
"clubs;": u"\u2663",
"cong;": u"\u2245",
"copy;": u"\u00A9",
"copy": u"\u00A9",
"crarr;": u"\u21B5",
"cup;": u"\u222A",
"curren;": u"\u00A4",
"curren": u"\u00A4",
"dArr;": u"\u21D3",
"dagger;": u"\u2020",
"darr;": u"\u2193",
"deg;": u"\u00B0",
"deg": u"\u00B0",
"delta;": u"\u03B4",
"diams;": u"\u2666",
"divide;": u"\u00F7",
"divide": u"\u00F7",
"eacute;": u"\u00E9",
"eacute": u"\u00E9",
"ecirc;": u"\u00EA",
"ecirc": u"\u00EA",
"egrave;": u"\u00E8",
"egrave": u"\u00E8",
"empty;": u"\u2205",
"emsp;": u"\u2003",
"ensp;": u"\u2002",
"epsilon;": u"\u03B5",
"equiv;": u"\u2261",
"eta;": u"\u03B7",
"eth;": u"\u00F0",
"eth": u"\u00F0",
"euml;": u"\u00EB",
"euml": u"\u00EB",
"euro;": u"\u20AC",
"exist;": u"\u2203",
"fnof;": u"\u0192",
"forall;": u"\u2200",
"frac12;": u"\u00BD",
"frac12": u"\u00BD",
"frac14;": u"\u00BC",
"frac14": u"\u00BC",
"frac34;": u"\u00BE",
"frac34": u"\u00BE",
"frasl;": u"\u2044",
"gamma;": u"\u03B3",
"ge;": u"\u2265",
"gt;": u"\u003E",
"gt": u"\u003E",
"hArr;": u"\u21D4",
"harr;": u"\u2194",
"hearts;": u"\u2665",
"hellip;": u"\u2026",
"iacute;": u"\u00ED",
"iacute": u"\u00ED",
"icirc;": u"\u00EE",
"icirc": u"\u00EE",
"iexcl;": u"\u00A1",
"iexcl": u"\u00A1",
"igrave;": u"\u00EC",
"igrave": u"\u00EC",
"image;": u"\u2111",
"infin;": u"\u221E",
"int;": u"\u222B",
"iota;": u"\u03B9",
"iquest;": u"\u00BF",
"iquest": u"\u00BF",
"isin;": u"\u2208",
"iuml;": u"\u00EF",
"iuml": u"\u00EF",
"kappa;": u"\u03BA",
"lArr;": u"\u21D0",
"lambda;": u"\u03BB",
"lang;": u"\u3008",
"laquo;": u"\u00AB",
"laquo": u"\u00AB",
"larr;": u"\u2190",
"lceil;": u"\u2308",
"ldquo;": u"\u201C",
"le;": u"\u2264",
"lfloor;": u"\u230A",
"lowast;": u"\u2217",
"loz;": u"\u25CA",
"lrm;": u"\u200E",
"lsaquo;": u"\u2039",
"lsquo;": u"\u2018",
"lt;": u"\u003C",
"lt": u"\u003C",
"macr;": u"\u00AF",
"macr": u"\u00AF",
"mdash;": u"\u2014",
"micro;": u"\u00B5",
"micro": u"\u00B5",
"middot;": u"\u00B7",
"middot": u"\u00B7",
"minus;": u"\u2212",
"mu;": u"\u03BC",
"nabla;": u"\u2207",
"nbsp;": u"\u00A0",
"nbsp": u"\u00A0",
"ndash;": u"\u2013",
"ne;": u"\u2260",
"ni;": u"\u220B",
"not;": u"\u00AC",
"not": u"\u00AC",
"notin;": u"\u2209",
"nsub;": u"\u2284",
"ntilde;": u"\u00F1",
"ntilde": u"\u00F1",
"nu;": u"\u03BD",
"oacute;": u"\u00F3",
"oacute": u"\u00F3",
"ocirc;": u"\u00F4",
"ocirc": u"\u00F4",
"oelig;": u"\u0153",
"ograve;": u"\u00F2",
"ograve": u"\u00F2",
"oline;": u"\u203E",
"omega;": u"\u03C9",
"omicron;": u"\u03BF",
"oplus;": u"\u2295",
"or;": u"\u2228",
"ordf;": u"\u00AA",
"ordf": u"\u00AA",
"ordm;": u"\u00BA",
"ordm": u"\u00BA",
"oslash;": u"\u00F8",
"oslash": u"\u00F8",
"otilde;": u"\u00F5",
"otilde": u"\u00F5",
"otimes;": u"\u2297",
"ouml;": u"\u00F6",
"ouml": u"\u00F6",
"para;": u"\u00B6",
"para": u"\u00B6",
"part;": u"\u2202",
"permil;": u"\u2030",
"perp;": u"\u22A5",
"phi;": u"\u03C6",
"pi;": u"\u03C0",
"piv;": u"\u03D6",
"plusmn;": u"\u00B1",
"plusmn": u"\u00B1",
"pound;": u"\u00A3",
"pound": u"\u00A3",
"prime;": u"\u2032",
"prod;": u"\u220F",
"prop;": u"\u221D",
"psi;": u"\u03C8",
"quot;": u"\u0022",
"quot": u"\u0022",
"rArr;": u"\u21D2",
"radic;": u"\u221A",
"rang;": u"\u3009",
"raquo;": u"\u00BB",
"raquo": u"\u00BB",
"rarr;": u"\u2192",
"rceil;": u"\u2309",
"rdquo;": u"\u201D",
"real;": u"\u211C",
"reg;": u"\u00AE",
"reg": u"\u00AE",
"rfloor;": u"\u230B",
"rho;": u"\u03C1",
"rlm;": u"\u200F",
"rsaquo;": u"\u203A",
"rsquo;": u"\u2019",
"sbquo;": u"\u201A",
"scaron;": u"\u0161",
"sdot;": u"\u22C5",
"sect;": u"\u00A7",
"sect": u"\u00A7",
"shy;": u"\u00AD",
"shy": u"\u00AD",
"sigma": u"\u03C3",
"sigmaf": u"\u03C2",
"sim": u"\u223C",
"spades": u"\u2660",
"sub": u"\u2282",
"sube": u"\u2286",
"sum": u"\u2211",
"sup": u"\u2283",
"sigma;": u"\u03C3",
"sigmaf;": u"\u03C2",
"sim;": u"\u223C",
"spades;": u"\u2660",
"sub;": u"\u2282",
"sube;": u"\u2286",
"sum;": u"\u2211",
"sup1;": u"\u00B9",
"sup1": u"\u00B9",
"sup2;": u"\u00B2",
"sup2": u"\u00B2",
"sup3;": u"\u00B3",
"sup3": u"\u00B3",
"supe": u"\u2287",
"sup;": u"\u2283",
"supe;": u"\u2287",
"szlig;": u"\u00DF",
"szlig": u"\u00DF",
"tau": u"\u03C4",
"there4": u"\u2234",
"theta": u"\u03B8",
"thetasym": u"\u03D1",
"thinsp": u"\u2009",
"tau;": u"\u03C4",
"there4;": u"\u2234",
"theta;": u"\u03B8",
"thetasym;": u"\u03D1",
"thinsp;": u"\u2009",
"thorn;": u"\u00FE",
"thorn": u"\u00FE",
"tilde": u"\u02DC",
"tilde;": u"\u02DC",
"times;": u"\u00D7",
"times": u"\u00D7",
"trade": u"\u2122",
"uArr": u"\u21D1",
"trade;": u"\u2122",
"uArr;": u"\u21D1",
"uacute;": u"\u00FA",
"uacute": u"\u00FA",
"uarr": u"\u2191",
"uarr;": u"\u2191",
"ucirc;": u"\u00FB",
"ucirc": u"\u00FB",
"ugrave;": u"\u00F9",
"ugrave": u"\u00F9",
"uml;": u"\u00A8",
"uml": u"\u00A8",
"upsih": u"\u03D2",
"upsilon": u"\u03C5",
"upsih;": u"\u03D2",
"upsilon;": u"\u03C5",
"uuml;": u"\u00FC",
"uuml": u"\u00FC",
"weierp": u"\u2118",
"xi": u"\u03BE",
"weierp;": u"\u2118",
"xi;": u"\u03BE",
"yacute;": u"\u00FD",
"yacute": u"\u00FD",
"yen;": u"\u00A5",
"yen": u"\u00A5",
"yuml;": u"\u00FF",
"yuml": u"\u00FF",
"zeta": u"\u03B6",
"zwj": u"\u200D",
"zwnj": u"\u200C"
"zeta;": u"\u03B6",
"zwj;": u"\u200D",
"zwnj;": u"\u200C"
}
encodings = frozenset((

View File

10
planet/vendor/html5lib/filters/_base.py vendored Normal file
View File

@ -0,0 +1,10 @@
class Filter(object):
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

View File

@ -0,0 +1,63 @@
import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == "meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
content_index = -1
for i,(name,value) in enumerate(token["data"]):
if name.lower() == 'charset':
token["data"][i] = (u'charset', self.encoding)
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
elif name == 'content':
content_index = i
else:
if has_http_equiv_content_type and content_index >= 0:
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": "head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
yield {"type": "EndTag", "name": "head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == "head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token

90
planet/vendor/html5lib/filters/lint.py vendored Normal file
View File

@ -0,0 +1,90 @@
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class LintError(Exception): pass
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, unicode):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty attribute name"))
if not isinstance(value, unicode):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag":
name = token["name"]
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag"))
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, unicode):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
raise LintError(_(u"%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_(u"Unknown token type: %s") % type)
yield token

View File

@ -0,0 +1,175 @@
import _base
class Filter(_base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if token["data"] or not self.is_optional_start(token["name"], previous, next):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname in 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == "StartTag"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type == "StartTag":
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
    """Return True if the end tag for *tagname* may be omitted from the
    serialized output, following the HTML5 optional-tags rules.

    tagname -- lowercase name of the element whose end tag is considered
    next    -- the token event that follows this end tag (a dict with at
               least a "type" key and, for tag tokens, a "name" key), or
               a false value at the end of the stream

    NOTE(review): the parameter ``next`` shadows the builtin of the same
    name; renaming it would change the keyword-argument interface, so it
    is left as-is.
    """
    # Token type of the following event, or None at end of stream.
    type = next and next["type"] or None
    if tagname in ('html', 'head', 'body'):
        # An html element's end tag may be omitted if the html element
        # is not immediately followed by a space character or a comment.
        return type not in ("Comment", "SpaceCharacters")
    elif tagname in ('li', 'optgroup', 'option', 'tr'):
        # A li element's end tag may be omitted if the li element is
        # immediately followed by another li element or if there is
        # no more content in the parent element.
        # An optgroup element's end tag may be omitted if the optgroup
        # element is immediately followed by another optgroup element,
        # or if there is no more content in the parent element.
        # An option element's end tag may be omitted if the option
        # element is immediately followed by another option element,
        # or if there is no more content in the parent element.
        # A tr element's end tag may be omitted if the tr element is
        # immediately followed by another tr element, or if there is
        # no more content in the parent element.
        if type == "StartTag":
            return next["name"] == tagname
        else:
            return type == "EndTag" or type is None
    elif tagname in ('dt', 'dd'):
        # A dt element's end tag may be omitted if the dt element is
        # immediately followed by another dt element or a dd element.
        # A dd element's end tag may be omitted if the dd element is
        # immediately followed by another dd element or a dt element,
        # or if there is no more content in the parent element.
        if type == "StartTag":
            return next["name"] in ('dt', 'dd')
        elif tagname == 'dd':
            # Only dd (not dt) may also omit its end tag when the
            # parent element has no more content.
            return type == "EndTag" or type is None
        else:
            return False
    elif tagname == 'p':
        # A p element's end tag may be omitted if the p element is
        # immediately followed by an address, blockquote, dl, fieldset,
        # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
        # or ul element, or if there is no more content in the parent
        # element.
        if type == "StartTag":
            return next["name"] in ('address', 'blockquote', \
                'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
                'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
        else:
            return type == "EndTag" or type is None
    elif tagname == 'colgroup':
        # A colgroup element's end tag may be omitted if the colgroup
        # element is not immediately followed by a space character or
        # a comment.
        if type in ("Comment", "SpaceCharacters"):
            return False
        elif type == "StartTag":
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            return next["name"] != 'colgroup'
        else:
            return True
    elif tagname in ('thead', 'tbody'):
        # A thead element's end tag may be omitted if the thead element
        # is immediately followed by a tbody or tfoot element.
        # A tbody element's end tag may be omitted if the tbody element
        # is immediately followed by a tbody or tfoot element, or if
        # there is no more content in the parent element.
        # A tfoot element's end tag may be omitted if the tfoot element
        # is immediately followed by a tbody element, or if there is no
        # more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        if type == "StartTag":
            return next["name"] in ['tbody', 'tfoot']
        elif tagname == 'tbody':
            # Only tbody (not thead) may omit its end tag at the end
            # of the parent element.
            return type == "EndTag" or type is None
        else:
            return False
    elif tagname == 'tfoot':
        # A tfoot element's end tag may be omitted if the tfoot element
        # is immediately followed by a tbody element, or if there is no
        # more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        if type == "StartTag":
            return next["name"] == 'tbody'
        else:
            return type == "EndTag" or type is None
    elif tagname in ('td', 'th'):
        # A td element's end tag may be omitted if the td element is
        # immediately followed by a td or th element, or if there is
        # no more content in the parent element.
        # A th element's end tag may be omitted if the th element is
        # immediately followed by a td or th element, or if there is
        # no more content in the parent element.
        if type == "StartTag":
            return next["name"] in ('td', 'th')
        else:
            return type == "EndTag" or type is None
    # Any other element: the end tag is never optional.
    return False

View File

@ -0,0 +1,38 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import re
import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class Filter(_base.Filter):
    """Token-stream filter that collapses runs of whitespace, except
    inside elements whose whitespace must be preserved (pre, textarea
    and the RCDATA elements)."""

    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        # Nesting depth inside space-preserving content; any start tag
        # seen while already preserving increments it so that matching
        # end tags balance the count back out.
        depth = 0
        for token in _base.Filter.__iter__(self):
            kind = token["type"]
            if kind == "StartTag" and (depth or
                                       token["name"] in self.spacePreserveElements):
                depth += 1
            elif kind == "EndTag" and depth:
                depth -= 1
            elif not depth:
                if kind == "SpaceCharacters":
                    # Whitespace-only text outside preserved content is
                    # dropped entirely.
                    continue
                if kind == "Characters":
                    token["data"] = collapse_spaces(token["data"])
            yield token
# Precompiled once at import time: the original recompiled the pattern on
# every call. (re's internal cache mitigates the cost, but hoisting the
# compile is free and avoids the per-call cache lookup.)
_collapse_re = re.compile(u"[%s]+" % spaceCharacters)

def collapse_spaces(text):
    """Replace each run of HTML whitespace characters in *text* with a
    single space and return the result."""
    return _collapse_re.sub(' ', text)

View File

@ -3,14 +3,14 @@
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
# always exist.
# * We also deal with content when there's no DOCTYPE.
# It is expected that the specification will catch up with us in due course ;-)
# * </br> creates a <br> element.
#
# We haven't updated DOCTYPE handling yet
#
# It should be trivial to add the following cases. However, we should probably
# also look into comment handling and such then...
# * A <p> element end tag creates an empty <p> element when there's no <p>
# element in scope.
# * A <br> element end tag creates an empty <br> element.
try:
frozenset
@ -20,6 +20,7 @@ except NameError:
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
import sys
import tokenizer
@ -30,27 +31,32 @@ from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements, voidElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. This class is almost always a subclass of
html5lib.treebuilders._base.TreeBuilder
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
"""
# Raise an exception on the first error encountered
self.strict = strict
self.tree = tree()
self.tokenizer_class = tokenizer
self.errors = []
# "quirks" / "almost-standards" / "standards"
self.quirksMode = "standards"
self.phases = {
"initial": InitialPhase(self, self.tree),
"rootElement": RootElementPhase(self, self.tree),
@ -78,15 +84,15 @@ class HTMLParser(object):
self.firstStartTag = False
self.errors = []
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
parseMeta=innerHTML)
self.tokenizer = self.tokenizer_class(stream, encoding,
parseMeta=not innerHTML)
if innerHTML:
self.innerHTML = container.lower()
if self.innerHTML in ('title', 'textarea'):
if self.innerHTML in cdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
elif self.innerHTML in rcdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
@ -113,10 +119,12 @@ class HTMLParser(object):
method = getattr(self.phase, "process%s" % type, None)
if type in ("Characters", "SpaceCharacters", "Comment"):
method(token["data"])
elif type in ("StartTag", "Doctype"):
elif type == "StartTag":
method(token["name"], token["data"])
elif type == "EndTag":
method(token["name"])
elif type == "Doctype":
method(token["name"], token["publicId"], token["systemId"], token["correct"])
else:
self.parseError(token["data"])
@ -158,10 +166,6 @@ class HTMLParser(object):
if self.strict:
raise ParseError
def atheistParseError(self):
"""This error is not an error"""
pass
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@ -171,9 +175,7 @@ class HTMLParser(object):
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
if token["name"] in voidElements:
self.atheistParseError()
else:
if token["name"] not in voidElements:
self.parseError(_("Solidus (/) incorrectly placed in tag."))
token["type"] = "StartTag"
@ -283,7 +285,7 @@ class Phase(object):
# overridden.
self.tree.insertComment(data, self.tree.openElements[-1])
def processDoctype(self, name, error):
def processDoctype(self, name, publicId, systemId, correct):
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data):
@ -319,10 +321,101 @@ class InitialPhase(Phase):
def processComment(self, data):
self.tree.insertComment(data, self.tree.document)
def processDoctype(self, name, error):
if error:
def processDoctype(self, name, publicId, systemId, correct):
nameLower = name.translate(asciiUpper2Lower)
if nameLower != "html" or publicId != None or\
systemId != None:
self.parser.parseError(_("Erroneous DOCTYPE."))
# XXX need to update DOCTYPE tokens
self.tree.insertDoctype(name)
if publicId == None:
publicId = ""
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if nameLower != "html":
# XXX quirks mode
pass
else:
if publicId in\
("+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
"-//ietf//dtd html 2.0 level 1//en",
"-//ietf//dtd html 2.0 level 2//en",
"-//ietf//dtd html 2.0 strict level 1//en",
"-//ietf//dtd html 2.0 strict level 2//en",
"-//ietf//dtd html 2.0 strict//en",
"-//ietf//dtd html 2.0//en",
"-//ietf//dtd html 2.1e//en",
"-//ietf//dtd html 3.0//en",
"-//ietf//dtd html 3.0//en//",
"-//ietf//dtd html 3.2 final//en",
"-//ietf//dtd html 3.2//en",
"-//ietf//dtd html 3//en",
"-//ietf//dtd html level 0//en",
"-//ietf//dtd html level 0//en//2.0",
"-//ietf//dtd html level 1//en",
"-//ietf//dtd html level 1//en//2.0",
"-//ietf//dtd html level 2//en",
"-//ietf//dtd html level 2//en//2.0",
"-//ietf//dtd html level 3//en",
"-//ietf//dtd html level 3//en//3.0",
"-//ietf//dtd html strict level 0//en",
"-//ietf//dtd html strict level 0//en//2.0",
"-//ietf//dtd html strict level 1//en",
"-//ietf//dtd html strict level 1//en//2.0",
"-//ietf//dtd html strict level 2//en",
"-//ietf//dtd html strict level 2//en//2.0",
"-//ietf//dtd html strict level 3//en",
"-//ietf//dtd html strict level 3//en//3.0",
"-//ietf//dtd html strict//en",
"-//ietf//dtd html strict//en//2.0",
"-//ietf//dtd html strict//en//3.0",
"-//ietf//dtd html//en",
"-//ietf//dtd html//en//2.0",
"-//ietf//dtd html//en//3.0",
"-//metrius//dtd metrius presentational//en",
"-//microsoft//dtd internet explorer 2.0 html strict//en",
"-//microsoft//dtd internet explorer 2.0 html//en",
"-//microsoft//dtd internet explorer 2.0 tables//en",
"-//microsoft//dtd internet explorer 3.0 html strict//en",
"-//microsoft//dtd internet explorer 3.0 html//en",
"-//microsoft//dtd internet explorer 3.0 tables//en",
"-//netscape comm. corp.//dtd html//en",
"-//netscape comm. corp.//dtd strict html//en",
"-//o'reilly and associates//dtd html 2.0//en",
"-//o'reilly and associates//dtd html extended 1.0//en",
"-//spyglass//dtd html 2.0 extended//en",
"-//sq//dtd html 2.0 hotmetal + extensions//en",
"-//sun microsystems corp.//dtd hotjava html//en",
"-//sun microsystems corp.//dtd hotjava strict html//en",
"-//w3c//dtd html 3 1995-03-24//en",
"-//w3c//dtd html 3.2 draft//en",
"-//w3c//dtd html 3.2 final//en",
"-//w3c//dtd html 3.2//en",
"-//w3c//dtd html 3.2s draft//en",
"-//w3c//dtd html 4.0 frameset//en",
"-//w3c//dtd html 4.0 transitional//en",
"-//w3c//dtd html experimental 19960712//en",
"-//w3c//dtd html experimental 970421//en",
"-//w3c//dtd w3 html//en",
"-//w3o//dtd w3 html 3.0//en",
"-//w3o//dtd w3 html 3.0//en//",
"-//w3o//dtd w3 html strict 3.0//en//",
"-//webtechs//dtd mozilla html 2.0//en",
"-//webtechs//dtd mozilla html//en",
"-/w3c/dtd html 4.0 transitional/en",
"html")\
or (publicId in\
("-//w3c//dtd html 4.01 frameset//EN",
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
or (systemId != None and\
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
#XXX quirks mode
pass
self.parser.phase = self.parser.phases["rootElement"]
def processSpaceCharacters(self, data):
@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
("html", self.endTagHtml)
(("html", "head", "body", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase):
self.startTagHead("head", {})
self.parser.phase.processStartTag(name, attributes)
def endTagHtml(self, name):
def endTagImplyHead(self, name):
self.startTagHead("head", {})
self.parser.phase.processEndTag(name)
@ -437,7 +530,7 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead),
("html", self.endTagHtml),
(("html", "body", "br"), self.endTagImplyAfterHead),
(("title", "style", "script"), self.endTagTitleStyleScript)
])
self.endTagHandler.default = self.endTagOther
@ -499,7 +592,11 @@ class InHeadPhase(Phase):
def startTagBaseLinkMeta(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.appendToHead(element)
if (self.tree.headPointer is not None and
self.parser.phase == self.parser.phases["inHead"]):
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
def startTagOther(self, name, attributes):
self.anythingElse()
@ -512,7 +609,7 @@ class InHeadPhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtml(self, name):
def endTagImplyAfterHead(self, name):
self.anythingElse()
self.parser.phase.processEndTag(name)
@ -592,9 +689,9 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("script", "style"), self.startTagScriptStyle),
(("base", "link", "meta", "title"),
self.startTagFromHead),
(("base", "link", "meta", "script", "style"),
self.startTagProcessInHead),
("title", self.startTagTitle),
("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl",
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@ -604,8 +701,9 @@ class InBodyPhase(Phase):
("plaintext",self.startTagPlaintext),
(headingElements, self.startTagHeading),
("a", self.startTagA),
(("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
"strong", "tt", "u"),self.startTagFormatting),
(("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
"tt", "u"),self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("marquee", "object"), self.startTagMarqueeObject),
("xmp", self.startTagXmp),
@ -642,7 +740,8 @@ class InBodyPhase(Phase):
(("head", "frameset", "select", "optgroup", "option", "table",
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"td", "th"), self.endTagMisplaced),
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
("br", self.endTagBr),
(("area", "basefont", "bgsound", "embed", "hr", "image",
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
@ -659,11 +758,13 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
def processSpaceCharactersPre(self, data):
#Sometimes (start of <pre> blocks) we want to drop leading newlines
def processSpaceCharactersDropNewline(self, data):
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
# leading newlines
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
and not self.tree.openElements[-1].hasContent()):
if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
or self.tree.openElements[-1].name == "textarea")
and not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.insertText(data)
@ -675,10 +776,10 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data)
def startTagScriptStyle(self, name, attributes):
def startTagProcessInHead(self, name, attributes):
self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagFromHead(self, name, attributes):
def startTagTitle(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag (" + name +\
") that belongs in the head. Moved."))
self.parser.phases["inHead"].processStartTag(name, attributes)
@ -698,7 +799,7 @@ class InBodyPhase(Phase):
self.endTagP("p")
self.tree.insertElement(name, attributes)
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersPre
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, name, attributes):
if self.tree.formPointer:
@ -717,9 +818,16 @@ class InBodyPhase(Phase):
# AT Use reversed in Python 2.4...
for i, node in enumerate(self.tree.openElements[::-1]):
if node.name in stopName:
poppedNodes = []
for j in range(i+1):
self.tree.openElements.pop()
poppedNodes.append(self.tree.openElements.pop())
if i >= 1:
self.parser.parseError("Missing end tag%s (%s)"%
(i > 1 and "s" or "",
", ".join([item.name for item in
poppedNodes[:-1]])))
break
# Phrasing elements are all non special, non scoping, non
# formatting elements
@ -738,14 +846,16 @@ class InBodyPhase(Phase):
def startTagHeading(self, name, attributes):
if self.tree.elementInScope("p"):
self.endTagP("p")
for item in headingElements:
if self.tree.elementInScope(item):
self.parser.parseError(_("Unexpected start tag (" + name +\
")."))
item = self.tree.openElements.pop()
while item.name not in headingElements:
item = self.tree.openElements.pop()
break
# Uncomment the following for IE7 behavior:
#
#for item in headingElements:
# if self.tree.elementInScope(item):
# self.parser.parseError(_("Unexpected start tag (" + name +\
# ")."))
# item = self.tree.openElements.pop()
# while item.name not in headingElements:
# item = self.tree.openElements.pop()
# break
self.tree.insertElement(name, attributes)
def startTagA(self, name, attributes):
@ -765,6 +875,12 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(name, attributes)
def startTagNobr(self, name, attributes):
self.tree.reconstructActiveFormattingElements()
if self.tree.elementInScope("nobr"):
self.processEndTag("nobr")
self.addFormattingElement(name, attributes)
def startTagButton(self, name, attributes):
if self.tree.elementInScope("button"):
self.parser.parseError(_("Unexpected start tag (button) implied "
@ -840,6 +956,7 @@ class InBodyPhase(Phase):
# XXX Form element pointer checking here as well...
self.tree.insertElement(name, attributes)
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagCdata(self, name, attributes):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
@ -861,11 +978,13 @@ class InBodyPhase(Phase):
self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Ignored."))
def startTagNew(self, name, other):
def startTagNew(self, name, attributes):
"""New HTML5 elements, "event-source", "section", "nav",
"article", "aside", "header", "footer", "datagrid", "command"
"""
raise NotImplementedError
sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
self.startTagOther(name, attributes)
#raise NotImplementedError
def startTagOther(self, name, attributes):
self.tree.reconstructActiveFormattingElements()
@ -1082,6 +1201,12 @@ class InBodyPhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (" + name +\
u"). Ignored."))
def endTagBr(self, name):
self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(name, {})
self.tree.openElements.pop()
def endTagNone(self, name):
# This handles elements with no end tag.
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
@ -1097,7 +1222,9 @@ class InBodyPhase(Phase):
"""New HTML5 elements, "event-source", "section", "nav",
"article", "aside", "header", "footer", "datagrid", "command"
"""
raise NotImplementedError
sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name)
self.endTagOther(name)
#raise NotImplementedError
def endTagOther(self, name):
# XXX This logic should be moved into the treebuilder
@ -1222,10 +1349,10 @@ class InTablePhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
self.parser.insertFromTable = True
self.tree.insertFromTable = True
# Process the end tag in the "in body" mode
self.parser.phases["inBody"].processEndTag(name)
self.parser.insertFromTable = False
self.tree.insertFromTable = False
class InCaptionPhase(Phase):
@ -1699,7 +1826,7 @@ class AfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
# XXX We should prolly add a handler for "html" here as well...
# XXX We should prolly add a handler for here as well...
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther

View File

@ -31,15 +31,17 @@ class HTMLInputStream(object):
"""
# List of where new lines occur
self.newLines = []
self.newLines = [0]
# Raw Stream
# Raw Stream
self.rawStream = self.openStream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
self.numBytesMeta = 512
#Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
@ -48,20 +50,12 @@ class HTMLInputStream(object):
encoding = self.detectEncoding(parseMeta, chardet)
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
# Normalize newlines and null characters
uString = re.sub('\r\n?', '\n', uString)
uString = re.sub('\x00', u'\uFFFD', uString)
# Convert the unicode string into a list to be used as the data stream
self.dataStream = uString
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
self.queue = []
# Reset position in the list to read from
self.reset()
self.line = self.col = 0
self.lineLengths = []
def openStream(self, source):
"""Produces a file object from source.
@ -74,6 +68,8 @@ class HTMLInputStream(object):
stream = source
else:
# Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode):
source = source.encode('utf-8')
import cStringIO
stream = cStringIO.StringIO(str(source))
return stream
@ -90,10 +86,18 @@ class HTMLInputStream(object):
#Guess with chardet, if avaliable
if encoding is None and chardet:
try:
import chardet
buffer = self.rawStream.read()
encoding = chardet.detect(buffer)['encoding']
self.rawStream = self.openStream(buffer)
from chardet.universaldetector import UniversalDetector
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
except ImportError:
pass
# If all else fails use the default encoding
@ -119,60 +123,83 @@ class HTMLInputStream(object):
}
# Go to beginning of file and read in 4 bytes
self.rawStream.seek(0)
string = self.rawStream.read(4)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string) # UTF-32
seek = 4
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
self.seek(string, encoding and seek or 0)
return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
self.rawStream.seek(0)
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
return parser.getEncoding()
def determineNewLines(self):
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
self.newLines.append(0)
for i in xrange(len(self.dataStream)):
if self.dataStream[i] == u"\n":
self.newLines.append(i)
def position(self):
"""Returns (line, col) of the current position in the stream."""
# Generate list of new lines first time around
if not self.newLines:
self.determineNewLines()
line = 0
tell = self.tell
for pos in self.newLines:
if pos < tell:
line += 1
line, col = self.line, self.col
for c in self.queue[::-1]:
if c == '\n':
line -= 1
assert col == 0
col = self.lineLengths[line]
else:
break
col = tell - self.newLines[line-1] - 1
return (line, col)
def reset(self):
"""Resets the position in the stream back to the start."""
self.tell = 0
col -= 1
return (line + 1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
@ -181,12 +208,28 @@ class HTMLInputStream(object):
if self.queue:
return self.queue.pop(0)
else:
try:
self.tell += 1
return self.dataStream[self.tell - 1]
except:
c = self.dataStream.read(1, 1)
if not c:
self.col += 1
return EOF
# Normalize newlines and null characters
if c == '\x00': c = u'\uFFFD'
if c == '\r':
c = self.dataStream.read(1, 1)
if c != '\n':
self.queue.insert(0, unicode(c))
c = '\n'
# update position in stream
if c == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return unicode(c)
def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be
@ -194,23 +237,20 @@ class HTMLInputStream(object):
"""
charStack = [self.char()]
# First from the queue
while charStack[-1] and (charStack[-1] in characters) == opposite \
and self.queue:
charStack.append(self.queue.pop(0))
# Then the rest
while charStack[-1] and (charStack[-1] in characters) == opposite:
try:
self.tell += 1
charStack.append(self.dataStream[self.tell - 1])
except:
charStack.append(EOF)
charStack.append(self.char())
# Put the character stopped on back to the front of the queue
# from where it came.
self.queue.insert(0, charStack.pop())
return "".join(charStack)
c = charStack.pop()
if c != EOF:
self.queue.insert(0, c)
return u"".join(charStack)
def unget(self, chars):
if chars:
self.queue = list(chars) + self.queue
class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods

View File

@ -15,10 +15,13 @@ References:
"""
import html5parser
from constants import voidElements
from constants import voidElements, contentModelFlags
import gettext
_ = gettext.gettext
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
@ -45,6 +48,11 @@ class XMLParser(html5parser.HTMLParser):
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
@ -66,16 +74,21 @@ class XHTMLParser(XMLParser):
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag" and \
token["name"] not in voidElements and \
token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
break
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
self.tree.insertText('')
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token

189
planet/vendor/html5lib/sanitizer.py vendored Normal file
View File

@ -0,0 +1,189 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
class HTMLSanitizer(HTMLTokenizer):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    # Whitelists below: anything NOT listed is escaped (elements) or
    # dropped (attributes, CSS properties, URI protocols).
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
        'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
        'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
        'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
        'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
        'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
        'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
        'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
        'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
        'origin', 'overline-position', 'overline-thickness', 'panose-1',
        'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
        'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
        'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
        'stemh', 'stemv', 'stop-color', 'stop-opacity',
        'strikethrough-position', 'strikethrough-thickness', 'stroke',
        'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
        'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
        'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']

    # Attributes whose value is a URI: these get extra protocol filtering
    # in __iter__.
    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
        'xlink:href', 'xml:base']

    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
        'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
        'ssh', 'sftp', 'rtsp', 'afs' ]

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def __iter__(self):
        """Yield the token stream with disallowed markup neutralized.

        Allowed tags pass through with their attributes filtered;
        disallowed tags are rewritten into escaped Characters tokens;
        comments are dropped entirely.
        """
        for token in HTMLTokenizer.__iter__(self):
            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
                if token["name"] in self.allowed_elements:
                    if token.has_key("data"):
                        # Build a dict from the reversed attribute list so
                        # that, for duplicated attributes, the FIRST
                        # occurrence wins; unlisted attributes are dropped.
                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
                        for attr in self.attr_val_is_uri:
                            if not attrs.has_key(attr): continue
                            # Strip control chars/whitespace that browsers
                            # ignore (e.g. "java\0script:") before checking
                            # the scheme against the protocol whitelist.
                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
                                del attrs[attr]
                        if attrs.has_key('style'):
                            attrs['style'] = self.sanitize_css(attrs['style'])
                        token["data"] = [[name,val] for name,val in attrs.items()]
                    yield token
                else:
                    # Disallowed element: re-serialize the tag as escaped
                    # text so it renders literally instead of executing.
                    if token["type"] == "EndTag":
                        token["data"] = "</%s>" % token["name"]
                    elif token["data"]:
                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                        token["data"] = "<%s%s>" % (token["name"],attrs)
                    else:
                        token["data"] = "<%s>" % token["name"]
                    if token["type"] == "EmptyTag":
                        token["data"]=token["data"][:-1] + "/>"
                    token["type"] = "Characters"
                    del token["name"]
                    yield token
            elif token["type"] == "Comment":
                # Comments are dropped (they can smuggle conditional markup).
                pass
            else:
                yield token

    def sanitize_css(self, style):
        """Return *style* with url() references and non-whitelisted
        properties removed; returns '' if the declaration looks unsafe."""
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet: reject the whole declaration unless it is made only of
        # benign characters / quoted words / numeric groups, and parses as
        # "prop: value;" pairs.
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value: continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                # Shorthand properties: every keyword must be whitelisted or
                # look like a color/size literal, otherwise drop the pair.
                # NOTE(review): this checks acceptable_css_keywords rather
                # than the overridable allowed_css_keywords -- presumably an
                # oversight; subclass overrides are ignored here. Confirm.
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

View File

@ -0,0 +1,3 @@
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer

View File

@ -0,0 +1,216 @@
# Compatibility shim: frozenset is a builtin from Python 2.4 on; on 2.3 we
# fall back to the equivalent immutable set type from the sets module.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
from html5lib.filters.whitespace import Filter as WhitespaceFilter
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements
from xml.sax.saxutils import escape
# Collapse the space-character set into a single lookup string for use in
# the attribute-quoting check below.
spaceCharacters = u"".join(spaceCharacters)

# Register an "htmlentityreplace" codec error handler: characters that the
# target encoding cannot represent are replaced with a named character
# reference when one exists, otherwise with a numeric character reference.
# On interpreters without codecs.register_error we fall back to strict mode.
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from html5lib.constants import entities

    # Map each character to an entity name, preferring the lowercase name.
    encode_entity_map = {}
    for k, v in entities.items():
        if v != "&" and encode_entity_map.get(v) != k.lower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler: substitute named entities for unencodable
        characters, falling back to numeric character references."""
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            for c in exc.object[exc.start:exc.end]:
                # BUG FIX 1: original read ``ex.object`` -- ``ex`` was never
                # defined, so the handler raised NameError when invoked.
                # BUG FIX 2: original rebound ``c`` to the map lookup, so the
                # fallback branch called .encode() on None.
                e = encode_entity_map.get(c)
                if e:
                    res.append("&")
                    res.append(e)
                    res.append(";")
                else:
                    # NOTE(review): UnicodeTranslateError has no .encoding
                    # attribute, so this branch assumes an encode error --
                    # preserved from the original; confirm translate errors
                    # cannot reach here in practice.
                    res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
            return (u"".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)

    register_error(unicode_encode_errors, htmlentityreplace_errors)
    del register_error

def encode(text, encoding):
    """Encode *text* to *encoding*, replacing unencodable characters with
    entity or character references (see unicode_encode_errors above)."""
    return text.encode(encoding, unicode_encode_errors)
class HTMLSerializer(object):
    """Serialize a treewalker token stream back to (X)HTML text.

    Class attributes below are serialization options; each may be
    overridden per instance via keyword arguments to __init__.
    """

    # Attribute quoting: quote always, vs. only when the value needs it.
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True
    # Emit boolean attributes minimized (e.g. "checked") when possible.
    minimize_boolean_attributes = True
    # Void-element trailing solidus ("<br />") options.
    use_trailing_solidus = False
    space_before_trailing_solidus = True
    escape_lt_in_attrs = False
    # Escape text inside rcdata elements (script/style) instead of
    # passing it through raw.
    escape_rcdata = False

    omit_optional_tags = True

    strip_whitespace = False

    inject_meta_charset = True

    # Option names recognized as keyword arguments by __init__.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
          "minimize_boolean_attributes", "use_trailing_solidus",
          "space_before_trailing_solidus", "omit_optional_tags",
          "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
          "escape_rcdata")

    def __init__(self, **kwargs):
        """Accept any option from ``options`` as a keyword argument; an
        explicit quote_char disables best-quote-char selection."""
        if kwargs.has_key('quote_char'):
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def serialize(self, treewalker, encoding=None):
        """Generator yielding serialized output fragments for each token.

        If *encoding* is given, fragments are byte strings in that
        encoding; otherwise unicode. Errors are collected in self.errors
        (and raised immediately when self.strict is set).
        """
        # in_cdata tracks whether we are inside an rcdata element whose
        # content must not be escaped.
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            treewalker = WhitespaceFilter(treewalker)
        if self.omit_optional_tags:
            treewalker = OptionalTagFilter(treewalker)
        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                # Only the doctype name is emitted (no public/system ids).
                doctype = u"<!DOCTYPE %s>" % token["name"]
                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    # Rcdata/whitespace content passes through unescaped;
                    # a literal "</" inside rcdata would end the element.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                # Normalize attributes to a sorted list of (name, value).
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attrs.sort()
                attributes = []
                for k,v in attrs:
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    # Emit "=value" unless the attribute is boolean and
                    # minimization is enabled.
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple()) \
                      and k not in booleanAttributes.get("", tuple())):
                        attributes.append("=")
                        # Quote when forced, when the value is empty, or
                        # when it contains whitespace/quote/angle chars.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x,y: x or (y in v),
                                spaceCharacters + "<>\"'", False)
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            # Pick the quote char needing the least escaping,
                            # then escape any embedded copies of it.
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                end_tag = u"</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif type == "Comment":
                data = token["data"]
                # "--" is not permitted inside a comment.
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = u"<!--%s-->" % token["data"]
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            else:
                # Unknown token type: its payload is the error message.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize the whole stream and return it as one string."""
        if encoding:
            return "".join(list(self.serialize(treewalker, encoding)))
        else:
            return u"".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization error; raise immediately in strict mode."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree.

    Raised by HTMLSerializer.serializeError when strict mode is enabled.
    BUG FIX: the original declared this with ``def`` (making it a plain
    function with a parameter named Exception), so ``raise SerializeError``
    could not work; it must be an Exception subclass.
    """
    pass

View File

@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
    """HTMLSerializer configured for XHTML-style output: all option
    overrides below tighten the HTML defaults toward well-formed XML."""
    # Always quote attribute values.
    quote_attr_values = True
    # Never minimize boolean attributes (emit checked="checked" form).
    minimize_boolean_attributes = False
    # Self-close void elements with a trailing solidus.
    use_trailing_solidus = True
    # Escape "<" inside attribute values.
    escape_lt_in_attrs = True
    # Always emit both start and end tags.
    omit_optional_tags = False
    # Escape script/style content instead of emitting it raw.
    escape_rcdata = True

View File

@ -9,7 +9,7 @@ _ = gettext.gettext
from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF
from inputstream import HTMLInputStream
@ -50,18 +50,30 @@ class HTMLTokenizer(object):
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
"bogusComment":self.bogusCommentState,
"markupDeclarationOpen":self.markupDeclarationOpenState,
"commentStart":self.commentStartState,
"commentStartDash":self.commentStartDashState,
"comment":self.commentState,
"commentDash":self.commentDashState,
"commentEndDash":self.commentEndDashState,
"commentEnd":self.commentEndState,
"doctype":self.doctypeState,
"beforeDoctypeName":self.beforeDoctypeNameState,
"doctypeName":self.doctypeNameState,
"afterDoctypeName":self.afterDoctypeNameState,
"beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
"doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
"doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
"afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
"beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
"doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
"doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
"afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
"bogusDoctype":self.bogusDoctypeState
}
# Setup the initial tokenizer state
self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False
self.lastFourChars = []
self.state = self.states["data"]
# The current token being created
@ -77,7 +89,6 @@ class HTMLTokenizer(object):
to return we yield the token which pauses processing until the next token
is requested.
"""
self.stream.reset()
self.tokenQueue = []
# Start processing. When EOF is reached self.state will return False
# instead of True and the loop will terminate.
@ -102,7 +113,7 @@ class HTMLTokenizer(object):
# The character we just consumed need to be put back on the stack so it
# doesn't get lost...
self.stream.queue.append(data)
self.stream.unget(data)
def consumeNumberEntity(self, isHex):
"""This function returns either U+FFFD or the character based on the
@ -132,70 +143,71 @@ class HTMLTokenizer(object):
# Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix)
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
# smaller) we need to do the "windows trick".
if 127 < charAsInt < 160:
#XXX - removed parse error from windows 1252 entity for now
#we may want to reenable this later
#self.tokenQueue.append({"type": "ParseError", "data":
# _("Entity used with illegal number (windows-1252 reference).")})
if charAsInt == 13:
self.tokenQueue.append({"type": "ParseError", "data":
_("Incorrect CR newline entity. Replaced with LF.")})
charAsInt = 10
elif 127 < charAsInt < 160:
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
self.tokenQueue.append({"type": "ParseError", "data":
_("Entity used with illegal number (windows-1252 reference).")})
charAsInt = entitiesWindows1252[charAsInt - 128]
# 0 is not a good number.
if charAsInt == 0:
charAsInt = 65533
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
# according to hsivonen. Also, unichr has a limitation of 65535
char = unichr(charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")})
# 0 is not a good number, neither are illegal Unicode code points.
if charAsInt > 0 and charAsInt <= 1114111:
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
# according to hsivonen. Also, unichr has a limitation of 65535
char = unichr(charAsInt)
except:
try:
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")})
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
if c != u";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity didn't end with ';'.")})
self.stream.queue.append(c)
self.stream.unget(c)
return char
def consumeEntity(self):
def consumeEntity(self, fromAttribute=False):
char = None
charStack = [self.stream.char()]
if charStack[0] == u"#":
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
self.stream.unget(charStack)
elif charStack[0] == u"#":
# We might have a number entity here.
charStack.extend([self.stream.char(), self.stream.char()])
if EOF in charStack:
# If we reach the end of the file put everything up to EOF
# back in the queue
charStack = charStack[:charStack.index(EOF)]
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected. Got end of file instead.")})
else:
if charStack[1].lower() == u"x" \
and charStack[2] in hexDigits:
# Hexadecimal entity detected.
self.stream.queue.append(charStack[2])
self.stream.unget(charStack[2])
char = self.consumeNumberEntity(True)
elif charStack[1] in digits:
# Decimal entity detected.
self.stream.queue.extend(charStack[1:])
self.stream.unget(charStack[1:])
char = self.consumeNumberEntity(False)
else:
# No number entity detected.
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected but none found.")})
# Break out if we reach the end of the file
elif charStack[0] == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Entity expected. Got end of file instead.")})
else:
# At this point in the process might have named entity. Entities
# are stored in the global variable "entities".
@ -216,7 +228,8 @@ class HTMLTokenizer(object):
# that may match an entity
entityName = None
# Try to find the longest entity the string will match
# Try to find the longest entity the string will match to take care
# of &noti for instance.
for entityLength in xrange(len(charStack)-1,1,-1):
possibleEntityName = "".join(charStack[:entityLength])
if possibleEntityName in entities:
@ -224,24 +237,26 @@ class HTMLTokenizer(object):
break
if entityName is not None:
char = entities[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if not charStack[-1] == ";":
if entityName[-1] != ";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity didn't end with ';'.")})
self.stream.queue.extend(charStack[entityLength:])
if entityName[-1] != ";" and fromAttribute and \
(charStack[entityLength] in asciiLetters
or charStack[entityLength] in digits):
self.stream.unget(charStack)
else:
char = entities[entityName]
self.stream.unget(charStack[entityLength:])
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity expected. Got none.")})
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
return char
def processEntityInAttribute(self):
"""This method replaces the need for "entityInAttributeValueState".
"""
entity = self.consumeEntity()
entity = self.consumeEntity(True)
if entity:
self.currentToken["data"][-1][1] += entity
else:
@ -266,12 +281,30 @@ class HTMLTokenizer(object):
def dataState(self):
data = self.stream.char()
if data == u"&" and self.contentModelFlag in\
if self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
if len(self.lastFourChars) == 4:
self.lastFourChars.pop(0)
self.lastFourChars.append(data)
if data == "&" and self.contentModelFlag in\
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
self.state = self.states["entityData"]
elif data == u"<" and self.contentModelFlag !=\
contentModelFlags["PLAINTEXT"]:
elif data == "-" and self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == False and\
"".join(self.lastFourChars) == "<!--":
self.escapeFlag = True
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == "<" and (self.contentModelFlag ==\
contentModelFlags["PCDATA"] or (self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == False)):
self.state = self.states["tagOpen"]
elif data == ">" and self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
self.escapeFlag = False
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == EOF:
# Tokenization ends.
return False
@ -285,7 +318,7 @@ class HTMLTokenizer(object):
data + self.stream.charsUntil(spaceCharacters, True)})
else:
self.tokenQueue.append({"type": "Characters", "data":
data + self.stream.charsUntil((u"&", u"<"))})
data + self.stream.charsUntil(("&", "<", ">", "-"))})
return True
def entityDataState(self):
@ -321,14 +354,14 @@ class HTMLTokenizer(object):
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got '?' instead (HTML doesn't "
"support processing instructions).")})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["bogusComment"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got something else instead")})
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["data"]
else:
# We know the content model flag is set to either RCDATA or CDATA
@ -338,7 +371,7 @@ class HTMLTokenizer(object):
self.state = self.states["closeTagOpen"]
else:
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.insert(0, data)
self.stream.unget(data)
self.state = self.states["data"]
return True
@ -361,7 +394,7 @@ class HTMLTokenizer(object):
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
if self.currentToken \
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
@ -372,8 +405,6 @@ class HTMLTokenizer(object):
# emitting the end tag token.
self.contentModelFlag = contentModelFlags["PCDATA"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag after seeing '</'. None found.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
@ -381,27 +412,25 @@ class HTMLTokenizer(object):
# method to be walked through.
return True
if self.contentModelFlag == contentModelFlags["PCDATA"]:
data = self.stream.char()
if data in asciiLetters:
self.currentToken =\
{"type": "EndTag", "name": data, "data": []}
self.state = self.states["tagName"]
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected end of file.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.queue.append(data)
self.state = self.states["bogusComment"]
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type":"EndTag", "name":data, "data":[]}
self.state = self.states["tagName"]
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected end of file.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.unget(data)
self.state = self.states["bogusComment"]
return True
def tagNameState(self):
@ -413,11 +442,6 @@ class HTMLTokenizer(object):
self.stream.charsUntil(asciiLetters, True)
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character when getting the tag name.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in the tag name.")})
@ -440,11 +464,6 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == u"/":
self.processSolidusInTag()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute name instead.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute name instead.")})
@ -473,12 +492,6 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute name.")})
self.emitCurrentToken()
leavingThisState = False
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute name.")})
@ -515,11 +528,6 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected = or end of tag.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected = or end of tag.")})
@ -537,16 +545,11 @@ class HTMLTokenizer(object):
self.state = self.states["attributeValueDoubleQuoted"]
elif data == u"&":
self.state = self.states["attributeValueUnQuoted"]
self.stream.queue.append(data);
self.stream.unget(data);
elif data == u"'":
self.state = self.states["attributeValueSingleQuoted"]
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute value.")})
@ -594,11 +597,6 @@ class HTMLTokenizer(object):
self.processEntityInAttribute()
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value.")})
@ -625,27 +623,66 @@ class HTMLTokenizer(object):
charStack = [self.stream.char(), self.stream.char()]
if charStack == [u"-", u"-"]:
self.currentToken = {"type": "Comment", "data": ""}
self.state = self.states["comment"]
self.state = self.states["commentStart"]
else:
for x in xrange(5):
charStack.append(self.stream.char())
# Put in explicit EOF check
if (not EOF in charStack and
"".join(charStack).upper() == u"DOCTYPE"):
self.currentToken =\
{"type": "Doctype", "name": "", "data": True}
self.currentToken = {"type":"Doctype", "name":"",
"publicId":None, "systemId":None, "correct":True}
self.state = self.states["doctype"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected '--' or 'DOCTYPE'. Not found.")})
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.state = self.states["bogusComment"]
return True
    def commentStartState(self):
        """Tokenizer state for the first character after "<!--".

        "-" advances toward a possible immediate comment close; ">" is the
        incorrect empty comment "<!-->" (parse error, comment emitted);
        EOF emits the comment and a parse error; any other character
        begins the comment text.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentStartDash"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Incorrect comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # Accumulate comment text up to (but not including) the next "-".
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True
def commentStartDashState(self):
data = self.stream.char()
if data == "-":
self.state = self.states["commentEnd"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Incorrect comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
self.state = self.states["comment"]
return True
def commentState(self):
data = self.stream.char()
if data == u"-":
self.state = self.states["commentDash"]
self.state = self.states["commentEndDash"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment.")})
@ -655,7 +692,7 @@ class HTMLTokenizer(object):
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
return True
def commentDashState(self):
def commentEndDashState(self):
data = self.stream.char()
if data == u"-":
self.state = self.states["commentEnd"]
@ -702,7 +739,7 @@ class HTMLTokenizer(object):
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("No space after literal string 'DOCTYPE'.")})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["beforeDoctypeName"]
return True
@ -710,19 +747,16 @@ class HTMLTokenizer(object):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data in asciiLowercase:
self.currentToken["name"] = data.upper()
self.state = self.states["doctypeName"]
elif data == u">":
# Character needs to be consumed per the specification so don't
# invoke emitCurrentTokenWithParseError with "data" as argument.
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected > character. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@ -732,30 +766,19 @@ class HTMLTokenizer(object):
def doctypeNameState(self):
data = self.stream.char()
needsDoctypeCheck = False
if data in spaceCharacters:
self.state = self.states["afterDoctypeName"]
needsDoctypeCheck = True
elif data == u">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
# We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters.
if data in asciiLowercase:
data = data.upper()
self.currentToken["name"] += data
needsDoctypeCheck = True
# After some iterations through this state it should eventually say
# "HTML". Otherwise there's an error.
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
self.currentToken["data"] = False
return True
def afterDoctypeNameState(self):
@ -766,28 +789,194 @@ class HTMLTokenizer(object):
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.currentToken["data"] = True
# XXX EMIT
self.stream.queue.append(data)
self.currentToken["correct"] = False
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
charStack = [data]
for x in xrange(5):
charStack.append(self.stream.char())
if EOF not in charStack and\
"".join(charStack).translate(asciiUpper2Lower) == "public":
self.state = self.states["beforeDoctypePublicIdentifier"]
elif EOF not in charStack and\
"".join(charStack).translate(asciiUpper2Lower) == "system":
self.state = self.states["beforeDoctypeSystemIdentifier"]
else:
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")})
self.state = self.states["bogusDoctype"]
return True
def beforeDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["publicId"] = ""
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["publicId"] = ""
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")})
self.currentToken["data"] = True
_("Unexpected end of DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def doctypePublicIdentifierDoubleQuotedState(self):
data = self.stream.char()
if data == "\"":
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["publicId"] += data
return True
def doctypePublicIdentifierSingleQuotedState(self):
data = self.stream.char()
if data == "'":
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["publicId"] += data
return True
def afterDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def beforeDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def doctypeSystemIdentifierDoubleQuotedState(self):
data = self.stream.char()
if data == "\"":
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["systemId"] += data
return True
def doctypeSystemIdentifierSingleQuotedState(self):
data = self.stream.char()
if data == "'":
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["systemId"] += data
return True
def afterDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def bogusDoctypeState(self):
data = self.stream.char()
self.currentToken["correct"] = False
if data == u">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
# XXX EMIT
self.stream.queue.append(data)
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in bogus doctype.")})
self.tokenQueue.append(self.currentToken)

View File

@ -0,0 +1,64 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
# Cache of TreeBuilder classes keyed by (lowercased) tree type name.
treeBuilderCache = {}

def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

        "simpletree" - a built-in DOM-ish tree type with support for some
                       more pythonic idioms.
        "dom" - The xml.dom.minidom DOM implementation
        "etree" - A generic builder for tree implementations exposing an
                  elementtree-like interface (known to work with
                  ElementTree, cElementTree and lxml.etree).
        "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" tree type only). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree.

    Raises ValueError if treeType is not one of the supported names.
    """
    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType in ("dom", "simpletree"):
            mod = __import__(treeType, globals())
            treeBuilderCache[treeType] = mod.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "etree":
            # Each ElementTree implementation gets its own generated module.
            import etree
            treeBuilderCache[treeType] = etree.getETreeModule(
                implementation, **kwargs).TreeBuilder
        else:
            # Fix: an unrecognised name previously fell through and the
            # function quietly returned None, deferring the failure to the
            # first attribute access on the "class". Fail fast instead.
            raise ValueError("Unrecognised treebuilder \"%s\"" % treeType)
    return treeBuilderCache.get(treeType)

View File

@ -1,4 +1,4 @@
from constants import scopingElements, tableInsertModeElements
from html5lib.constants import scopingElements, tableInsertModeElements
try:
frozenset
except NameError:

View File

@ -2,7 +2,7 @@ import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
from xml.sax.saxutils import escape
from constants import voidElements
from html5lib.constants import voidElements
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -80,9 +80,11 @@ class TreeBuilder(_base.TreeBuilder):
setattr(self.dom, 'hilite', method)
return self
def doctypeClass(self,name):
def insertDoctype(self, name):
domimpl = minidom.getDOMImplementation()
return NodeBuilder(domimpl.createDocumentType(name,None,None))
doctype = domimpl.createDocumentType(name,None,None)
self.document.appendChild(NodeBuilder(doctype))
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
@ -126,8 +128,8 @@ def testSerializer(element):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
@ -215,10 +217,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE

249
planet/vendor/html5lib/treebuilders/etree.py vendored Executable file
View File

@ -0,0 +1,249 @@
import _base
import new
import copy
# Generated builder modules, keyed by "_<implementation name>builder".
moduleCache = {}

def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a module object wrapping the tree builder generated for the
    given ElementTree implementation, creating and caching it on first use."""
    moduleName = "_" + ElementTreeImplementation.__name__ + "builder"
    if moduleName not in moduleCache:
        wrapper = new.module(moduleName)
        wrapper.__dict__.update(
            getETreeBuilder(ElementTreeImplementation, fullTree))
        moduleCache[moduleName] = wrapper
    return moduleCache[moduleName]
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
    """html5lib Node wrapper around a single ElementTree element.

    ElementTree has no parent pointers or text-node objects, so the wrapper
    keeps its own parallel child list and maps inserted text onto the
    underlying element's .text/.tail slots.
    """
    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []   # wrapper children, parallel to self._element
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    # The node name is proxied straight onto the underlying element's tag.
    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first
        #XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            # NOTE(review): insertChild is not defined in this file or shown
            # on _base.Node -- confirm the intended method (appendChild?).
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self
        # NOTE(review): unlike appendChild, _childNodes is not updated here,
        # so the wrapper list can drift from the real element -- confirm.

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        # Text is stored in .text (before any child) or in the .tail of the
        # child element it follows.
        if not(len(self._element)):
            if not self._element.text:
                self._element.text = ""
            self._element.text += data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            if not self._element[-1].tail:
                self._element[-1].tail = ""
            self._element[-1].tail += data
        else:
            #Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                if not self._element[index-1].tail:
                    self._element[index-1].tail = ""
                self._element[index-1].tail += data
            else:
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data

    def cloneNode(self):
        """Shallow copy: same tag and attributes, no children."""
        element = Element(self.name)
        for name, value in self.attributes.iteritems():
            element.attributes[name] = value
        return element

    def reparentChildren(self, newParent):
        if newParent.childNodes:
            # NOTE(review): raises TypeError if the last child's tail or this
            # element's text is None -- presumably callers guarantee both are
            # strings here; confirm.
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            if not newParent._element.text:
                newParent._element.text = ""
            if self._element.text is not None:
                newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
class Comment(Element):
    """Comment node backed by an ElementTree Comment element."""
    def __init__(self, data):
        # Does NOT call Element.__init__: a Comment element has no tag name
        # to set, so the wrapper attributes are initialised directly.
        self._element = ElementTree.Comment(data)
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    # The comment text lives in the element's .text slot.
    data = property(_getData, _setData)

class DocumentType(Element):
    """Doctype node; the doctype name is stored as the element's text."""
    def __init__(self, name):
        Element.__init__(self, "<!DOCTYPE>")
        self._element.text = name

class Document(Element):
    """Root wrapper node for a complete document."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_ROOT>")

class DocumentFragment(Element):
    """Root wrapper node for a document fragment."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_FRAGMENT>")
def testSerializer(element):
    """Serialize element (or an ElementTree wrapper around it) to the
    indented "|"-prefixed format used by the html5lib unit tests."""
    rv = []
    finalText = None
    def serializeElement(element, indent=0):
        if not(hasattr(element, "tag")):
            # An ElementTree wrapper rather than an element: unwrap it.
            element = element.getroot()
        if element.tag == "<!DOCTYPE>":
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag == "<DOCUMENT_ROOT>":
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
            if element.tail:
                # NOTE(review): this assignment creates a *local* inside
                # serializeElement (Python 2 has no nonlocal), so the outer
                # finalText stays None and the trailing block at the bottom
                # of this function never fires; the root tail is emitted by
                # the generic tail branch below instead. Confirm intent.
                finalText = element.tail
        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.iteritems():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))
    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string of markup."""
    rv = []
    # Mutable cell for the document root's trailing text. Python 2 has no
    # `nonlocal`, so the original `finalText = element.tail` inside
    # serializeElement silently created a shadowing local; the outer value
    # stayed None and the (also broken, TypeError-raising) final
    # rv.append("%s\"" % (' '*2, finalText)) was dead code.
    finalText = [None]
    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Fix: the original test `type(element) == type(ElementTree.ElementTree)`
            # is never true for an ElementTree instance (it compares against
            # the metaclass), so wrappers were never unwrapped. Use the same
            # hasattr check as testSerializer.
            element = element.getroot()
        if element.tag == "<!DOCTYPE>":
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag == "<DOCUMENT_ROOT>":
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText[0] = element.tail
            # list(element) == getchildren(), and survives getchildren()'s
            # removal in Python 3.9.
            for child in list(element):
                serializeElement(child)
        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("<!--%s-->"%(element.text,))
            if element.tail:
                rv.append(element.tail)
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in list(element):
                serializeElement(child)
            rv.append("</%s>"%(element.tag,))
            if element.tail:
                rv.append(element.tail)
    serializeElement(element)
    if finalText[0] is not None:
        # Emit the root's trailing text after everything else.
        rv.append(finalText[0])
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """TreeBuilder producing trees for the wrapped ElementTree implementation."""
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment
    fragmentClass = DocumentFragment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # fullTree is captured from the enclosing getETreeBuilder call:
        # return either the <DOCUMENT_ROOT> wrapper or just the html element.
        if fullTree:
            return self.document._element
        else:
            return self.document._element.find("html")

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self)._element
return locals()

View File

@ -1,5 +1,5 @@
import _base
from constants import voidElements
from html5lib.constants import voidElements
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing

View File

@ -0,0 +1,162 @@
import sys
import copy
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
class AttrList(object):
    """Dict-like view of a Beautiful Soup element's attributes.

    Reads are served from a snapshot of the attributes taken at construction
    time; writes go straight through to the underlying element (and are NOT
    reflected in the snapshot).
    """
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return self.attrs.items().__iter__()

    def __setitem__(self, name, value):
        # Fix: removed the stray no-op expression `"set attr", name, value`
        # (leftover debug output with the `print` dropped).
        self.element[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
class Element(_base.Node):
    """html5lib Node wrapper around a Beautiful Soup Tag.

    Adjacent NavigableStrings are coalesced on insertion so text is kept as
    single runs, matching what the other treebuilders produce.
    """
    def __init__(self, element, soup):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Merge with the trailing text run, then retry the append.
            newNode = TextNode(NavigableString(
                self.element.contents[-1]+node.element), self.soup)
            self.element.contents[-1].extract()
            self.appendChild(newNode)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.contents.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # NOTE(review): when index == 0, contents[index-1] is the LAST
            # child, so the merge targets the wrong node -- confirm whether
            # refNode can ever be the first child here.
            newNode = TextNode(NavigableString(
                self.element.contents[index-1]+node.element), self.soup)
            self.element.contents[index-1].extract()
            self.insertBefore(newNode, refNode)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        # Move every child (tags and text) from this node onto newParent.
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Shallow copy: same tag and attributes, no children."""
        node = Element(Tag(self.soup, self.element.name), self.soup)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents
class TextNode(Element):
    """Wrapper for a NavigableString, reusing Element's tree plumbing."""
    def __init__(self, element, soup):
        # Deliberately skips Element.__init__: text has no tag name.
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
    """TreeBuilder producing Beautiful Soup trees."""
    def documentClass(self):
        # The BeautifulSoup instance doubles as the document root node.
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup)

    def insertDoctype(self, name):
        self.soup.insert(0, Declaration(name))

    def elementClass(self, name):
        return Element(Tag(self.soup, name), self.soup)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        # Marker name recognised by testSerializer below.
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup)

    def appendChild(self, node):
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
    """Serialize a Beautiful Soup node to the indented "|" test format."""
    rv = []
    def serializeElement(element, indent=0):
        if isinstance(element, Declaration):
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
        elif isinstance(element, BeautifulSoup):
            # The soup object itself is the document (or fragment) root.
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")
            else:
                rv.append("#document")
        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
        elif isinstance(element, unicode):
            rv.append("|%s\"%s\"" %(' '*indent, element))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.name))
            if element.attrs:
                for name, value in element.attrs:
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        indent += 2
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent)
    serializeElement(element, 0)
    return "\n".join(rv)

View File

@ -0,0 +1,47 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
# Cache of TreeWalker classes keyed by (lowercased) tree type name.
treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

        "simpletree" - a built-in DOM-ish tree type with support for some
                       more pythonic idioms.
        "dom" - The xml.dom.minidom DOM implementation
        "pulldom" - The xml.dom.pulldom event stream
        "etree" - A generic walker for tree implementations exposing an
                  elementtree-like interface (known to work with
                  ElementTree, cElementTree and lxml.etree).
        "beautifulsoup" - Beautiful soup (if installed)
        "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree.

    Raises ValueError if treeType is not one of the supported names.
    """
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "etree":
            import etree
            treeWalkerCache[treeType] = etree.getETreeModule(
                implementation, **kwargs).TreeWalker
        else:
            # Fix: unknown names previously fell through and the function
            # quietly returned None. Fail fast with a clear message.
            raise ValueError("Unrecognised tree walker \"%s\"" % treeType)
    return treeWalkerCache.get(treeType)

View File

@ -0,0 +1,151 @@
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class TreeWalker(object):
    """Base class for tree walkers.

    Wraps a tree and provides the token-dict constructors shared by the
    concrete walkers; subclasses implement __iter__ to stream tokens.
    """

    def __init__(self, tree):
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Token reporting a serialization problem."""
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        """Coerce None / a mapping / a pair sequence into unicode pairs."""
        if not attrs:
            pairs = []
        elif hasattr(attrs, 'items'):
            pairs = attrs.items()
        else:
            pairs = attrs
        return [(unicode(name), unicode(value)) for name, value in pairs]

    def emptyTag(self, name, attrs, hasChildren=False):
        """Yield an EmptyTag token (plus an error if children were found)."""
        yield {"type": "EmptyTag", "name": unicode(name),
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            yield self.error(_("Void element has children"))

    def startTag(self, name, attrs):
        return {"type": "StartTag",
                "name": unicode(name),
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, name):
        return {"type": "EndTag", "name": unicode(name), "data": []}

    def text(self, data):
        """Split data into leading-space / body / trailing-space tokens."""
        data = unicode(data)
        stripped = data.lstrip(spaceCharacters)
        leading = data[:len(data) - len(stripped)]
        body = stripped.rstrip(spaceCharacters)
        trailing = stripped[len(body):]
        if leading:
            yield {"type": "SpaceCharacters", "data": leading}
        if body:
            yield {"type": "Characters", "data": body}
        if trailing:
            yield {"type": "SpaceCharacters", "data": trailing}

    def comment(self, data):
        return {"type": "Comment", "data": unicode(data)}

    def doctype(self, name):
        # "data" flags whether this is the (case-insensitive) HTML doctype.
        return {"type": "Doctype",
                "name": unicode(name),
                "data": name.upper() == "HTML"}

    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
    """TreeWalker variant for tree types that can recurse over children.

    Subclasses implement walkChildren(); element() then wraps the recursion
    in the correct start/end (or empty-tag) token stream.
    """
    def walkChildren(self, node):
        # Subclasses must yield tokens for node's children.
        # Fix: the original raised the misspelled name NodeImplementedError,
        # which is itself undefined and so surfaced as a NameError.
        raise NotImplementedError

    def element(self, node, name, attrs, hasChildren):
        if name in voidElements:
            # Void elements get a single EmptyTag token (plus an error token
            # if children were present where none are allowed).
            for token in self.emptyTag(name, attrs, hasChildren):
                yield token
        else:
            yield self.startTag(name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(name)
from xml.dom import Node

# Node-type tags returned by getNodeDetails in the non-recursive walkers;
# these mirror the xml.dom nodeType constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
# Sentinel for node types getNodeDetails does not recognise.
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
    """TreeWalker that traverses iteratively via first-child/next-sibling/
    parent navigation, avoiding recursion-depth limits on deep trees.

    Subclasses implement the four navigation hooks below; __iter__ then
    yields the token stream for the whole tree.
    """
    def getNodeDetails(self, node):
        # Return a tuple whose first item is one of the node-type constants
        # above, followed by that type's payload.
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            # NB: `type` shadows the builtin within this method.
            type, details = details[0], details[1:]
            hasChildren = False
            if type == DOCTYPE:
                yield self.doctype(*details)
            elif type == TEXT:
                for token in self.text(*details):
                    yield token
            elif type == ELEMENT:
                name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(name, attributes, hasChildren):
                        yield token
                    # Void elements never descend, even if malformed input
                    # gave them children.
                    hasChildren = False
                else:
                    yield self.startTag(name, attributes)
            elif type == COMMENT:
                yield self.comment(details[0])
            elif type == DOCUMENT:
                hasChildren = True
            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None
            if firstChild is not None:
                # Descend.
                currentNode = firstChild
            else:
                # No (more) children: emit end tags while climbing until a
                # next sibling is found or the root is reached.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(name)
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    if self.tree is currentNode:
                        # Back at the root: traversal complete.
                        currentNode = None
                    else:
                        currentNode = self.getParentNode(currentNode)

View File

@ -0,0 +1,37 @@
from xml.dom import Node
import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over xml.dom (minidom) trees."""
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.nodeName
        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue
        elif node.nodeType == Node.ELEMENT_NODE:
            # Fix: hasChildNodes is a method; the original passed the bound
            # method object itself, which is always truthy, so childless
            # elements were reported as having children. Call it.
            return (_base.ELEMENT, node.nodeName,
                    node.attributes.items(), node.hasChildNodes())
        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue
        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)
        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode

View File

@ -0,0 +1,112 @@
import gettext
_ = gettext.gettext
import new
import copy
import _base
from html5lib.constants import voidElements
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation):
    """Build a TreeWalker class bound to one ElementTree implementation.

    Returns ``locals()`` (containing ``TreeWalker``), which getETreeModule
    copies into a synthetic module's namespace.
    """
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. An Element node serving as *context* (it cannot be called the parent
           node due to the particular ``tail`` text nodes.

        2. Either the string literals ``"text"`` or ``"tail"`` or a child index

        3. A list used as a stack of all ancestor *context nodes*. It is a
           pair tuple whose first item is an Element and second item is a child
           index.
        """

        def getNodeDetails(self, node):
            # A tuple cursor may point at a text/tail string or a child index;
            # resolve it down to the actual Element before classifying.
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                if key in ("text", "tail"):
                    return _base.TEXT, getattr(elt, key)
                else:
                    node = elt[int(key)]

            # An ElementTree wrapper (rather than an Element) has no tag;
            # unwrap it to its root element.
            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)
            elif node.tag == "<!DOCTYPE>":
                return _base.DOCTYPE, node.text
            elif type(node.tag) == type(ElementTree.Comment):
                # Comment nodes carry a callable tag; compare types to detect them.
                return _base.COMMENT, node.text
            else:
                #This is assumed to be an ordinary element
                # hasChildren flag: either real child elements or leading text.
                return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text

        def getFirstChild(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                assert key not in ("text", "tail"), "Text nodes have no children"
                # Descend: remember where we came from on the ancestor stack.
                parents.append((elt, int(key)))
                node = elt[int(key)]
            else:
                parents = []

            assert len(node) or node.text, "Node has no children"
            # The element's leading text, when present, is its first "child".
            if node.text:
                return (node, "text", parents)
            else:
                return (node, 0, parents)

        def getNextSibling(self, node):
            assert isinstance(node, tuple), "Node is not a tuple: " + str(node)

            elt, key, parents = node
            if key == "text":
                # After leading text, the next sibling is child index 0 (key+1).
                key = -1
            elif key == "tail":
                # A tail belongs to the *previous* level; pop back up first.
                elt, key = parents.pop()
            else:
                # Look for "tail" of the "revisited" node
                child = elt[key]
                if child.tail:
                    parents.append((elt, key))
                    return (child, "tail", parents)

            # case where key were "text" or "tail" or elt[key] had a tail
            key += 1
            if len(elt) > key:
                return (elt, key, parents)
            else:
                return None

        def getParentNode(self, node):
            assert isinstance(node, tuple)
            elt, key, parents = node
            if parents:
                elt, key = parents.pop()
                return elt, key, parents
            else:
                # HACK: We could return ``elt`` but None will stop the algorithm the same way
                return None

    return locals()

View File

@ -0,0 +1,67 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
    """Tree walker over a genshi event stream.

    Looks ahead one event (``previous``/``event`` pairing) so that a START
    immediately followed by its matching END can be emitted as an EmptyTag,
    and the events nested inside a void element can be suppressed.
    """

    def __iter__(self):
        depth = 0
        # Depth at which an EmptyTag was emitted; events deeper than this are
        # skipped until the stream returns to that depth.
        ignore_until = None
        previous = None
        for event in NamespaceFlattener(prefixes={
            'http://www.w3.org/1999/xhtml': ''
        })(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                # NOTE: relies on Python 2's ``None <= int`` being False, so
                # this is a no-op while ignore_until is None.
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        # Flush the final buffered event (it has no lookahead successor).
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one genshi *event* into html5lib tokens.

        *next* is the lookahead event (or None at end of stream), used to
        decide whether a void element is really empty.
        """
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                # hasChildren flag: true unless the very next event closes
                # this same tag.
                for token in self.emptyTag(tag, list(attrib), \
                  not next or next[0] != END or next[1] != tag):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(data[0])

        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
          START_CDATA, END_CDATA, PI):
            # Deliberately ignored event kinds. (DOCTYPE here is unreachable —
            # it is handled by the branch above.)
            pass

        else:
            yield self.unknown(kind)

View File

@ -0,0 +1,52 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
    """Tree walker over an ``xml.dom.pulldom`` event stream.

    Buffers one event of lookahead so a START_ELEMENT of a void element can
    be emitted as an EmptyTag, and the matching END_ELEMENT (tracked via
    ``ignore_until``, which holds the node object itself) is swallowed.
    """

    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            # Emit the buffered event unless we are skipping the interior of
            # a void element; reaching the remembered node ends the skip.
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        # Flush the final buffered event (no lookahead successor).
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one pulldom *event* into html5lib tokens; *next* is the
        lookahead event (or None), used to flag non-empty void elements."""
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            if name in voidElements:
                # hasChildren flag: true unless the next event is this very
                # node's END_ELEMENT (identity comparison on the node).
                for token in self.emptyTag(name, \
                  node.attributes.items(), not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            if name not in voidElements:
                yield self.endTag(name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)

View File

@ -0,0 +1,72 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Return the walker detail tuple for *node* (a simpletree Node or
        a (parent, index, ancestor-stack) cursor tuple)."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)
        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name
        elif node.type == 4: # TextNode
            return _base.TEXT, node.value
        elif node.type == 5: # Element
            return (_base.ELEMENT, node.name,
                    node.attributes.items(), node.hasContent())
        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data
        else:
            # BUG FIX: the original returned ``_node.UNKNOWN`` — ``_node`` is
            # an undefined name, so any unknown node type raised NameError
            # instead of producing an UNKNOWN token.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            # Descend: remember where we came from on the ancestor stack.
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None

View File

@ -0,0 +1,36 @@
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over a BeautifulSoup parse tree."""

    def getNodeDetails(self, node):
        """Classify a BeautifulSoup node into a walker detail tuple.

        NOTE(review): the isinstance order matters — BeautifulSoup appears to
        be a Tag subclass, and Declaration/Comment appear to be unicode
        (NavigableString) subclasses, so the more specific checks must come
        first; confirm against the BeautifulSoup version in use.
        """
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            #Slice needed to remove markup added during unicode conversion
            return _base.DOCTYPE, unicode(node.string)[2:-1]

        elif isinstance(node, Comment):
            # Slice strips the "<!--" / "-->" markup added by unicode conversion.
            return _base.COMMENT, unicode(node.string)[4:-3]

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            # hasChildren flag is the (possibly empty, hence falsy) child list.
            return _base.ELEMENT, node.name, \
                   dict(node.attrs).items(), node.contents

        else:
            return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent