resync with html5lib (includes improved <pre> support)
This commit is contained in:
parent
32a1c49090
commit
bc33615ced
@ -1,3 +1,4 @@
|
||||
|
||||
# Differences from the current specification (23 December 2006) are as follows:
|
||||
# * Phases and insertion modes are one concept in parser.py.
|
||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||
@ -553,6 +554,10 @@ class InBodyPhase(Phase):
|
||||
# the crazy mode
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
#Keep a ref to this for special handling of whitespace in <pre>
|
||||
self.processSpaceCharactersNonPre = self.processSpaceCharacters
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("script", "style"), self.startTagScriptStyle),
|
||||
@ -622,6 +627,15 @@ class InBodyPhase(Phase):
|
||||
self.tree.openElements[-1])
|
||||
|
||||
# the real deal
|
||||
def processSpaceCharactersPre(self, data):
    """Handle the first run of space characters seen after a <pre> start tag.

    This handler is one-shot: it immediately restores the regular
    whitespace handler, then drops a single leading newline if the current
    node is a still-empty <pre>. Whatever text remains is inserted into
    the tree.
    """
    #Switch back to the normal handler before doing anything else
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if data.startswith("\n"):
        currentNode = self.tree.openElements[-1]
        if currentNode.name == "pre" and not currentNode.hasContent():
            #A newline directly after <pre> is not part of the content
            data = data[1:]
    if not data:
        return
    self.tree.insertText(data)
|
||||
|
||||
def processCharacters(self, data):
|
||||
# XXX The specification says to do this for every character at the
|
||||
# moment, but apparently that doesn't match the real world so we don't
|
||||
@ -651,6 +665,8 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope("p"):
|
||||
self.endTagP("p")
|
||||
self.tree.insertElement(name, attributes)
|
||||
if name == "pre":
|
||||
self.processSpaceCharacters = self.processSpaceCharactersPre
|
||||
|
||||
def startTagForm(self, name, attributes):
|
||||
if self.tree.formPointer:
|
||||
@ -849,6 +865,9 @@ class InBodyPhase(Phase):
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagBlock(self, name):
|
||||
#Put us back in the right whitespace handling mode
|
||||
if name == "pre":
|
||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||
inScope = self.tree.elementInScope(name)
|
||||
if inScope:
|
||||
self.tree.generateImpliedEndTags()
|
||||
|
@ -11,11 +11,6 @@ References:
|
||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
|
||||
@@TODO:
|
||||
* Produce SAX events based on the produced DOM. This is intended not to
|
||||
support streaming, but rather to support application level compatibility.
|
||||
* Optional namespace support
|
||||
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
|
||||
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
|
||||
* Selectively lowercase only XHTML, but not foreign markup
|
||||
"""
|
||||
|
||||
@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
|
||||
elif token["type"] == "Comment":
|
||||
# Rescue CDATA from the comments
|
||||
if (token["data"].startswith("[CDATA[") and
|
||||
token["data"].endswith("]]")):
|
||||
token["type"] = "Characters"
|
||||
token["data"] = token["data"][7:-2]
|
||||
|
||||
return token
|
||||
|
||||
class XHTMLParser(XMLParser):
|
||||
|
@ -1,5 +1,6 @@
|
||||
import _base
|
||||
from xml.dom import minidom, Node
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
import new
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
    """Create the backing minidom document and return this builder.

    A fresh, empty DOM document is stored on ``self.dom``; a ``hilite``
    method is then bound onto that document instance. Returns ``self``.
    """
    implementation = minidom.getDOMImplementation()
    self.dom = implementation.createDocument(None, None, None)

    # NOTE(review): this helper only prints a fixed string and looks like
    # leftover debugging; confirm whether anything calls dom.hilite().
    def hilite(self, encoding):
        print('foo')

    self.dom.hilite = new.instancemethod(hilite, self.dom,
                                         self.dom.__class__)
    return self
|
||||
|
||||
def doctypeClass(self,name):
|
||||
@ -129,3 +134,58 @@ def testSerializer(element):
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    """Replay a DOM node and its descendants as SAX events on *handler*.

    node    -- a DOM node (document, element, text or CDATA section);
               any other node type is silently ignored.
    handler -- an object providing the SAX ContentHandler methods used
               below (startDocument/endDocument, startElement/endElement
               or their NS variants, start/endPrefixMapping, characters).
    nsmap   -- mapping of prefix -> namespace URI in scope. A false value
               (e.g. ``{}``) selects namespace-unaware events
               (startElement/endElement); otherwise namespace-aware
               events are produced. The default mapping is never mutated:
               it is copied before any declaration is recorded.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
            handler.startElement(node.nodeName, node.attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElement(node.nodeName)
        else:
            attributes = dict(node.attributes.itemsNS())
            attrNodes = [node.getAttributeNode(attrname)
                         for attrname in node.attributes.keys()]

            # gather namespace declarations
            prefixes = []
            for attr in attrNodes:
                # Only "xmlns" and "xmlns:<prefix>" are declarations; a
                # plain attribute that merely starts with "xmlns" (such as
                # "xmlnsfoo") must be passed through as ordinary data.
                isDeclaration = (
                    attr.namespaceURI == XMLNS_NAMESPACE or
                    (attr.namespaceURI is None and
                     (attr.nodeName == 'xmlns' or
                      attr.nodeName.startswith('xmlns:'))))
                if not isDeclaration:
                    continue
                # The default namespace is reported with prefix None
                prefix = attr.localName
                if prefix == 'xmlns' or not prefix:
                    prefix = None
                handler.startPrefixMapping(prefix, attr.nodeValue)
                prefixes.append(prefix)
                # Copy so the declaration stays scoped to this subtree
                nsmap = nsmap.copy()
                nsmap[prefix] = attr.nodeValue
                del attributes[(attr.namespaceURI, attr.localName)]

            # apply namespace declarations
            for attr in attrNodes:
                if attr.namespaceURI is None and ':' in attr.nodeName:
                    prefix = attr.nodeName.split(':')[0]
                    if prefix in nsmap:
                        del attributes[(attr.namespaceURI, attr.localName)]
                        attributes[(nsmap[prefix], attr.localName)] = \
                            attr.nodeValue

            # SAX events
            ns = node.namespaceURI or nsmap.get(None, None)
            handler.startElementNS((ns, node.nodeName), node.nodeName,
                                   attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElementNS((ns, node.nodeName), node.nodeName)
            for prefix in prefixes:
                handler.endPrefixMapping(prefix)

    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
        handler.characters(node.nodeValue)

    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)
        handler.endDocument()

    else:
        # ATTRIBUTE_NODE
        # ENTITY_NODE
        # PROCESSING_INSTRUCTION_NODE
        # COMMENT_NODE
        # DOCUMENT_TYPE_NODE
        # NOTATION_NODE
        pass
|
||||
|
@ -1,208 +1,5 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
from elementtree import ElementTree
|
||||
|
||||
import _base
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name):
|
||||
self._element = ElementTree.Element(name)
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
#Set the element text and tail to the empty string rather than None
|
||||
#XXX - is this desirable or should we do it on a case by case basis?
|
||||
self._element.text = ""
|
||||
self._element.tail = ""
|
||||
|
||||
def _setName(self, name):
|
||||
self._element.tag = name
|
||||
|
||||
def _getName(self):
|
||||
return self._element.tag
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
#Delete existing attributes first
|
||||
#XXX - there may be a better way to do this...
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
self._element.set(key, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or self._element.getchildren())
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self._element.getchildren().index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._element.remove(node._element)
|
||||
node.parent=None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
#Insert the text as the tail of the last child element
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
#Insert the text before the specified node
|
||||
children = self._element.getchildren()
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
self._element[index-1].tail += data
|
||||
else:
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = Element(self.name)
|
||||
element.attributes = self.attributes
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
Element.__init__(self, Comment)
|
||||
self._element.text = data
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name):
|
||||
Element.__init__(self, DocumentType)
|
||||
self._element.text = name
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, Document)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element, indent=0):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||
elif element.tag is Document:
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
elif element.tag is Comment:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag is Document:
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
elif element.tag is Comment:
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
import etreefull
|
||||
|
||||
class TreeBuilder(etreefull.TreeBuilder):
|
||||
def getDocument(self):
|
||||
return self.document._element
|
||||
return self.document._element.find("html")
|
||||
|
216
planet/html5lib/treebuilders/etreefull.py
Normal file
216
planet/html5lib/treebuilders/etreefull.py
Normal file
@ -0,0 +1,216 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
from elementtree import ElementTree
|
||||
|
||||
import _base
|
||||
|
||||
class Element(_base.Node):
    """Tree-builder node that wraps an ElementTree element.

    The wrapped element lives in ``self._element``. Text is stored the
    ElementTree way: in the element's ``text`` or in the ``tail`` of the
    preceding child. ``self._childNodes`` holds the wrapper objects for the
    element's children, in the same order as the wrapped children.
    """

    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Snapshot the new values first: the caller may hand us our own
        #attribute dict, and clearing it would otherwise lose everything
        replacement = dict(attributes)
        #Delete existing attributes without mutating the dict while
        #iterating over it
        self._element.attrib.clear()
        for key, value in replacement.items():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            #This class defines appendChild, not insertChild
            self.appendChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or len(self._element))

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        """Insert node immediately before refNode, an existing child."""
        index = list(self._element).index(refNode._element)
        self._element.insert(index, node._element)
        #Keep the wrapper list in step with the wrapped element
        self._childNodes.insert(index, node)
        node.parent = self

    def removeChild(self, node):
        self._element.remove(node._element)
        #Keep the wrapper list in step with the wrapped element
        if node in self._childNodes:
            self._childNodes.remove(node)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        """Add text to this node, optionally before a given child node.

        With no children the text extends this element's own text. With
        children it extends the tail of the last child, or of the child
        preceding ``insertBefore`` when that is given.
        """
        if not len(self._element):
            self._element.text = (self._element.text or "") + data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            last = self._element[-1]
            last.tail = (last.tail or "") + data
        else:
            #Insert the text before the specified node
            index = list(self._element).index(insertBefore._element)
            if index > 0:
                previous = self._element[index - 1]
                previous.tail = (previous.tail or "") + data
            else:
                self._element.text = (self._element.text or "") + data

    def cloneNode(self):
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        """Move this node's text and children onto newParent."""
        #text and tail start out as None, so guard both sides of the
        #concatenation
        text = self._element.text or ""
        if newParent.childNodes:
            last = newParent.childNodes[-1]._element
            last.tail = (last.tail or "") + text
        else:
            newParent._element.text = (newParent._element.text or "") + text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
    """Wrapper node for a comment, backed by an ElementTree Comment."""

    def __init__(self, data):
        #Run the Element constructor first so that every wrapper attribute
        #exists, then swap in a real ElementTree comment element
        Element.__init__(self, None)
        self._element = ElementTree.Comment(data)

    def _setData(self, value):
        self._element.text = value

    def _getData(self):
        return self._element.text

    #The comment text lives in the wrapped element's text
    data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
    """Wrapper node for a doctype.

    The class object itself is used as the element tag, which is how the
    serializers recognise doctype nodes; the doctype name is kept in the
    element text.
    """

    def __init__(self, name):
        Element.__init__(self, DocumentType)
        self._element.text = name
|
||||
|
||||
class Document(Element):
    """Wrapper node for the document root.

    The class object itself is used as the element tag, which is how the
    serializers recognise the document node.
    """

    def __init__(self):
        Element.__init__(self, Document)
|
||||
|
||||
def testSerializer(element):
    """Return an indented text dump of an ElementTree element and its subtree.

    Each output line starts with "|" followed by indentation; the document
    node is written as "#document". Doctypes, comments, elements, their
    attributes and text each get their own line. Lines are joined with
    newlines.
    """
    rv = []

    def serializeElement(element, indent=0):
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
            #The document's tail needs no special case here: the generic
            #tail handling at the end of this function emits it
        elif element.tag is ElementTree.Comment:
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in list(element):
            serializeElement(child, indent)
        if element.tail:
            #Tail text belongs to the parent, so it is written at this
            #element's own indentation level
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)
|
||||
|
||||
def tostring(element):
    """Serialize an element and its child nodes to a string

    Doctypes are written as <!DOCTYPE ...>, comments as <!--...-->, and
    ordinary elements as start tag, text, children and end tag. The tail
    text of every node is written after it. No escaping is applied to
    text or attribute values.
    """
    rv = []

    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            #The document's tail needs no special case here: the generic
            #tail handling at the end of this function emits it

            for child in list(element):
                serializeElement(child)

        elif element.tag is ElementTree.Comment:
            rv.append("<!--%s-->"%(element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.items()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in list(element):
                serializeElement(child)

            rv.append("</%s>"%(element.tag,))

        if element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose nodes are the ElementTree wrappers in this module."""

    #Node classes the base builder instantiates
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment

    def testSerializer(self, element):
        """Delegate to the module-level testSerializer function."""
        return testSerializer(element)

    def getDocument(self):
        """Return the ElementTree element wrapped by the document node."""
        document = self.document
        return document._element
|
Loading…
x
Reference in New Issue
Block a user