resync with html5lib (includes improved <pre> support)

This commit is contained in:
Sam Ruby 2007-01-26 19:22:35 -05:00
parent 32a1c49090
commit bc33615ced
5 changed files with 306 additions and 212 deletions

View File

@ -1,3 +1,4 @@
# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
@ -553,6 +554,10 @@ class InBodyPhase(Phase):
# the crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
#Keep a ref to this for special handling of whitespace in <pre>
self.processSpaceCharactersNonPre = self.processSpaceCharacters
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("script", "style"), self.startTagScriptStyle),
@ -622,6 +627,15 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
def processSpaceCharactersPre(self, data):
    """Handle the whitespace that arrives while the <pre> handler is active.

    The handler is one-shot: it first puts the regular whitespace handler
    back in place.  When ``data`` begins with a newline and the current
    node is a <pre> that has no content yet, that single newline is
    dropped.  Whatever text is left (if any) is inserted into the tree.
    """
    # Switch back to normal whitespace handling before anything else.
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if data.startswith("\n"):
        current = self.tree.openElements[-1]
        if current.name == "pre" and not current.hasContent():
            #Sometimes (start of <pre> blocks) we want to drop leading newlines
            data = data[1:]
    if not data:
        return
    self.tree.insertText(data)
def processCharacters(self, data):
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
@ -651,6 +665,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"):
self.endTagP("p")
self.tree.insertElement(name, attributes)
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersPre
def startTagForm(self, name, attributes):
if self.tree.formPointer:
@ -849,6 +865,9 @@ class InBodyPhase(Phase):
self.parser.phase.processEndTag(name)
def endTagBlock(self, name):
#Put us back in the right whitespace handling mode
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersNonPre
inScope = self.tree.elementInScope(name)
if inScope:
self.tree.generateImpliedEndTags()

View File

@ -11,11 +11,6 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
* Selectively lowercase only XHTML, but not foreign markup
"""
@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
class XHTMLParser(XMLParser):

View File

@ -1,5 +1,6 @@
import _base
from xml.dom import minidom, Node
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder):
# Create a fresh, empty minidom Document, store it on the builder as
# ``self.dom`` and return the builder itself (not the Document).
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
# NOTE(review): ``hilite`` only prints 'foo' and ignores ``encoding``; it
# looks like leftover debugging scaffolding -- confirm whether anything
# depends on ``dom.hilite`` before keeping or removing it.
def hilite(self, encoding):
print 'foo'
# Bind ``hilite`` as a method of this particular Document instance.
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
setattr(self.dom, 'hilite', method)
return self
def doctypeClass(self,name):
@ -129,3 +134,58 @@ def testSerializer(element):
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    """Walk a DOM subtree and replay it as SAX events on ``handler``.

    node    -- DOM node to start from.  Documents, elements, text nodes and
               CDATA sections produce events; every other node type is
               silently skipped.
    handler -- object receiving the SAX ContentHandler-style calls.
    nsmap   -- mapping of prefix -> namespace URI currently in scope.  A
               false value (for example ``{}``) selects the plain
               ``startElement``/``endElement`` events; otherwise the
               namespace-aware ``*NS`` events are produced.  The mapping is
               copied before it is extended, so the shared default
               dictionary is never modified.
    """
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
            handler.startElement(node.nodeName, node.attributes)
            for child in node.childNodes:
                dom2sax(child, handler, nsmap)
            handler.endElement(node.nodeName)
            return

        attributes = dict(node.attributes.itemsNS())
        attrs = [node.getAttributeNode(attrname)
                 for attrname in list(node.attributes.keys())]

        # gather namespace declarations
        prefixes = []
        declared = []      # nodeNames already consumed as declarations
        copied = False
        for attr in attrs:
            # Only ``xmlns`` itself or ``xmlns:prefix`` is a declaration; a
            # bare startswith('xmlns') test would also swallow ordinary
            # attributes such as ``xmlnsfoo``.
            is_declaration = (
                attr.namespaceURI == XMLNS_NAMESPACE or
                (attr.namespaceURI is None and
                 (attr.nodeName == 'xmlns' or
                  attr.nodeName.startswith('xmlns:'))))
            if not is_declaration:
                continue
            if attr.localName != 'xmlns':
                prefix = attr.localName or None
            else:
                prefix = None
            handler.startPrefixMapping(prefix, attr.nodeValue)
            prefixes.append(prefix)
            if not copied:
                # Never extend the caller's (or the default) mapping.
                nsmap = nsmap.copy()
                copied = True
            nsmap[prefix] = attr.nodeValue
            del attributes[(attr.namespaceURI, attr.localName)]
            declared.append(attr.nodeName)

        # apply namespace declarations
        for attr in attrs:
            if attr.nodeName in declared:
                # Already removed from ``attributes`` above.
                continue
            if attr.namespaceURI is None and ':' in attr.nodeName:
                prefix = attr.nodeName.split(':')[0]
                if prefix in nsmap:
                    del attributes[(attr.namespaceURI, attr.localName)]
                    attributes[(nsmap[prefix], attr.localName)] = attr.nodeValue

        # SAX events
        ns = node.namespaceURI or nsmap.get(None, None)
        handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)
        handler.endElementNS((ns, node.nodeName), node.nodeName)
        for prefix in prefixes:
            handler.endPrefixMapping(prefix)

    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
        handler.characters(node.nodeValue)

    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes:
            dom2sax(child, handler, nsmap)
        handler.endDocument()

    else:
        # ATTRIBUTE_NODE
        # ENTITY_NODE
        # PROCESSING_INSTRUCTION_NODE
        # COMMENT_NODE
        # DOCUMENT_TYPE_NODE
        # NOTATION_NODE
        pass

View File

@ -1,208 +1,5 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
    """Tree-builder node that wraps an ``ElementTree.Element``.

    The wrapper objects of the children are tracked in ``_childNodes``
    while the wrapped elements live in ``_element``; every mutator keeps
    the two in step.  Text is stored ElementTree-style: in ``text`` of the
    parent (before the first child) or in ``tail`` of the preceding child.
    """

    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []
        #Set the element text and tail to the empty string rather than None
        #XXX - is this desirable or should we do it on a case by case basis?
        self._element.text = ""
        self._element.tail = ""

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first.  clear() avoids removing keys
        #from the dictionary while iterating over it.
        self._element.attrib.clear()
        for key, value in attributes.items():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            # appendChild is the method of this class that updates both
            # _childNodes and the wrapped element; no insertChild is
            # defined here.
            self.appendChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or len(self._element))

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        """Insert ``node`` immediately before ``refNode``."""
        index = list(self._element).index(refNode._element)
        self._element.insert(index, node._element)
        # Mirror the insertion in the wrapper list so childNodes stays
        # consistent with the wrapped element.
        try:
            position = self._childNodes.index(refNode)
        except ValueError:
            position = index
        self._childNodes.insert(position, node)
        node.parent = self

    def removeChild(self, node):
        """Detach ``node`` from this element."""
        self._element.remove(node._element)
        # Drop the wrapper too, otherwise childNodes keeps reporting it.
        if node in self._childNodes:
            self._childNodes.remove(node)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        """Add ``data`` at the end, or just before the ``insertBefore`` node."""
        if not len(self._element):
            self._element.text = (self._element.text or "") + data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            last = self._element[-1]
            last.tail = (last.tail or "") + data
        else:
            #Insert the text before the specified node
            index = list(self._element).index(insertBefore._element)
            if index > 0:
                previous = self._element[index - 1]
                previous.tail = (previous.tail or "") + data
            else:
                self._element.text = (self._element.text or "") + data

    def cloneNode(self):
        """Return a new Element with the same name and attributes (no children)."""
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        """Move this node's leading text and its children to ``newParent``."""
        text = self._element.text or ""
        if newParent.childNodes:
            last = newParent.childNodes[-1]._element
            last.tail = (last.tail or "") + text
        else:
            newParent._element.text = (newParent._element.text or "") + text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
class Comment(Element):
    """Comment node: an Element tagged with the Comment class itself.

    The comment text is kept in the wrapped element's ``text``.
    """

    def __init__(self, data):
        Element.__init__(self, Comment)
        self._element.text = data

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    data = property(fget=_getData, fset=_setData)
class DocumentType(Element):
    """Doctype node: an Element tagged with the DocumentType class itself.

    The doctype name is kept in the wrapped element's ``text``.
    """

    def __init__(self, name):
        Element.__init__(self, DocumentType)
        wrapped = self._element
        wrapped.text = name
class Document(Element):
    """Document node: an Element tagged with the Document class itself."""

    def __init__(self):
        marker = Document
        Element.__init__(self, marker)
def testSerializer(element):
    """Return the indented, '|'-prefixed test rendering of ``element``.

    Each node becomes one line; children are indented two spaces deeper
    than their parent and the lines are joined with newlines.
    """
    rv = []

    def serializeElement(element, indent=0):
        pad = ' ' * indent
        inner = ' ' * (indent + 2)
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>" % (pad, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\"" % (inner, element.text))
            # The document's tail is emitted by the generic tail handling
            # at the end of this function.  The former ``finalText``
            # bookkeeping only ever assigned a local of this nested
            # function, so it never had any effect and has been removed.
        elif element.tag is Comment:
            rv.append("|%s<!-- %s -->" % (pad, element.text))
        else:
            rv.append("|%s<%s>" % (pad, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (inner, name, value))
            if element.text:
                rv.append("|%s\"%s\"" % (inner, element.text))
        for child in list(element):
            serializeElement(child, indent + 2)
        if element.tail:
            # Tail text follows the element, at the element's own depth.
            rv.append("|%s\"%s\"" % (pad, element.tail))

    serializeElement(element, 0)
    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>" % (element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            # The document's tail is written by the generic tail handling
            # below.  The former ``finalText`` bookkeeping only assigned a
            # local of this nested function, so the outer value stayed None
            # and the trailing branch (whose format string was malformed)
            # could never run; it has been removed.
            for child in list(element):
                serializeElement(child)
        elif element.tag is Comment:
            rv.append("<!--%s-->" % (element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in list(element):
                serializeElement(child)
            rv.append("</%s>" % (element.tag,))
        if element.tail:
            rv.append(element.tail)

    serializeElement(element)
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose node classes are the ElementTree wrappers above."""

    # Node classes used for each kind of node.
    commentClass = Comment
    elementClass = Element
    doctypeClass = DocumentType
    documentClass = Document

    def testSerializer(self, element):
        """Delegate to the module-level testSerializer()."""
        rendering = testSerializer(element)
        return rendering
import etreefull
# Specialisation of etreefull.TreeBuilder that only overrides getDocument().
class TreeBuilder(etreefull.TreeBuilder):
# Return the ElementTree element wrapped by the builder's document node.
def getDocument(self):
return self.document._element
# NOTE(review): this second return is unreachable as written. The two
# statements look like the old and new versions of one line -- confirm
# which result (the wrapped document element or its "html" child) is
# intended.
return self.document._element.find("html")

View File

@ -0,0 +1,216 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
    """Tree-builder node that wraps an ``ElementTree.Element``.

    The wrapper objects of the children are tracked in ``_childNodes``
    while the wrapped elements live in ``_element``; every mutator keeps
    the two in step.  Text is stored ElementTree-style: in ``text`` of the
    parent (before the first child) or in ``tail`` of the preceding child.
    ``text`` and ``tail`` start out as None, so every place that appends
    text treats None as the empty string.
    """

    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first.  clear() avoids removing keys
        #from the dictionary while iterating over it.
        self._element.attrib.clear()
        for key, value in attributes.items():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            # appendChild is the method of this class that updates both
            # _childNodes and the wrapped element; no insertChild is
            # defined here.
            self.appendChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or len(self._element))

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        """Insert ``node`` immediately before ``refNode``."""
        index = list(self._element).index(refNode._element)
        self._element.insert(index, node._element)
        # Mirror the insertion in the wrapper list so childNodes stays
        # consistent with the wrapped element.
        try:
            position = self._childNodes.index(refNode)
        except ValueError:
            position = index
        self._childNodes.insert(position, node)
        node.parent = self

    def removeChild(self, node):
        """Detach ``node`` from this element."""
        self._element.remove(node._element)
        # Drop the wrapper too, otherwise childNodes keeps reporting it.
        if node in self._childNodes:
            self._childNodes.remove(node)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        """Add ``data`` at the end, or just before the ``insertBefore`` node."""
        if not len(self._element):
            self._element.text = (self._element.text or "") + data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            last = self._element[-1]
            last.tail = (last.tail or "") + data
        else:
            #Insert the text before the specified node
            index = list(self._element).index(insertBefore._element)
            if index > 0:
                previous = self._element[index - 1]
                previous.tail = (previous.tail or "") + data
            else:
                self._element.text = (self._element.text or "") + data

    def cloneNode(self):
        """Return a new Element with the same name and attributes (no children)."""
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        """Move this node's leading text and its children to ``newParent``."""
        # Either side may still be None because __init__ does not
        # initialise text/tail; treat None as "".
        text = self._element.text or ""
        if newParent.childNodes:
            last = newParent.childNodes[-1]._element
            last.tail = (last.tail or "") + text
        else:
            newParent._element.text = (newParent._element.text or "") + text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
class Comment(Element):
    """Comment node backed by an ``ElementTree.Comment`` element."""

    def __init__(self, data):
        #Run the superclass constructor first so that all properties of
        #the wrapper are set, then swap in the real comment element
        Element.__init__(self, None)
        comment = ElementTree.Comment(data)
        self._element = comment

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    data = property(fget=_getData, fset=_setData)
class DocumentType(Element):
    """Doctype node: an Element tagged with the DocumentType class itself.

    The doctype name is kept in the wrapped element's ``text``.
    """

    def __init__(self, name):
        Element.__init__(self, DocumentType)
        wrapped = self._element
        wrapped.text = name
class Document(Element):
    """Document node: an Element tagged with the Document class itself."""

    def __init__(self):
        marker = Document
        Element.__init__(self, marker)
def testSerializer(element):
    """Return the indented, '|'-prefixed test rendering of ``element``.

    Each node becomes one line; children are indented two spaces deeper
    than their parent and the lines are joined with newlines.
    """
    rv = []

    def serializeElement(element, indent=0):
        pad = ' ' * indent
        inner = ' ' * (indent + 2)
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>" % (pad, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\"" % (inner, element.text))
            # The document's tail is emitted by the generic tail handling
            # at the end of this function.  The former ``finalText``
            # bookkeeping only ever assigned a local of this nested
            # function, so it never had any effect and has been removed.
        elif element.tag is ElementTree.Comment:
            rv.append("|%s<!-- %s -->" % (pad, element.text))
        else:
            rv.append("|%s<%s>" % (pad, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (inner, name, value))
            if element.text:
                rv.append("|%s\"%s\"" % (inner, element.text))
        for child in list(element):
            serializeElement(child, indent + 2)
        if element.tail:
            # Tail text follows the element, at the element's own depth.
            rv.append("|%s\"%s\"" % (pad, element.tail))

    serializeElement(element, 0)
    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>" % (element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            # The document's tail is written by the generic tail handling
            # below.  The former ``finalText`` bookkeeping only assigned a
            # local of this nested function, so the outer value stayed None
            # and the trailing branch (whose format string was malformed)
            # could never run; it has been removed.
            for child in list(element):
                serializeElement(child)
        elif element.tag is ElementTree.Comment:
            rv.append("<!--%s-->" % (element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in list(element):
                serializeElement(child)
            rv.append("</%s>" % (element.tag,))
        if element.tail:
            rv.append(element.tail)

    serializeElement(element)
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose node classes are the ElementTree wrappers above."""

    # Node classes used for each kind of node.
    commentClass = Comment
    elementClass = Element
    doctypeClass = DocumentType
    documentClass = Document

    def testSerializer(self, element):
        """Delegate to the module-level testSerializer()."""
        rendering = testSerializer(element)
        return rendering

    def getDocument(self):
        """Return the ElementTree element wrapped by the document node."""
        document = self.document
        return document._element