Reconstructed html5lib update patch, one diff line per row.

Review notes:
  * The collapsed copy had lost its line structure and every markup-like
    substring.  Indentation, the position of blank context lines, and the
    strings <pre>, <html>, <head>, <body>, <![CDATA[]]>, <!DOCTYPE %s>,
    <!-- %s --> / <!--%s--> and </%s> are inferred from context and from
    the hunk counts; confirm them against the repository before applying.
  * dom.py: the debugging 'hilite' stub (its body was only "print 'foo'")
    and the 'import new' it required are no longer added.
  * etreefull.py: Element.reparentChildren no longer raises TypeError when
    text or tail is still None.
  * The index lines for dom.py and etreefull.py are omitted because their
    post-image content differs from the recorded blob ids.

diff --git a/planet/html5lib/html5parser.py b/planet/html5lib/html5parser.py
index e075001..6fe28a8 100644
--- a/planet/html5lib/html5parser.py
+++ b/planet/html5lib/html5parser.py
@@ -1,3 +1,4 @@
+
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
@@ -553,6 +554,10 @@ class InBodyPhase(Phase):
     # the crazy mode
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
+
+        #Keep a ref to this for special handling of whitespace in <pre>
+        self.processSpaceCharactersNonPre = self.processSpaceCharacters
+
         self.startTagHandler = utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("script", "style"), self.startTagScriptStyle),
@@ -622,6 +627,15 @@ class InBodyPhase(Phase):
                 self.tree.openElements[-1])
 
     # the real deal
+    def processSpaceCharactersPre(self, data):
+        #Sometimes (start of <pre> blocks) we want to drop leading newlines
+        self.processSpaceCharacters = self.processSpaceCharactersNonPre
+        if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
+            and not self.tree.openElements[-1].hasContent()):
+            data = data[1:]
+        if data:
+            self.tree.insertText(data)
+
     def processCharacters(self, data):
         # XXX The specification says to do this for every character at the
         # moment, but apparently that doesn't match the real world so we don't
@@ -651,6 +665,8 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.endTagP("p")
         self.tree.insertElement(name, attributes)
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersPre
 
     def startTagForm(self, name, attributes):
         if self.tree.formPointer:
@@ -849,6 +865,9 @@ class InBodyPhase(Phase):
             self.parser.phase.processEndTag(name)
 
     def endTagBlock(self, name):
+        #Put us back in the right whitespace handling mode
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
         inScope = self.tree.elementInScope(name)
         if inScope:
             self.tree.generateImpliedEndTags()
diff --git a/planet/html5lib/liberalxmlparser.py b/planet/html5lib/liberalxmlparser.py
index a3d98cf..4c7a660 100644
--- a/planet/html5lib/liberalxmlparser.py
+++ b/planet/html5lib/liberalxmlparser.py
@@ -11,11 +11,6 @@ References:
  * http://wiki.whatwg.org/wiki/HtmlVsXhtml
 
 @@TODO:
- * Produce SAX events based on the produced DOM. This is intended not to
-   support streaming, but rather to support application level compatibility.
- * Optional namespace support
- * Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
-   indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
  * Selectively lowercase only XHTML, but not foreign markup
 """
 
@@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
             if token["data"]:
                 self.parseError(_("End tag contains unexpected attributes."))
 
+        elif token["type"] == "Comment":
+            # Rescue CDATA from the comments
+            if (token["data"].startswith("[CDATA[") and
+                token["data"].endswith("]]")):
+                token["type"] = "Characters"
+                token["data"] = token["data"][7:-2]
+
         return token
 
 class XHTMLParser(XMLParser):
diff --git a/planet/html5lib/treebuilders/dom.py b/planet/html5lib/treebuilders/dom.py
--- a/planet/html5lib/treebuilders/dom.py
+++ b/planet/html5lib/treebuilders/dom.py
@@ -1,5 +1,5 @@
 import _base
-from xml.dom import minidom, Node
+from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
 
 import re
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -129,3 +129,65 @@ def testSerializer(element):
     serializeElement(element, 0)
 
     return "\n".join(rv)
+
+def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
+    """Recursively replay a DOM node as SAX events on handler.
+
+    An empty nsmap selects plain startElement/endElement events; otherwise
+    xmlns declarations become prefix mappings and elements are reported
+    through startElementNS/endElementNS.  nsmap is copied before a new
+    prefix is recorded, so the shared default dict is never modified.
+    """
+    if node.nodeType == Node.ELEMENT_NODE:
+        if not nsmap:
+            handler.startElement(node.nodeName, node.attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElement(node.nodeName)
+        else:
+            attributes = dict(node.attributes.itemsNS())
+
+            # gather namespace declarations
+            prefixes = []
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if (attr.namespaceURI == XMLNS_NAMESPACE or
+                    (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
+                    prefix = (attr.localName != 'xmlns' and attr.localName or None)
+                    handler.startPrefixMapping(prefix, attr.nodeValue)
+                    prefixes.append(prefix)
+                    nsmap = nsmap.copy()
+                    nsmap[prefix] = attr.nodeValue
+                    del attributes[(attr.namespaceURI, attr.localName)]
+
+            # apply namespace declarations
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if attr.namespaceURI == None and ':' in attr.nodeName:
+                    prefix = attr.nodeName.split(':')[0]
+                    if nsmap.has_key(prefix):
+                        del attributes[(attr.namespaceURI, attr.localName)]
+                        attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+
+            # SAX events
+            ns = node.namespaceURI or nsmap.get(None,None)
+            handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElementNS((ns, node.nodeName), node.nodeName)
+            for prefix in prefixes: handler.endPrefixMapping(prefix)
+
+    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
+        handler.characters(node.nodeValue)
+
+    elif node.nodeType == Node.DOCUMENT_NODE:
+        handler.startDocument()
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+        handler.endDocument()
+
+    else:
+        # ATTRIBUTE_NODE
+        # ENTITY_NODE
+        # PROCESSING_INSTRUCTION_NODE
+        # COMMENT_NODE
+        # DOCUMENT_TYPE_NODE
+        # NOTATION_NODE
+        pass
diff --git a/planet/html5lib/treebuilders/etree.py b/planet/html5lib/treebuilders/etree.py
index 1d1c5e6..5af468b 100755
--- a/planet/html5lib/treebuilders/etree.py
+++ b/planet/html5lib/treebuilders/etree.py
@@ -1,208 +1,5 @@
-try:
-    from xml.etree import ElementTree
-except ImportError:
-    from elementtree import ElementTree
-
-import _base
-
-class Element(_base.Node):
-    def __init__(self, name):
-        self._element = ElementTree.Element(name)
-        self.name = name
-        self.parent = None
-        self._childNodes = []
-        self._flags = []
-
-        #Set the element text and tail to the empty string rather than None
-        #XXX - is this desirable or should we do it on a case by case basis?
-        self._element.text = ""
-        self._element.tail = ""
-
-    def _setName(self, name):
-        self._element.tag = name
-
-    def _getName(self):
-        return self._element.tag
-
-    name = property(_getName, _setName)
-
-    def _getAttributes(self):
-        return self._element.attrib
-
-    def _setAttributes(self, attributes):
-        #Delete existing attributes first
-        #XXX - there may be a better way to do this...
-        for key in self._element.attrib.keys():
-            del self._element.attrib[key]
-        for key, value in attributes.iteritems():
-            self._element.set(key, value)
-
-    attributes = property(_getAttributes, _setAttributes)
-
-    def _getChildNodes(self):
-        return self._childNodes
-
-    def _setChildNodes(self, value):
-        del self._element[:]
-        self._childNodes = []
-        for element in value:
-            self.insertChild(element)
-
-    childNodes = property(_getChildNodes, _setChildNodes)
-
-    def hasContent(self):
-        """Return true if the node has children or text"""
-        return bool(self._element.text or self._element.getchildren())
-
-    def appendChild(self, node):
-        self._childNodes.append(node)
-        self._element.append(node._element)
-        node.parent = self
-
-    def insertBefore(self, node, refNode):
-        index = self._element.getchildren().index(refNode._element)
-        self._element.insert(index, node._element)
-        node.parent = self
-
-    def removeChild(self, node):
-        self._element.remove(node._element)
-        node.parent=None
-
-    def insertText(self, data, insertBefore=None):
-        if not(len(self._element)):
-            self._element.text += data
-        elif insertBefore is None:
-            #Insert the text as the tail of the last child element
-            self._element[-1].tail += data
-        else:
-            #Insert the text before the specified node
-            children = self._element.getchildren()
-            index = children.index(insertBefore._element)
-            if index > 0:
-                self._element[index-1].tail += data
-            else:
-                self._element.text += data
-
-    def cloneNode(self):
-        element = Element(self.name)
-        element.attributes = self.attributes
-        return element
-
-    def reparentChildren(self, newParent):
-        if newParent.childNodes:
-            newParent.childNodes[-1]._element.tail += self._element.text
-        else:
-            newParent._element.text += self._element.text
-        self._element.text = ""
-        _base.Node.reparentChildren(self, newParent)
-
-class Comment(Element):
-    def __init__(self, data):
-        Element.__init__(self, Comment)
-        self._element.text = data
-
-    def _getData(self):
-        return self._element.text
-
-    def _setData(self, value):
-        self._element.text = value
-
-    data = property(_getData, _setData)
-
-class DocumentType(Element):
-    def __init__(self, name):
-        Element.__init__(self, DocumentType)
-        self._element.text = name
-
-class Document(Element):
-    def __init__(self):
-        Element.__init__(self, Document)
-
-def testSerializer(element):
-    rv = []
-    finalText = None
-    def serializeElement(element, indent=0):
-        if element.tag is DocumentType:
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
-        elif element.tag is Document:
-            rv.append("#document")
-            if element.text:
-                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
-            if element.tail:
-                finalText = element.tail
-        elif element.tag is Comment:
-            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
-        else:
-            rv.append("|%s<%s>"%(' '*indent, element.tag))
-            if hasattr(element, "attrib"):
-                for name, value in element.attrib.iteritems():
-                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
-            if element.text:
-                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
-        indent += 2
-        for child in element.getchildren():
-            serializeElement(child, indent)
-        if element.tail:
-            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
-    serializeElement(element, 0)
-
-    if finalText is not None:
-        rv.append("|%s\"%s\""%(' '*2, finalText))
-
-    return "\n".join(rv)
-
-def tostring(element):
-    """Serialize an element and its child nodes to a string"""
-    rv = []
-    finalText = None
-    def serializeElement(element):
-        if element.tag is DocumentType:
-            rv.append("<!DOCTYPE %s>"%(element.text,))
-        elif element.tag is Document:
-            if element.text:
-                rv.append(element.text)
-            if element.tail:
-                finalText = element.tail
-
-            for child in element.getchildren():
-                serializeElement(child)
-
-        elif element.tag is Comment:
-            rv.append("<!--%s-->"%(element.text,))
-        else:
-            #This is assumed to be an ordinary element
-            if not element.attrib:
-                rv.append("<%s>"%(element.tag,))
-            else:
-                attr = " ".join(["%s=\"%s\""%(name, value)
-                    for name, value in element.attrib.iteritems()])
-                rv.append("<%s %s>"%(element.tag, attr))
-            if element.text:
-                rv.append(element.text)
-
-            for child in element.getchildren():
-                serializeElement(child)
-
-            rv.append("</%s>"%(element.tag,))
-
-        if element.tail:
-            rv.append(element.tail)
-
-    serializeElement(element)
-
-    if finalText is not None:
-        rv.append("%s\""%(' '*2, finalText))
-
-    return "".join(rv)
-
-class TreeBuilder(_base.TreeBuilder):
-    documentClass = Document
-    doctypeClass = DocumentType
-    elementClass = Element
-    commentClass = Comment
-
-    def testSerializer(self, element):
-        return testSerializer(element)
+import etreefull
 
+class TreeBuilder(etreefull.TreeBuilder):
     def getDocument(self):
-        return self.document._element
+        return self.document._element.find("html")
diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py
new file mode 100644
--- /dev/null
+++ b/planet/html5lib/treebuilders/etreefull.py
@@ -0,0 +1,221 @@
+try:
+    from xml.etree import ElementTree
+except ImportError:
+    from elementtree import ElementTree
+
+import _base
+
+class Element(_base.Node):
+    def __init__(self, name):
+        self._element = ElementTree.Element(name)
+        self.name = name
+        self.parent = None
+        self._childNodes = []
+        self._flags = []
+
+    def _setName(self, name):
+        self._element.tag = name
+
+    def _getName(self):
+        return self._element.tag
+
+    name = property(_getName, _setName)
+
+    def _getAttributes(self):
+        return self._element.attrib
+
+    def _setAttributes(self, attributes):
+        #Delete existing attributes first
+        #XXX - there may be a better way to do this...
+        for key in self._element.attrib.keys():
+            del self._element.attrib[key]
+        for key, value in attributes.iteritems():
+            self._element.set(key, value)
+
+    attributes = property(_getAttributes, _setAttributes)
+
+    def _getChildNodes(self):
+        return self._childNodes
+
+    def _setChildNodes(self, value):
+        del self._element[:]
+        self._childNodes = []
+        for element in value:
+            self.insertChild(element)
+
+    childNodes = property(_getChildNodes, _setChildNodes)
+
+    def hasContent(self):
+        """Return true if the node has children or text"""
+        return bool(self._element.text or self._element.getchildren())
+
+    def appendChild(self, node):
+        self._childNodes.append(node)
+        self._element.append(node._element)
+        node.parent = self
+
+    def insertBefore(self, node, refNode):
+        index = self._element.getchildren().index(refNode._element)
+        self._element.insert(index, node._element)
+        node.parent = self
+
+    def removeChild(self, node):
+        self._element.remove(node._element)
+        node.parent=None
+
+    def insertText(self, data, insertBefore=None):
+        if not(len(self._element)):
+            if not self._element.text:
+                self._element.text = ""
+            self._element.text += data
+        elif insertBefore is None:
+            #Insert the text as the tail of the last child element
+            if not self._element[-1].tail:
+                self._element[-1].tail = ""
+            self._element[-1].tail += data
+        else:
+            #Insert the text before the specified node
+            children = self._element.getchildren()
+            index = children.index(insertBefore._element)
+            if index > 0:
+                if not self._element[index-1].tail:
+                    self._element[index-1].tail = ""
+                self._element[index-1].tail += data
+            else:
+                if not self._element.text:
+                    self._element.text = ""
+                self._element.text += data
+
+    def cloneNode(self):
+        element = Element(self.name)
+        element.attributes = self.attributes
+        return element
+
+    def reparentChildren(self, newParent):
+        """Hand this node's text to newParent, then reparent the children.
+
+        ElementTree leaves text and tail as None until they are assigned,
+        so both are normalised to "" before concatenating; the rest is
+        delegated to _base.Node.reparentChildren.
+        """
+        text = self._element.text or ""
+        if newParent.childNodes:
+            lastChild = newParent.childNodes[-1]._element
+            lastChild.tail = (lastChild.tail or "") + text
+        else:
+            newParent._element.text = (newParent._element.text or "") + text
+        self._element.text = ""
+        _base.Node.reparentChildren(self, newParent)
+
+class Comment(Element):
+    def __init__(self, data):
+        #Use the superclass constructor to set all properties on the
+        #wrapper element
+        Element.__init__(self, None)
+        self._element = ElementTree.Comment(data)
+
+    def _getData(self):
+        return self._element.text
+
+    def _setData(self, value):
+        self._element.text = value
+
+    data = property(_getData, _setData)
+
+class DocumentType(Element):
+    def __init__(self, name):
+        Element.__init__(self, DocumentType)
+        self._element.text = name
+
+class Document(Element):
+    def __init__(self):
+        Element.__init__(self, Document)
+
+def testSerializer(element):
+    rv = []
+    finalText = None
+    def serializeElement(element, indent=0):
+        if element.tag is DocumentType:
+            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
+        elif element.tag is Document:
+            rv.append("#document")
+            if element.text:
+                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
+            if element.tail:
+                finalText = element.tail
+        elif element.tag is ElementTree.Comment:
+            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
+        else:
+            rv.append("|%s<%s>"%(' '*indent, element.tag))
+            if hasattr(element, "attrib"):
+                for name, value in element.attrib.iteritems():
+                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
+            if element.text:
+                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
+        indent += 2
+        for child in element.getchildren():
+            serializeElement(child, indent)
+        if element.tail:
+            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
+    serializeElement(element, 0)
+
+    if finalText is not None:
+        rv.append("|%s\"%s\""%(' '*2, finalText))
+
+    return "\n".join(rv)
+
+def tostring(element):
+    """Serialize an element and its child nodes to a string"""
+    rv = []
+    finalText = None
+    def serializeElement(element):
+        if element.tag is DocumentType:
+            rv.append("<!DOCTYPE %s>"%(element.text,))
+        elif element.tag is Document:
+            if element.text:
+                rv.append(element.text)
+            if element.tail:
+                finalText = element.tail
+
+            for child in element.getchildren():
+                serializeElement(child)
+
+        elif element.tag is ElementTree.Comment:
+            rv.append("<!--%s-->"%(element.text,))
+        else:
+            #This is assumed to be an ordinary element
+            if not element.attrib:
+                rv.append("<%s>"%(element.tag,))
+            else:
+                attr = " ".join(["%s=\"%s\""%(name, value)
+                    for name, value in element.attrib.iteritems()])
+                rv.append("<%s %s>"%(element.tag, attr))
+            if element.text:
+                rv.append(element.text)
+
+            for child in element.getchildren():
+                serializeElement(child)
+
+            rv.append("</%s>"%(element.tag,))
+
+        if element.tail:
+            rv.append(element.tail)
+
+    serializeElement(element)
+
+    if finalText is not None:
+        rv.append("%s\""%(' '*2, finalText))
+
+    return "".join(rv)
+
+class TreeBuilder(_base.TreeBuilder):
+    documentClass = Document
+    doctypeClass = DocumentType
+    elementClass = Element
+    commentClass = Comment
+
+    def testSerializer(self, element):
+        return testSerializer(element)
+
+    def getDocument(self):
+        return self.document._element