From fc90da7fc07cd966b94f3212cc92ab8fb75f3219 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Mon, 25 Jun 2007 10:49:51 -0400 Subject: [PATCH] Update to latest html5lib; move packaged dependencies to vendor directory --- filters/html2xhtml.plugin | 2 +- filters/mememe.plugin | 3 +- planet/__init__.py | 4 +- planet/html5lib/treebuilders/__init__.py | 42 -- planet/html5lib/treebuilders/etree.py | 5 - planet/html5lib/treebuilders/etreefull.py | 227 ------- planet/reconstitute.py | 5 +- planet/shell/tmpl.py | 3 +- planet/spider.py | 2 +- .../{ => vendor}/compat_logging/__init__.py | 0 planet/{ => vendor}/compat_logging/config.py | 0 .../{ => vendor}/compat_logging/handlers.py | 0 planet/{ => vendor}/feedparser.py | 0 planet/{ => vendor}/html5lib/__init__.py | 0 planet/{ => vendor}/html5lib/constants.py | 617 +++++++++++------- planet/vendor/html5lib/filters/__init__.py | 0 planet/vendor/html5lib/filters/_base.py | 10 + .../html5lib/filters/inject_meta_charset.py | 63 ++ planet/vendor/html5lib/filters/lint.py | 90 +++ .../vendor/html5lib/filters/optionaltags.py | 175 +++++ planet/vendor/html5lib/filters/whitespace.py | 38 ++ planet/{ => vendor}/html5lib/html5parser.py | 237 +++++-- planet/{ => vendor}/html5lib/inputstream.py | 176 +++-- .../{ => vendor}/html5lib/liberalxmlparser.py | 33 +- planet/vendor/html5lib/sanitizer.py | 189 ++++++ planet/vendor/html5lib/serializer/__init__.py | 3 + .../html5lib/serializer/htmlserializer.py | 216 ++++++ .../html5lib/serializer/xhtmlserializer.py | 9 + planet/{ => vendor}/html5lib/tokenizer.py | 461 +++++++++---- .../vendor/html5lib/treebuilders/__init__.py | 64 ++ .../html5lib/treebuilders/_base.py | 2 +- .../{ => vendor}/html5lib/treebuilders/dom.py | 20 +- planet/vendor/html5lib/treebuilders/etree.py | 249 +++++++ .../html5lib/treebuilders/simpletree.py | 2 +- planet/vendor/html5lib/treebuilders/soup.py | 162 +++++ .../vendor/html5lib/treewalkers/__init__.py | 47 ++ planet/vendor/html5lib/treewalkers/_base.py | 151 +++++ 
planet/vendor/html5lib/treewalkers/dom.py | 37 ++ planet/vendor/html5lib/treewalkers/etree.py | 112 ++++ .../html5lib/treewalkers/genshistream.py | 67 ++ planet/vendor/html5lib/treewalkers/pulldom.py | 52 ++ .../vendor/html5lib/treewalkers/simpletree.py | 72 ++ planet/vendor/html5lib/treewalkers/soup.py | 36 + planet/{ => vendor}/html5lib/utils.py | 0 planet/{ => vendor}/htmltmpl.py | 0 planet/{ => vendor}/httplib2/__init__.py | 0 planet/{ => vendor}/httplib2/iri2uri.py | 0 planet/{ => vendor}/portalocker.py | 0 planet/{ => vendor}/timeoutsocket.py | 0 49 files changed, 2883 insertions(+), 800 deletions(-) delete mode 100755 planet/html5lib/treebuilders/__init__.py delete mode 100755 planet/html5lib/treebuilders/etree.py delete mode 100644 planet/html5lib/treebuilders/etreefull.py rename planet/{ => vendor}/compat_logging/__init__.py (100%) rename planet/{ => vendor}/compat_logging/config.py (100%) rename planet/{ => vendor}/compat_logging/handlers.py (100%) rename planet/{ => vendor}/feedparser.py (100%) rename planet/{ => vendor}/html5lib/__init__.py (100%) rename planet/{ => vendor}/html5lib/constants.py (56%) create mode 100644 planet/vendor/html5lib/filters/__init__.py create mode 100644 planet/vendor/html5lib/filters/_base.py create mode 100644 planet/vendor/html5lib/filters/inject_meta_charset.py create mode 100644 planet/vendor/html5lib/filters/lint.py create mode 100644 planet/vendor/html5lib/filters/optionaltags.py create mode 100644 planet/vendor/html5lib/filters/whitespace.py rename planet/{ => vendor}/html5lib/html5parser.py (88%) rename planet/{ => vendor}/html5lib/inputstream.py (79%) rename planet/{ => vendor}/html5lib/liberalxmlparser.py (77%) create mode 100644 planet/vendor/html5lib/sanitizer.py create mode 100644 planet/vendor/html5lib/serializer/__init__.py create mode 100644 planet/vendor/html5lib/serializer/htmlserializer.py create mode 100644 planet/vendor/html5lib/serializer/xhtmlserializer.py rename planet/{ => 
vendor}/html5lib/tokenizer.py (65%) create mode 100755 planet/vendor/html5lib/treebuilders/__init__.py rename planet/{ => vendor}/html5lib/treebuilders/_base.py (99%) rename planet/{ => vendor}/html5lib/treebuilders/dom.py (97%) mode change 100755 => 100644 create mode 100755 planet/vendor/html5lib/treebuilders/etree.py rename planet/{ => vendor}/html5lib/treebuilders/simpletree.py (99%) create mode 100644 planet/vendor/html5lib/treebuilders/soup.py create mode 100644 planet/vendor/html5lib/treewalkers/__init__.py create mode 100644 planet/vendor/html5lib/treewalkers/_base.py create mode 100644 planet/vendor/html5lib/treewalkers/dom.py create mode 100644 planet/vendor/html5lib/treewalkers/etree.py create mode 100644 planet/vendor/html5lib/treewalkers/genshistream.py create mode 100644 planet/vendor/html5lib/treewalkers/pulldom.py create mode 100644 planet/vendor/html5lib/treewalkers/simpletree.py create mode 100644 planet/vendor/html5lib/treewalkers/soup.py rename planet/{ => vendor}/html5lib/utils.py (100%) rename planet/{ => vendor}/htmltmpl.py (100%) rename planet/{ => vendor}/httplib2/__init__.py (100%) rename planet/{ => vendor}/httplib2/iri2uri.py (100%) rename planet/{ => vendor}/portalocker.py (100%) rename planet/{ => vendor}/timeoutsocket.py (100%) diff --git a/filters/html2xhtml.plugin b/filters/html2xhtml.plugin index 456df48..3ab7a8c 100644 --- a/filters/html2xhtml.plugin +++ b/filters/html2xhtml.plugin @@ -1,5 +1,5 @@ import sys -from planet import html5lib +import html5lib tree=html5lib.treebuilders.dom.TreeBuilder parser = html5lib.html5parser.HTMLParser(tree=tree) document = parser.parse(sys.stdin) diff --git a/filters/mememe.plugin b/filters/mememe.plugin index 9efee9f..8b4b126 100644 --- a/filters/mememe.plugin +++ b/filters/mememe.plugin @@ -23,8 +23,9 @@ from xml.sax.saxutils import escape from htmlentitydefs import entitydefs import planet -from planet import config, feedparser +from planet import config from planet.spider import filename 
+import feedparser log = planet.logger options = config.filter_options(sys.argv[0]) diff --git a/planet/__init__.py b/planet/__init__.py index f75ea6a..d9655d4 100644 --- a/planet/__init__.py +++ b/planet/__init__.py @@ -32,7 +32,9 @@ def getLogger(level, format): loggerParms = (level,format) return logger +sys.path.append(os.path.join(os.path.dirname(__file__),'vendor')) + # Configure feed parser -from planet import feedparser +import feedparser feedparser.SANITIZE_HTML=0 feedparser.RESOLVE_RELATIVE_URIS=0 diff --git a/planet/html5lib/treebuilders/__init__.py b/planet/html5lib/treebuilders/__init__.py deleted file mode 100755 index 9470145..0000000 --- a/planet/html5lib/treebuilders/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -"""A collection of modules for building different kinds of tree from -HTML documents. - -To create a treebuilder for a new type of tree, you need to do -implement several things: - -1) A set of classes for various types of elements: Document, Doctype, -Comment, Element. These must implement the interface of -_base.treebuilders.Node (although comment nodes have a different -signature for their constructor, see treebuilders.simpletree.Comment) -Textual content may also be implemented as another node type, or not, as -your tree implementation requires. - -2) A treebuilder object (called TreeBuilder by convention) that -inherits from treebuilders._base.TreeBuilder. 
This has 4 required attributes: -documentClass - the class to use for the bottommost node of a document -elementClass - the class to use for HTML Elements -commentClass - the class to use for comments -doctypeClass - the class to use for doctypes -It also has one required method: -getDocument - Returns the root node of the complete document tree - -3) If you wish to run the unit tests, you must also create a -testSerializer method on your treebuilder which accepts a node and -returns a string containing Node and its children serialized according -to the format used in the unittests - -The supplied simpletree module provides a python-only implementation -of a full treebuilder and is a useful reference for the semantics of -the various methods. -""" - -import os.path -__path__.append(os.path.dirname(__path__[0])) - -import dom -import simpletree - -try: - import etree -except: - pass diff --git a/planet/html5lib/treebuilders/etree.py b/planet/html5lib/treebuilders/etree.py deleted file mode 100755 index 5af468b..0000000 --- a/planet/html5lib/treebuilders/etree.py +++ /dev/null @@ -1,5 +0,0 @@ -import etreefull - -class TreeBuilder(etreefull.TreeBuilder): - def getDocument(self): - return self.document._element.find("html") diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py deleted file mode 100644 index 2629664..0000000 --- a/planet/html5lib/treebuilders/etreefull.py +++ /dev/null @@ -1,227 +0,0 @@ -try: - from xml.etree import ElementTree -except ImportError: - try: - from elementtree import ElementTree - except: - pass - -import _base - -class Element(_base.Node): - def __init__(self, name): - self._element = ElementTree.Element(name) - self.name = name - self.parent = None - self._childNodes = [] - self._flags = [] - - def _setName(self, name): - self._element.tag = name - - def _getName(self): - return self._element.tag - - name = property(_getName, _setName) - - def _getAttributes(self): - return 
self._element.attrib - - def _setAttributes(self, attributes): - #Delete existing attributes first - #XXX - there may be a better way to do this... - for key in self._element.attrib.keys(): - del self._element.attrib[key] - for key, value in attributes.iteritems(): - self._element.set(key, value) - - attributes = property(_getAttributes, _setAttributes) - - def _getChildNodes(self): - return self._childNodes - - def _setChildNodes(self, value): - del self._element[:] - self._childNodes = [] - for element in value: - self.insertChild(element) - - childNodes = property(_getChildNodes, _setChildNodes) - - def hasContent(self): - """Return true if the node has children or text""" - return bool(self._element.text or self._element.getchildren()) - - def appendChild(self, node): - self._childNodes.append(node) - self._element.append(node._element) - node.parent = self - - def insertBefore(self, node, refNode): - index = self._element.getchildren().index(refNode._element) - self._element.insert(index, node._element) - node.parent = self - - def removeChild(self, node): - self._element.remove(node._element) - node.parent=None - - def insertText(self, data, insertBefore=None): - if not(len(self._element)): - if not self._element.text: - self._element.text = "" - self._element.text += data - elif insertBefore is None: - #Insert the text as the tail of the last child element - if not self._element[-1].tail: - self._element[-1].tail = "" - self._element[-1].tail += data - else: - #Insert the text before the specified node - children = self._element.getchildren() - index = children.index(insertBefore._element) - if index > 0: - if not self._element[index-1].tail: - self._element[index-1].tail = "" - self._element[index-1].tail += data - else: - if not self._element.text: - self._element.text = "" - self._element.text += data - - def cloneNode(self): - element = Element(self.name) - element.attributes = self.attributes - return element - - def reparentChildren(self, newParent): - 
if newParent.childNodes: - newParent.childNodes[-1]._element.tail += self._element.text - else: - if not newParent._element.text: - newParent._element.text = "" - if self._element.text is not None: - newParent._element.text += self._element.text - self._element.text = "" - _base.Node.reparentChildren(self, newParent) - -class Comment(Element): - def __init__(self, data): - #Use the superclass constructor to set all properties on the - #wrapper element - Element.__init__(self, None) - self._element = ElementTree.Comment(data) - - def _getData(self): - return self._element.text - - def _setData(self, value): - self._element.text = value - - data = property(_getData, _setData) - -class DocumentType(Element): - def __init__(self, name): - Element.__init__(self, DocumentType) - self._element.text = name - -class Document(Element): - def __init__(self): - Element.__init__(self, Document) - -class DocumentFragment(Element): - def __init__(self): - Element.__init__(self, DocumentFragment) - -def testSerializer(element): - rv = [] - finalText = None - def serializeElement(element, indent=0): - if element.tag is DocumentType: - rv.append("|%s"%(' '*indent, element.text)) - elif element.tag is Document: - rv.append("#document") - if element.text: - rv.append("|%s\"%s\""%(' '*(indent+2), element.text)) - if element.tail: - finalText = element.tail - elif element.tag is ElementTree.Comment: - rv.append("|%s"%(' '*indent, element.text)) - else: - rv.append("|%s<%s>"%(' '*indent, element.tag)) - if hasattr(element, "attrib"): - for name, value in element.attrib.iteritems(): - rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) - if element.text: - rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) - indent += 2 - for child in element.getchildren(): - serializeElement(child, indent) - if element.tail: - rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) - serializeElement(element, 0) - - if finalText is not None: - rv.append("|%s\"%s\""%(' '*2, finalText)) - - return 
"\n".join(rv) - -def tostring(element): - """Serialize an element and its child nodes to a string""" - rv = [] - finalText = None - def serializeElement(element): - if element.tag is DocumentType: - rv.append(""%(element.text,)) - elif element.tag is Document: - if element.text: - rv.append(element.text) - if element.tail: - finalText = element.tail - - for child in element.getchildren(): - serializeElement(child) - - elif element.tag is ElementTree.Comment: - rv.append(""%(element.text,)) - else: - #This is assumed to be an ordinary element - if not element.attrib: - rv.append("<%s>"%(element.tag,)) - else: - attr = " ".join(["%s=\"%s\""%(name, value) - for name, value in element.attrib.iteritems()]) - rv.append("<%s %s>"%(element.tag, attr)) - if element.text: - rv.append(element.text) - - for child in element.getchildren(): - serializeElement(child) - - rv.append(""%(element.tag,)) - - if element.tail: - rv.append(element.tail) - - serializeElement(element) - - if finalText is not None: - rv.append("%s\""%(' '*2, finalText)) - - return "".join(rv) - -class TreeBuilder(_base.TreeBuilder): - documentClass = Document - doctypeClass = DocumentType - elementClass = Element - commentClass = Comment - fragmentClass = DocumentFragment - - def testSerializer(self, element): - return testSerializer(element) - - def getDocument(self): - return self.document._element - - def getFragment(self): - return _base.TreeBuilder.getFragment(self)._element diff --git a/planet/reconstitute.py b/planet/reconstitute.py index 29e371d..3ab9cd8 100644 --- a/planet/reconstitute.py +++ b/planet/reconstitute.py @@ -16,7 +16,8 @@ Todo: import re, time, md5, sgmllib from xml.sax.saxutils import escape from xml.dom import minidom, Node -from planet.html5lib import liberalxmlparser, treebuilders +from html5lib import liberalxmlparser +from html5lib.treebuilders import dom import planet, config illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") @@ -154,7 +155,7 @@ def content(xentry, 
name, detail, bozo): data = minidom.parseString(xdiv % detail.value).documentElement xcontent.setAttribute('type', 'xhtml') else: - parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder) + parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder) html = parser.parse(xdiv % detail.value, encoding="utf-8") for body in html.documentElement.childNodes: if body.nodeType != Node.ELEMENT_NODE: continue diff --git a/planet/shell/tmpl.py b/planet/shell/tmpl.py index 99ae27d..05fb0cf 100644 --- a/planet/shell/tmpl.py +++ b/planet/shell/tmpl.py @@ -1,6 +1,7 @@ from xml.sax.saxutils import escape import sgmllib, time, os, sys, new, urlparse, re -from planet import config, feedparser, htmltmpl +from planet import config, feedparser +import htmltmpl voids=feedparser._BaseHTMLProcessor.elements_no_end_tag empty=re.compile(r"<((%s)[^>]*)>" % '|'.join(voids)) diff --git a/planet/spider.py b/planet/spider.py index e6b4932..bbabd07 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -340,7 +340,7 @@ def spiderPlanet(only_if_new = False): log.info("Socket timeout set to %d seconds", timeout) except: try: - from planet import timeoutsocket + import timeoutsocket timeoutsocket.setDefaultSocketTimeout(float(timeout)) log.info("Socket timeout set to %d seconds", timeout) except: diff --git a/planet/compat_logging/__init__.py b/planet/vendor/compat_logging/__init__.py similarity index 100% rename from planet/compat_logging/__init__.py rename to planet/vendor/compat_logging/__init__.py diff --git a/planet/compat_logging/config.py b/planet/vendor/compat_logging/config.py similarity index 100% rename from planet/compat_logging/config.py rename to planet/vendor/compat_logging/config.py diff --git a/planet/compat_logging/handlers.py b/planet/vendor/compat_logging/handlers.py similarity index 100% rename from planet/compat_logging/handlers.py rename to planet/vendor/compat_logging/handlers.py diff --git a/planet/feedparser.py b/planet/vendor/feedparser.py similarity index 
100% rename from planet/feedparser.py rename to planet/vendor/feedparser.py diff --git a/planet/html5lib/__init__.py b/planet/vendor/html5lib/__init__.py similarity index 100% rename from planet/html5lib/__init__.py rename to planet/vendor/html5lib/__init__.py diff --git a/planet/html5lib/constants.py b/planet/vendor/html5lib/constants.py similarity index 56% rename from planet/html5lib/constants.py rename to planet/vendor/html5lib/constants.py index ba8ae8c..459098f 100644 --- a/planet/html5lib/constants.py +++ b/planet/vendor/html5lib/constants.py @@ -119,8 +119,8 @@ spaceCharacters = frozenset(( tableInsertModeElements = frozenset(( "table", "tbody", - "tfoot", - "thead", + "tfoot", + "thead", "tr" )) @@ -133,7 +133,7 @@ hexDigits = frozenset(string.hexdigits) asciiUpper2Lower = dict([(ord(c),ord(c.lower())) for c in string.ascii_uppercase]) -# Heading elements need to be ordered +# Heading elements need to be ordered headingElements = ( "h1", "h2", @@ -158,6 +158,38 @@ voidElements = frozenset(( "input" )) +cdataElements = frozenset(('title', 'textarea')) + +rcdataElements = frozenset(( + 'style', + 'script', + 'xmp', + 'iframe', + 'noembed', + 'noframes', + 'noscript' +)) + +booleanAttributes = { + "": frozenset(("irrelevant",)), + "style": frozenset(("scoped",)), + "img": frozenset(("ismap",)), + "audio": frozenset(("autoplay","controls")), + "video": frozenset(("autoplay","controls")), + "script": frozenset(("defer", "async")), + "details": frozenset(("open",)), + "datagrid": frozenset(("multiple", "disabled")), + "command": frozenset(("hidden", "disabled", "checked", "default")), + "menu": frozenset(("autosubmit",)), + "fieldset": frozenset(("disabled", "readonly")), + "option": frozenset(("disabled", "readonly", "selected")), + "optgroup": frozenset(("disabled", "readonly")), + "button": frozenset(("disabled", "autofocus")), + "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), + "select": frozenset(("disabled", 
"readonly", "autofocus", "multiple")), + "output": frozenset(("disabled", "readonly")), +} + # entitiesWindows1252 has to be _ordered_ and needs to have an index. It # therefore can't be a frozenset. entitiesWindows1252 = ( @@ -196,265 +228,372 @@ entitiesWindows1252 = ( ) entities = { + "AElig;": u"\u00C6", "AElig": u"\u00C6", - "Aacute": u"\u00C1", - "Acirc": u"\u00C2", - "Agrave": u"\u00C0", - "Alpha": u"\u0391", - "Aring": u"\u00C5", - "Atilde": u"\u00C3", - "Auml": u"\u00C4", - "Beta": u"\u0392", - "Ccedil": u"\u00C7", - "Chi": u"\u03A7", - "Dagger": u"\u2021", - "Delta": u"\u0394", - "ETH": u"\u00D0", - "Eacute": u"\u00C9", - "Ecirc": u"\u00CA", - "Egrave": u"\u00C8", - "Epsilon": u"\u0395", - "Eta": u"\u0397", - "Euml": u"\u00CB", - "Gamma": u"\u0393", - "Iacute": u"\u00CD", - "Icirc": u"\u00CE", - "Igrave": u"\u00CC", - "Iota": u"\u0399", - "Iuml": u"\u00CF", - "Kappa": u"\u039A", - "Lambda": u"\u039B", - "Mu": u"\u039C", - "Ntilde": u"\u00D1", - "Nu": u"\u039D", - "OElig": u"\u0152", - "Oacute": u"\u00D3", - "Ocirc": u"\u00D4", - "Ograve": u"\u00D2", - "Omega": u"\u03A9", - "Omicron": u"\u039F", - "Oslash": u"\u00D8", - "Otilde": u"\u00D5", - "Ouml": u"\u00D6", - "Phi": u"\u03A6", - "Pi": u"\u03A0", - "Prime": u"\u2033", - "Psi": u"\u03A8", - "Rho": u"\u03A1", - "Scaron": u"\u0160", - "Sigma": u"\u03A3", - "THORN": u"\u00DE", - "Tau": u"\u03A4", - "Theta": u"\u0398", - "Uacute": u"\u00DA", - "Ucirc": u"\u00DB", - "Ugrave": u"\u00D9", - "Upsilon": u"\u03A5", - "Uuml": u"\u00DC", - "Xi": u"\u039E", - "Yacute": u"\u00DD", - "Yuml": u"\u0178", - "Zeta": u"\u0396", - "aacute": u"\u00E1", - "acirc": u"\u00E2", - "acute": u"\u00B4", - "aelig": u"\u00E6", - "agrave": u"\u00E0", - "alefsym": u"\u2135", - "alpha": u"\u03B1", - "amp": u"\u0026", + "AMP;": u"\u0026", "AMP": u"\u0026", - "and": u"\u2227", - "ang": u"\u2220", - "apos": u"\u0027", - "aring": u"\u00E5", - "asymp": u"\u2248", - "atilde": u"\u00E3", - "auml": u"\u00E4", - "bdquo": u"\u201E", - "beta": 
u"\u03B2", - "brvbar": u"\u00A6", - "bull": u"\u2022", - "cap": u"\u2229", - "ccedil": u"\u00E7", - "cedil": u"\u00B8", - "cent": u"\u00A2", - "chi": u"\u03C7", - "circ": u"\u02C6", - "clubs": u"\u2663", - "cong": u"\u2245", - "copy": u"\u00A9", + "Aacute;": u"\u00C1", + "Aacute": u"\u00C1", + "Acirc;": u"\u00C2", + "Acirc": u"\u00C2", + "Agrave;": u"\u00C0", + "Agrave": u"\u00C0", + "Alpha;": u"\u0391", + "Aring;": u"\u00C5", + "Aring": u"\u00C5", + "Atilde;": u"\u00C3", + "Atilde": u"\u00C3", + "Auml;": u"\u00C4", + "Auml": u"\u00C4", + "Beta;": u"\u0392", + "COPY;": u"\u00A9", "COPY": u"\u00A9", - "crarr": u"\u21B5", - "cup": u"\u222A", - "curren": u"\u00A4", - "dArr": u"\u21D3", - "dagger": u"\u2020", - "darr": u"\u2193", - "deg": u"\u00B0", - "delta": u"\u03B4", - "diams": u"\u2666", - "divide": u"\u00F7", - "eacute": u"\u00E9", - "ecirc": u"\u00EA", - "egrave": u"\u00E8", - "empty": u"\u2205", - "emsp": u"\u2003", - "ensp": u"\u2002", - "epsilon": u"\u03B5", - "equiv": u"\u2261", - "eta": u"\u03B7", - "eth": u"\u00F0", - "euml": u"\u00EB", - "euro": u"\u20AC", - "exist": u"\u2203", - "fnof": u"\u0192", - "forall": u"\u2200", - "frac12": u"\u00BD", - "frac14": u"\u00BC", - "frac34": u"\u00BE", - "frasl": u"\u2044", - "gamma": u"\u03B3", - "ge": u"\u2265", - "gt": u"\u003E", + "Ccedil;": u"\u00C7", + "Ccedil": u"\u00C7", + "Chi;": u"\u03A7", + "Dagger;": u"\u2021", + "Delta;": u"\u0394", + "ETH;": u"\u00D0", + "ETH": u"\u00D0", + "Eacute;": u"\u00C9", + "Eacute": u"\u00C9", + "Ecirc;": u"\u00CA", + "Ecirc": u"\u00CA", + "Egrave;": u"\u00C8", + "Egrave": u"\u00C8", + "Epsilon;": u"\u0395", + "Eta;": u"\u0397", + "Euml;": u"\u00CB", + "Euml": u"\u00CB", + "GT;": u"\u003E", "GT": u"\u003E", - "hArr": u"\u21D4", - "harr": u"\u2194", - "hearts": u"\u2665", - "hellip": u"\u2026", - "iacute": u"\u00ED", - "icirc": u"\u00EE", - "iexcl": u"\u00A1", - "igrave": u"\u00EC", - "image": u"\u2111", - "infin": u"\u221E", - "int": u"\u222B", - "iota": u"\u03B9", - "iquest": 
u"\u00BF", - "isin": u"\u2208", - "iuml": u"\u00EF", - "kappa": u"\u03BA", - "lArr": u"\u21D0", - "lambda": u"\u03BB", - "lang": u"\u2329", - "laquo": u"\u00AB", - "larr": u"\u2190", - "lceil": u"\u2308", - "ldquo": u"\u201C", - "le": u"\u2264", - "lfloor": u"\u230A", - "lowast": u"\u2217", - "loz": u"\u25CA", - "lrm": u"\u200E", - "lsaquo": u"\u2039", - "lsquo": u"\u2018", - "lt": u"\u003C", + "Gamma;": u"\u0393", + "Iacute;": u"\u00CD", + "Iacute": u"\u00CD", + "Icirc;": u"\u00CE", + "Icirc": u"\u00CE", + "Igrave;": u"\u00CC", + "Igrave": u"\u00CC", + "Iota;": u"\u0399", + "Iuml;": u"\u00CF", + "Iuml": u"\u00CF", + "Kappa;": u"\u039A", + "LT;": u"\u003C", "LT": u"\u003C", - "macr": u"\u00AF", - "mdash": u"\u2014", - "micro": u"\u00B5", - "middot": u"\u00B7", - "minus": u"\u2212", - "mu": u"\u03BC", - "nabla": u"\u2207", - "nbsp": u"\u00A0", - "ndash": u"\u2013", - "ne": u"\u2260", - "ni": u"\u220B", - "not": u"\u00AC", - "notin": u"\u2209", - "nsub": u"\u2284", - "ntilde": u"\u00F1", - "nu": u"\u03BD", - "oacute": u"\u00F3", - "ocirc": u"\u00F4", - "oelig": u"\u0153", - "ograve": u"\u00F2", - "oline": u"\u203E", - "omega": u"\u03C9", - "omicron": u"\u03BF", - "oplus": u"\u2295", - "or": u"\u2228", - "ordf": u"\u00AA", - "ordm": u"\u00BA", - "oslash": u"\u00F8", - "otilde": u"\u00F5", - "otimes": u"\u2297", - "ouml": u"\u00F6", - "para": u"\u00B6", - "part": u"\u2202", - "permil": u"\u2030", - "perp": u"\u22A5", - "phi": u"\u03C6", - "pi": u"\u03C0", - "piv": u"\u03D6", - "plusmn": u"\u00B1", - "pound": u"\u00A3", - "prime": u"\u2032", - "prod": u"\u220F", - "prop": u"\u221D", - "psi": u"\u03C8", - "quot": u"\u0022", + "Lambda;": u"\u039B", + "Mu;": u"\u039C", + "Ntilde;": u"\u00D1", + "Ntilde": u"\u00D1", + "Nu;": u"\u039D", + "OElig;": u"\u0152", + "Oacute;": u"\u00D3", + "Oacute": u"\u00D3", + "Ocirc;": u"\u00D4", + "Ocirc": u"\u00D4", + "Ograve;": u"\u00D2", + "Ograve": u"\u00D2", + "Omega;": u"\u03A9", + "Omicron;": u"\u039F", + "Oslash;": u"\u00D8", + 
"Oslash": u"\u00D8", + "Otilde;": u"\u00D5", + "Otilde": u"\u00D5", + "Ouml;": u"\u00D6", + "Ouml": u"\u00D6", + "Phi;": u"\u03A6", + "Pi;": u"\u03A0", + "Prime;": u"\u2033", + "Psi;": u"\u03A8", + "QUOT;": u"\u0022", "QUOT": u"\u0022", - "rArr": u"\u21D2", - "radic": u"\u221A", - "rang": u"\u232A", - "raquo": u"\u00BB", - "rarr": u"\u2192", - "rceil": u"\u2309", - "rdquo": u"\u201D", - "real": u"\u211C", - "reg": u"\u00AE", + "REG;": u"\u00AE", "REG": u"\u00AE", - "rfloor": u"\u230B", - "rho": u"\u03C1", - "rlm": u"\u200F", - "rsaquo": u"\u203A", - "rsquo": u"\u2019", - "sbquo": u"\u201A", - "scaron": u"\u0161", - "sdot": u"\u22C5", + "Rho;": u"\u03A1", + "Scaron;": u"\u0160", + "Sigma;": u"\u03A3", + "THORN;": u"\u00DE", + "THORN": u"\u00DE", + "TRADE;": u"\u2122", + "Tau;": u"\u03A4", + "Theta;": u"\u0398", + "Uacute;": u"\u00DA", + "Uacute": u"\u00DA", + "Ucirc;": u"\u00DB", + "Ucirc": u"\u00DB", + "Ugrave;": u"\u00D9", + "Ugrave": u"\u00D9", + "Upsilon;": u"\u03A5", + "Uuml;": u"\u00DC", + "Uuml": u"\u00DC", + "Xi;": u"\u039E", + "Yacute;": u"\u00DD", + "Yacute": u"\u00DD", + "Yuml;": u"\u0178", + "Zeta;": u"\u0396", + "aacute;": u"\u00E1", + "aacute": u"\u00E1", + "acirc;": u"\u00E2", + "acirc": u"\u00E2", + "acute;": u"\u00B4", + "acute": u"\u00B4", + "aelig;": u"\u00E6", + "aelig": u"\u00E6", + "agrave;": u"\u00E0", + "agrave": u"\u00E0", + "alefsym;": u"\u2135", + "alpha;": u"\u03B1", + "amp;": u"\u0026", + "amp": u"\u0026", + "and;": u"\u2227", + "ang;": u"\u2220", + "apos;": u"\u0027", + "aring;": u"\u00E5", + "aring": u"\u00E5", + "asymp;": u"\u2248", + "atilde;": u"\u00E3", + "atilde": u"\u00E3", + "auml;": u"\u00E4", + "auml": u"\u00E4", + "bdquo;": u"\u201E", + "beta;": u"\u03B2", + "brvbar;": u"\u00A6", + "brvbar": u"\u00A6", + "bull;": u"\u2022", + "cap;": u"\u2229", + "ccedil;": u"\u00E7", + "ccedil": u"\u00E7", + "cedil;": u"\u00B8", + "cedil": u"\u00B8", + "cent;": u"\u00A2", + "cent": u"\u00A2", + "chi;": u"\u03C7", + "circ;": u"\u02C6", + 
"clubs;": u"\u2663", + "cong;": u"\u2245", + "copy;": u"\u00A9", + "copy": u"\u00A9", + "crarr;": u"\u21B5", + "cup;": u"\u222A", + "curren;": u"\u00A4", + "curren": u"\u00A4", + "dArr;": u"\u21D3", + "dagger;": u"\u2020", + "darr;": u"\u2193", + "deg;": u"\u00B0", + "deg": u"\u00B0", + "delta;": u"\u03B4", + "diams;": u"\u2666", + "divide;": u"\u00F7", + "divide": u"\u00F7", + "eacute;": u"\u00E9", + "eacute": u"\u00E9", + "ecirc;": u"\u00EA", + "ecirc": u"\u00EA", + "egrave;": u"\u00E8", + "egrave": u"\u00E8", + "empty;": u"\u2205", + "emsp;": u"\u2003", + "ensp;": u"\u2002", + "epsilon;": u"\u03B5", + "equiv;": u"\u2261", + "eta;": u"\u03B7", + "eth;": u"\u00F0", + "eth": u"\u00F0", + "euml;": u"\u00EB", + "euml": u"\u00EB", + "euro;": u"\u20AC", + "exist;": u"\u2203", + "fnof;": u"\u0192", + "forall;": u"\u2200", + "frac12;": u"\u00BD", + "frac12": u"\u00BD", + "frac14;": u"\u00BC", + "frac14": u"\u00BC", + "frac34;": u"\u00BE", + "frac34": u"\u00BE", + "frasl;": u"\u2044", + "gamma;": u"\u03B3", + "ge;": u"\u2265", + "gt;": u"\u003E", + "gt": u"\u003E", + "hArr;": u"\u21D4", + "harr;": u"\u2194", + "hearts;": u"\u2665", + "hellip;": u"\u2026", + "iacute;": u"\u00ED", + "iacute": u"\u00ED", + "icirc;": u"\u00EE", + "icirc": u"\u00EE", + "iexcl;": u"\u00A1", + "iexcl": u"\u00A1", + "igrave;": u"\u00EC", + "igrave": u"\u00EC", + "image;": u"\u2111", + "infin;": u"\u221E", + "int;": u"\u222B", + "iota;": u"\u03B9", + "iquest;": u"\u00BF", + "iquest": u"\u00BF", + "isin;": u"\u2208", + "iuml;": u"\u00EF", + "iuml": u"\u00EF", + "kappa;": u"\u03BA", + "lArr;": u"\u21D0", + "lambda;": u"\u03BB", + "lang;": u"\u3008", + "laquo;": u"\u00AB", + "laquo": u"\u00AB", + "larr;": u"\u2190", + "lceil;": u"\u2308", + "ldquo;": u"\u201C", + "le;": u"\u2264", + "lfloor;": u"\u230A", + "lowast;": u"\u2217", + "loz;": u"\u25CA", + "lrm;": u"\u200E", + "lsaquo;": u"\u2039", + "lsquo;": u"\u2018", + "lt;": u"\u003C", + "lt": u"\u003C", + "macr;": u"\u00AF", + "macr": u"\u00AF", + 
"mdash;": u"\u2014", + "micro;": u"\u00B5", + "micro": u"\u00B5", + "middot;": u"\u00B7", + "middot": u"\u00B7", + "minus;": u"\u2212", + "mu;": u"\u03BC", + "nabla;": u"\u2207", + "nbsp;": u"\u00A0", + "nbsp": u"\u00A0", + "ndash;": u"\u2013", + "ne;": u"\u2260", + "ni;": u"\u220B", + "not;": u"\u00AC", + "not": u"\u00AC", + "notin;": u"\u2209", + "nsub;": u"\u2284", + "ntilde;": u"\u00F1", + "ntilde": u"\u00F1", + "nu;": u"\u03BD", + "oacute;": u"\u00F3", + "oacute": u"\u00F3", + "ocirc;": u"\u00F4", + "ocirc": u"\u00F4", + "oelig;": u"\u0153", + "ograve;": u"\u00F2", + "ograve": u"\u00F2", + "oline;": u"\u203E", + "omega;": u"\u03C9", + "omicron;": u"\u03BF", + "oplus;": u"\u2295", + "or;": u"\u2228", + "ordf;": u"\u00AA", + "ordf": u"\u00AA", + "ordm;": u"\u00BA", + "ordm": u"\u00BA", + "oslash;": u"\u00F8", + "oslash": u"\u00F8", + "otilde;": u"\u00F5", + "otilde": u"\u00F5", + "otimes;": u"\u2297", + "ouml;": u"\u00F6", + "ouml": u"\u00F6", + "para;": u"\u00B6", + "para": u"\u00B6", + "part;": u"\u2202", + "permil;": u"\u2030", + "perp;": u"\u22A5", + "phi;": u"\u03C6", + "pi;": u"\u03C0", + "piv;": u"\u03D6", + "plusmn;": u"\u00B1", + "plusmn": u"\u00B1", + "pound;": u"\u00A3", + "pound": u"\u00A3", + "prime;": u"\u2032", + "prod;": u"\u220F", + "prop;": u"\u221D", + "psi;": u"\u03C8", + "quot;": u"\u0022", + "quot": u"\u0022", + "rArr;": u"\u21D2", + "radic;": u"\u221A", + "rang;": u"\u3009", + "raquo;": u"\u00BB", + "raquo": u"\u00BB", + "rarr;": u"\u2192", + "rceil;": u"\u2309", + "rdquo;": u"\u201D", + "real;": u"\u211C", + "reg;": u"\u00AE", + "reg": u"\u00AE", + "rfloor;": u"\u230B", + "rho;": u"\u03C1", + "rlm;": u"\u200F", + "rsaquo;": u"\u203A", + "rsquo;": u"\u2019", + "sbquo;": u"\u201A", + "scaron;": u"\u0161", + "sdot;": u"\u22C5", + "sect;": u"\u00A7", "sect": u"\u00A7", + "shy;": u"\u00AD", "shy": u"\u00AD", - "sigma": u"\u03C3", - "sigmaf": u"\u03C2", - "sim": u"\u223C", - "spades": u"\u2660", - "sub": u"\u2282", - "sube": u"\u2286", - "sum": 
u"\u2211", - "sup": u"\u2283", + "sigma;": u"\u03C3", + "sigmaf;": u"\u03C2", + "sim;": u"\u223C", + "spades;": u"\u2660", + "sub;": u"\u2282", + "sube;": u"\u2286", + "sum;": u"\u2211", + "sup1;": u"\u00B9", "sup1": u"\u00B9", + "sup2;": u"\u00B2", "sup2": u"\u00B2", + "sup3;": u"\u00B3", "sup3": u"\u00B3", - "supe": u"\u2287", + "sup;": u"\u2283", + "supe;": u"\u2287", + "szlig;": u"\u00DF", "szlig": u"\u00DF", - "tau": u"\u03C4", - "there4": u"\u2234", - "theta": u"\u03B8", - "thetasym": u"\u03D1", - "thinsp": u"\u2009", + "tau;": u"\u03C4", + "there4;": u"\u2234", + "theta;": u"\u03B8", + "thetasym;": u"\u03D1", + "thinsp;": u"\u2009", + "thorn;": u"\u00FE", "thorn": u"\u00FE", - "tilde": u"\u02DC", + "tilde;": u"\u02DC", + "times;": u"\u00D7", "times": u"\u00D7", - "trade": u"\u2122", - "uArr": u"\u21D1", + "trade;": u"\u2122", + "uArr;": u"\u21D1", + "uacute;": u"\u00FA", "uacute": u"\u00FA", - "uarr": u"\u2191", + "uarr;": u"\u2191", + "ucirc;": u"\u00FB", "ucirc": u"\u00FB", + "ugrave;": u"\u00F9", "ugrave": u"\u00F9", + "uml;": u"\u00A8", "uml": u"\u00A8", - "upsih": u"\u03D2", - "upsilon": u"\u03C5", + "upsih;": u"\u03D2", + "upsilon;": u"\u03C5", + "uuml;": u"\u00FC", "uuml": u"\u00FC", - "weierp": u"\u2118", - "xi": u"\u03BE", + "weierp;": u"\u2118", + "xi;": u"\u03BE", + "yacute;": u"\u00FD", "yacute": u"\u00FD", + "yen;": u"\u00A5", "yen": u"\u00A5", + "yuml;": u"\u00FF", "yuml": u"\u00FF", - "zeta": u"\u03B6", - "zwj": u"\u200D", - "zwnj": u"\u200C" + "zeta;": u"\u03B6", + "zwj;": u"\u200D", + "zwnj;": u"\u200C" } encodings = frozenset(( diff --git a/planet/vendor/html5lib/filters/__init__.py b/planet/vendor/html5lib/filters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/planet/vendor/html5lib/filters/_base.py b/planet/vendor/html5lib/filters/_base.py new file mode 100644 index 0000000..bca94ad --- /dev/null +++ b/planet/vendor/html5lib/filters/_base.py @@ -0,0 +1,10 @@ + +class Filter(object): + def __init__(self, source): + 
self.source = source + + def __iter__(self): + return iter(self.source) + + def __getattr__(self, name): + return getattr(self.source, name) diff --git a/planet/vendor/html5lib/filters/inject_meta_charset.py b/planet/vendor/html5lib/filters/inject_meta_charset.py new file mode 100644 index 0000000..35a2d95 --- /dev/null +++ b/planet/vendor/html5lib/filters/inject_meta_charset.py @@ -0,0 +1,63 @@ +import _base + +class Filter(_base.Filter): + def __init__(self, source, encoding): + _base.Filter.__init__(self, source) + self.encoding = encoding + + def __iter__(self): + state = "pre_head" + meta_found = (self.encoding is None) + pending = [] + + for token in _base.Filter.__iter__(self): + type = token["type"] + if type == "StartTag": + if token["name"].lower() == "head": + state = "in_head" + + elif type == "EmptyTag": + if token["name"].lower() == "meta": + # replace charset with actual encoding + has_http_equiv_content_type = False + content_index = -1 + for i,(name,value) in enumerate(token["data"]): + if name.lower() == 'charset': + token["data"][i] = (u'charset', self.encoding) + meta_found = True + break + elif name == 'http-equiv' and value.lower() == 'content-type': + has_http_equiv_content_type = True + elif name == 'content': + content_index = i + else: + if has_http_equiv_content_type and content_index >= 0: + token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding) + meta_found = True + + elif token["name"].lower() == "head" and not meta_found: + # insert meta into empty head + yield {"type": "StartTag", "name": "head", + "data": token["data"]} + yield {"type": "EmptyTag", "name": "meta", + "data": [["charset", self.encoding]]} + yield {"type": "EndTag", "name": "head"} + meta_found = True + continue + + elif type == "EndTag": + if token["name"].lower() == "head" and pending: + # insert meta into head (if necessary) and flush pending queue + yield pending.pop(0) + if not meta_found: + yield {"type": "EmptyTag", "name": "meta", 
+ "data": [["charset", self.encoding]]} + while pending: + yield pending.pop(0) + meta_found = True + state = "post_head" + + if state == "in_head": + pending.append(token) + else: + yield token diff --git a/planet/vendor/html5lib/filters/lint.py b/planet/vendor/html5lib/filters/lint.py new file mode 100644 index 0000000..770e0a4 --- /dev/null +++ b/planet/vendor/html5lib/filters/lint.py @@ -0,0 +1,90 @@ +from gettext import gettext +_ = gettext + +import _base +from html5lib.constants import cdataElements, rcdataElements, voidElements + +from html5lib.constants import spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class LintError(Exception): pass + +class Filter(_base.Filter): + def __iter__(self): + open_elements = [] + contentModelFlag = "PCDATA" + for token in _base.Filter.__iter__(self): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + name = token["name"] + if contentModelFlag != "PCDATA": + raise LintError(_("StartTag not in PCDATA content model flag: %s") % name) + if not isinstance(name, unicode): + raise LintError(_(u"Tag name is not a string: %r") % name) + if not name: + raise LintError(_(u"Empty tag name")) + if type == "StartTag" and name in voidElements: + raise LintError(_(u"Void element reported as StartTag token: %s") % name) + elif type == "EmptyTag" and name not in voidElements: + raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"]) + if type == "StartTag": + open_elements.append(name) + for attr_name, attr_value in token["data"]: # distinct names: `name` must stay the tag name for the content model checks below + if not isinstance(attr_name, unicode): + raise LintError(_("Attribute name is not a string: %r") % attr_name) + if not attr_name: + raise LintError(_(u"Empty attribute name")) + if not isinstance(attr_value, unicode): + raise LintError(_("Attribute value is not a string: %r") % attr_value) + if name in cdataElements: + contentModelFlag = "CDATA" + elif name in rcdataElements: + contentModelFlag = "RCDATA" + elif name == "plaintext": + contentModelFlag = "PLAINTEXT" + + elif type == 
"EndTag": + name = token["name"] + if not isinstance(name, unicode): + raise LintError(_(u"Tag name is not a string: %r") % name) + if not name: + raise LintError(_(u"Empty tag name")) + if name in voidElements: + raise LintError(_(u"Void element reported as EndTag token: %s") % name) + start_name = open_elements.pop() + if start_name != name: + raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name)) + contentModelFlag = "PCDATA" + + elif type == "Comment": + if contentModelFlag != "PCDATA": + raise LintError(_("Comment not in PCDATA content model flag")) + + elif type in ("Characters", "SpaceCharacters"): + data = token["data"] + if not isinstance(data, unicode): + raise LintError(_("Character data is not a string: %r") % data) + if not data: + raise LintError(_(u"%s token with empty data") % type) + if type == "SpaceCharacters": + data = data.strip(spaceCharacters) + if data: + raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: %r") % data) + + elif type == "Doctype": + name = token["name"] + if contentModelFlag != "PCDATA": + raise LintError(_("Doctype not in PCDATA content model flag: %s") % name) + if not isinstance(name, unicode): + raise LintError(_(u"Tag name is not a string: %r") % name) + if not name: + raise LintError(_(u"Empty tag name")) + # XXX: what to do with token["data"] ?
+ + elif type in ("ParseError", "SerializeError"): + pass + + else: + raise LintError(_(u"Unknown token type: %s") % type) + + yield token diff --git a/planet/vendor/html5lib/filters/optionaltags.py b/planet/vendor/html5lib/filters/optionaltags.py new file mode 100644 index 0000000..73da96c --- /dev/null +++ b/planet/vendor/html5lib/filters/optionaltags.py @@ -0,0 +1,176 @@ +import _base + +class Filter(_base.Filter): + def slider(self): + previous1 = previous2 = None + for token in self.source: + if previous1 is not None: + yield previous2, previous1, token + previous2 = previous1 + previous1 = token + if previous1 is not None: # an empty source has no final token to flush + yield previous2, previous1, None + + def __iter__(self): + for previous, token, next in self.slider(): + type = token["type"] + if type == "StartTag": + if token["data"] or not self.is_optional_start(token["name"], previous, next): + yield token + elif type == "EndTag": + if not self.is_optional_end(token["name"], next): + yield token + else: + yield token + + def is_optional_start(self, tagname, previous, next): + type = next and next["type"] or None + if tagname == 'html': + # An html element's start tag may be omitted if the first thing + # inside the html element is not a space character or a comment. + return type not in ("Comment", "SpaceCharacters") + elif tagname == 'head': + # A head element's start tag may be omitted if the first thing + # inside the head element is an element. + return type == "StartTag" + elif tagname == 'body': + # A body element's start tag may be omitted if the first thing + # inside the body element is not a space character or a comment, + # except if the first thing inside the body element is a script + # or style element and the node immediately preceding the body + # element is a head element whose end tag has been omitted.
+ if type in ("Comment", "SpaceCharacters"): + return False + elif type == "StartTag": + # XXX: we do not look at the preceding event, so we never omit + # the body element's start tag if it's followed by a script or + # a style element. + return next["name"] not in ('script', 'style') + else: + return True + elif tagname == 'colgroup': + # A colgroup element's start tag may be omitted if the first thing + # inside the colgroup element is a col element, and if the element + # is not immediately preceeded by another colgroup element whose + # end tag has been omitted. + if type == "StartTag": + # XXX: we do not look at the preceding event, so instead we never + # omit the colgroup element's end tag when it is immediately + # followed by another colgroup element. See is_optional_end. + return next["name"] == "col" + else: + return False + elif tagname == 'tbody': + # A tbody element's start tag may be omitted if the first thing + # inside the tbody element is a tr element, and if the element is + # not immediately preceeded by a tbody, thead, or tfoot element + # whose end tag has been omitted. + if type == "StartTag": + # omit the thead and tfoot elements' end tag when they are + # immediately followed by a tbody element. See is_optional_end. + if previous and previous['type'] == 'EndTag' and \ + previous['name'] in ('tbody','thead','tfoot'): + return False + return next["name"] == 'tr' + else: + return False + return False + + def is_optional_end(self, tagname, next): + type = next and next["type"] or None + if tagname in ('html', 'head', 'body'): + # An html element's end tag may be omitted if the html element + # is not immediately followed by a space character or a comment. + return type not in ("Comment", "SpaceCharacters") + elif tagname in ('li', 'optgroup', 'option', 'tr'): + # A li element's end tag may be omitted if the li element is + # immediately followed by another li element or if there is + # no more content in the parent element. 
+ # An optgroup element's end tag may be omitted if the optgroup + # element is immediately followed by another optgroup element, + # or if there is no more content in the parent element. + # An option element's end tag may be omitted if the option + # element is immediately followed by another option element, + # or if there is no more content in the parent element. + # A tr element's end tag may be omitted if the tr element is + # immediately followed by another tr element, or if there is + # no more content in the parent element. + if type == "StartTag": + return next["name"] == tagname + else: + return type == "EndTag" or type is None + elif tagname in ('dt', 'dd'): + # A dt element's end tag may be omitted if the dt element is + # immediately followed by another dt element or a dd element. + # A dd element's end tag may be omitted if the dd element is + # immediately followed by another dd element or a dt element, + # or if there is no more content in the parent element. + if type == "StartTag": + return next["name"] in ('dt', 'dd') + elif tagname == 'dd': + return type == "EndTag" or type is None + else: + return False + elif tagname == 'p': + # A p element's end tag may be omitted if the p element is + # immediately followed by an address, blockquote, dl, fieldset, + # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table, + # or ul element, or if there is no more content in the parent + # element. + if type == "StartTag": + return next["name"] in ('address', 'blockquote', \ + 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \ + 'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul') + else: + return type == "EndTag" or type is None + elif tagname == 'colgroup': + # A colgroup element's end tag may be omitted if the colgroup + # element is not immediately followed by a space character or + # a comment. 
+ if type in ("Comment", "SpaceCharacters"): + return False + elif type == "StartTag": + # XXX: we also look for an immediately following colgroup + # element. See is_optional_start. + return next["name"] != 'colgroup' + else: + return True + elif tagname in ('thead', 'tbody'): + # A thead element's end tag may be omitted if the thead element + # is immediately followed by a tbody or tfoot element. + # A tbody element's end tag may be omitted if the tbody element + # is immediately followed by a tbody or tfoot element, or if + # there is no more content in the parent element. + # A tfoot element's end tag may be omitted if the tfoot element + # is immediately followed by a tbody element, or if there is no + # more content in the parent element. + # XXX: we never omit the end tag when the following element is + # a tbody. See is_optional_start. + if type == "StartTag": + return next["name"] in ['tbody', 'tfoot'] + elif tagname == 'tbody': + return type == "EndTag" or type is None + else: + return False + elif tagname == 'tfoot': + # A tfoot element's end tag may be omitted if the tfoot element + # is immediately followed by a tbody element, or if there is no + # more content in the parent element. + # XXX: we never omit the end tag when the following element is + # a tbody. See is_optional_start. + if type == "StartTag": + return next["name"] == 'tbody' + else: + return type == "EndTag" or type is None + elif tagname in ('td', 'th'): + # A td element's end tag may be omitted if the td element is + # immediately followed by a td or th element, or if there is + # no more content in the parent element. + # A th element's end tag may be omitted if the th element is + # immediately followed by a td or th element, or if there is + # no more content in the parent element. 
+ if type == "StartTag": + return next["name"] in ('td', 'th') + else: + return type == "EndTag" or type is None + return False diff --git a/planet/vendor/html5lib/filters/whitespace.py b/planet/vendor/html5lib/filters/whitespace.py new file mode 100644 index 0000000..cb16325 --- /dev/null +++ b/planet/vendor/html5lib/filters/whitespace.py @@ -0,0 +1,38 @@ +try: + frozenset +except NameError: + # Import from the sets module for python 2.3 + from sets import ImmutableSet as frozenset + +import re + +import _base +from html5lib.constants import rcdataElements, spaceCharacters +spaceCharacters = u"".join(spaceCharacters) + +class Filter(_base.Filter): + + spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) + + def __iter__(self): + preserve = 0 + for token in _base.Filter.__iter__(self): + type = token["type"] + if type == "StartTag" \ + and (preserve or token["name"] in self.spacePreserveElements): + preserve += 1 + + elif type == "EndTag" and preserve: + preserve -= 1 + + elif not preserve and type == "SpaceCharacters": + continue + + elif not preserve and type == "Characters": + token["data"] = collapse_spaces(token["data"]) + + yield token + +def collapse_spaces(text): + return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text) + diff --git a/planet/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py similarity index 88% rename from planet/html5lib/html5parser.py rename to planet/vendor/html5lib/html5parser.py index 898ec9f..a3a7fd5 100644 --- a/planet/html5lib/html5parser.py +++ b/planet/vendor/html5lib/html5parser.py @@ -3,14 +3,14 @@ # * Phases and insertion modes are one concept in parser.py. # * EOF handling is slightly different to make sure , and # always exist. -# * We also deal with content when there's no DOCTYPE. -# It is expected that the specification will catch up with us in due course ;-) +# *
</br> creates a <br> element. +# +# We haven't updated DOCTYPE handling yet # # It should be trivial to add the following cases. However, we should probably # also look into comment handling and such then... # * A <p> element end tag creates an empty <p> element when there's no <p> # element in scope. -# * A <br> element end tag creates an empty <br>
element. try: frozenset @@ -20,6 +20,7 @@ except NameError: from sets import ImmutableSet as frozenset import gettext _ = gettext.gettext +import sys import tokenizer @@ -30,27 +31,32 @@ from treebuilders import simpletree import utils from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower from constants import scopingElements, formattingElements, specialElements -from constants import headingElements, tableInsertModeElements, voidElements +from constants import headingElements, tableInsertModeElements +from constants import cdataElements, rcdataElements, voidElements class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, strict = False, tree=simpletree.TreeBuilder): + def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer): """ strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be - returned. This class is almost always a subclass of - html5lib.treebuilders._base.TreeBuilder + returned. 
Built in treebuilders can be accessed through + html5lib.treebuilders.getTreeBuilder(treeType) """ # Raise an exception on the first error encountered self.strict = strict self.tree = tree() + self.tokenizer_class = tokenizer self.errors = [] + # "quirks" / "almost-standards" / "standards" + self.quirksMode = "standards" + self.phases = { "initial": InitialPhase(self, self.tree), "rootElement": RootElementPhase(self, self.tree), @@ -78,15 +84,15 @@ class HTMLParser(object): self.firstStartTag = False self.errors = [] - self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding, - parseMeta=innerHTML) + self.tokenizer = self.tokenizer_class(stream, encoding, + parseMeta=not innerHTML) if innerHTML: self.innerHTML = container.lower() - if self.innerHTML in ('title', 'textarea'): + if self.innerHTML in cdataElements: self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] - elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'): + elif self.innerHTML in rcdataElements: self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"] elif self.innerHTML == 'plaintext': self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"] @@ -113,10 +119,12 @@ class HTMLParser(object): method = getattr(self.phase, "process%s" % type, None) if type in ("Characters", "SpaceCharacters", "Comment"): method(token["data"]) - elif type in ("StartTag", "Doctype"): + elif type == "StartTag": method(token["name"], token["data"]) elif type == "EndTag": method(token["name"]) + elif type == "Doctype": + method(token["name"], token["publicId"], token["systemId"], token["correct"]) else: self.parseError(token["data"]) @@ -158,10 +166,6 @@ class HTMLParser(object): if self.strict: raise ParseError - def atheistParseError(self): - """This error is not an error""" - pass - def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ @@ -171,9 +175,7 @@ class HTMLParser(object): # element. 
If it matches a void element atheists did the wrong # thing and if it doesn't it's wrong for everyone. - if token["name"] in voidElements: - self.atheistParseError() - else: + if token["name"] not in voidElements: self.parseError(_("Solidus (/) incorrectly placed in tag.")) token["type"] = "StartTag" @@ -283,7 +285,7 @@ class Phase(object): # overridden. self.tree.insertComment(data, self.tree.openElements[-1]) - def processDoctype(self, name, error): + def processDoctype(self, name, publicId, systemId, correct): self.parser.parseError(_("Unexpected DOCTYPE. Ignored.")) def processSpaceCharacters(self, data): @@ -319,10 +321,101 @@ class InitialPhase(Phase): def processComment(self, data): self.tree.insertComment(data, self.tree.document) - def processDoctype(self, name, error): - if error: + def processDoctype(self, name, publicId, systemId, correct): + nameLower = name.translate(asciiUpper2Lower) + if nameLower != "html" or publicId != None or\ + systemId != None: self.parser.parseError(_("Erroneous DOCTYPE.")) + # XXX need to update DOCTYPE tokens self.tree.insertDoctype(name) + + if publicId == None: + publicId = "" + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if nameLower != "html": + # XXX quirks mode + pass + else: + if publicId in\ + ("+//silmaril//dtd html pro v0r11 19970101//en", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en", + "-//as//dtd html 3.0 aswedit + extensions//en", + "-//ietf//dtd html 2.0 level 1//en", + "-//ietf//dtd html 2.0 level 2//en", + "-//ietf//dtd html 2.0 strict level 1//en", + "-//ietf//dtd html 2.0 strict level 2//en", + "-//ietf//dtd html 2.0 strict//en", + "-//ietf//dtd html 2.0//en", + "-//ietf//dtd html 2.1e//en", + "-//ietf//dtd html 3.0//en", + "-//ietf//dtd html 3.0//en//", + "-//ietf//dtd html 3.2 final//en", + "-//ietf//dtd html 3.2//en", + "-//ietf//dtd html 3//en", + "-//ietf//dtd html level 0//en", + "-//ietf//dtd html level 0//en//2.0", + "-//ietf//dtd html level 1//en", + 
"-//ietf//dtd html level 1//en//2.0", + "-//ietf//dtd html level 2//en", + "-//ietf//dtd html level 2//en//2.0", + "-//ietf//dtd html level 3//en", + "-//ietf//dtd html level 3//en//3.0", + "-//ietf//dtd html strict level 0//en", + "-//ietf//dtd html strict level 0//en//2.0", + "-//ietf//dtd html strict level 1//en", + "-//ietf//dtd html strict level 1//en//2.0", + "-//ietf//dtd html strict level 2//en", + "-//ietf//dtd html strict level 2//en//2.0", + "-//ietf//dtd html strict level 3//en", + "-//ietf//dtd html strict level 3//en//3.0", + "-//ietf//dtd html strict//en", + "-//ietf//dtd html strict//en//2.0", + "-//ietf//dtd html strict//en//3.0", + "-//ietf//dtd html//en", + "-//ietf//dtd html//en//2.0", + "-//ietf//dtd html//en//3.0", + "-//metrius//dtd metrius presentational//en", + "-//microsoft//dtd internet explorer 2.0 html strict//en", + "-//microsoft//dtd internet explorer 2.0 html//en", + "-//microsoft//dtd internet explorer 2.0 tables//en", + "-//microsoft//dtd internet explorer 3.0 html strict//en", + "-//microsoft//dtd internet explorer 3.0 html//en", + "-//microsoft//dtd internet explorer 3.0 tables//en", + "-//netscape comm. corp.//dtd html//en", + "-//netscape comm. 
corp.//dtd strict html//en", + "-//o'reilly and associates//dtd html 2.0//en", + "-//o'reilly and associates//dtd html extended 1.0//en", + "-//spyglass//dtd html 2.0 extended//en", + "-//sq//dtd html 2.0 hotmetal + extensions//en", + "-//sun microsystems corp.//dtd hotjava html//en", + "-//sun microsystems corp.//dtd hotjava strict html//en", + "-//w3c//dtd html 3 1995-03-24//en", + "-//w3c//dtd html 3.2 draft//en", + "-//w3c//dtd html 3.2 final//en", + "-//w3c//dtd html 3.2//en", + "-//w3c//dtd html 3.2s draft//en", + "-//w3c//dtd html 4.0 frameset//en", + "-//w3c//dtd html 4.0 transitional//en", + "-//w3c//dtd html experimental 19960712//en", + "-//w3c//dtd html experimental 970421//en", + "-//w3c//dtd w3 html//en", + "-//w3o//dtd w3 html 3.0//en", + "-//w3o//dtd w3 html 3.0//en//", + "-//w3o//dtd w3 html strict 3.0//en//", + "-//webtechs//dtd mozilla html 2.0//en", + "-//webtechs//dtd mozilla html//en", + "-/w3c/dtd html 4.0 transitional/en", + "html")\ + or (publicId in\ + ("-//w3c//dtd html 4.01 frameset//en", + "-//w3c//dtd html 4.01 transitional//en") and systemId == None)\ + or (systemId != None and\ + systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): + #XXX quirks mode + pass + self.parser.phase = self.parser.phases["rootElement"] def processSpaceCharacters(self, data): @@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase): self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ - ("html", self.endTagHtml) + (("html", "head", "body", "br"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther @@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase): self.startTagHead("head", {}) self.parser.phase.processStartTag(name, attributes) - def endTagHtml(self, name): + def endTagImplyHead(self, name): self.startTagHead("head", {}) self.parser.phase.processEndTag(name) @@ -437,7 +530,7 @@ class InHeadPhase(Phase): self.
endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), - ("html", self.endTagHtml), + (("html", "body", "br"), self.endTagImplyAfterHead), (("title", "style", "script"), self.endTagTitleStyleScript) ]) self.endTagHandler.default = self.endTagOther @@ -499,7 +592,11 @@ class InHeadPhase(Phase): def startTagBaseLinkMeta(self, name, attributes): element = self.tree.createElement(name, attributes) - self.appendToHead(element) + if (self.tree.headPointer is not None and + self.parser.phase == self.parser.phases["inHead"]): + self.appendToHead(element) + else: + self.tree.openElements[-1].appendChild(element) def startTagOther(self, name, attributes): self.anythingElse() @@ -512,7 +609,7 @@ class InHeadPhase(Phase): self.parser.parseError(_(u"Unexpected end tag (head). Ignored.")) self.parser.phase = self.parser.phases["afterHead"] - def endTagHtml(self, name): + def endTagImplyAfterHead(self, name): self.anythingElse() self.parser.phase.processEndTag(name) @@ -592,9 +689,9 @@ class InBodyPhase(Phase): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), - (("script", "style"), self.startTagScriptStyle), - (("base", "link", "meta", "title"), - self.startTagFromHead), + (("base", "link", "meta", "script", "style"), + self.startTagProcessInHead), + ("title", self.startTagTitle), ("body", self.startTagBody), (("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "listing", "menu", "ol", "p", "pre", "ul"), @@ -604,8 +701,9 @@ class InBodyPhase(Phase): ("plaintext",self.startTagPlaintext), (headingElements, self.startTagHeading), ("a", self.startTagA), - (("b", "big", "em", "font", "i", "nobr", "s", "small", "strike", - "strong", "tt", "u"),self.startTagFormatting), + (("b", "big", "em", "font", "i", "s", "small", "strike", "strong", + "tt", "u"),self.startTagFormatting), + ("nobr", self.startTagNobr), ("button", self.startTagButton), (("marquee", "object"), self.startTagMarqueeObject), ("xmp", self.startTagXmp), @@ -642,7 
+740,8 @@ class InBodyPhase(Phase): (("head", "frameset", "select", "optgroup", "option", "table", "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr", "td", "th"), self.endTagMisplaced), - (("area", "basefont", "bgsound", "br", "embed", "hr", "image", + ("br", self.endTagBr), + (("area", "basefont", "bgsound", "embed", "hr", "image", "img", "input", "isindex", "param", "spacer", "wbr", "frame"), self.endTagNone), (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"), @@ -659,11 +758,13 @@ class InBodyPhase(Phase): self.tree.openElements[-1]) # the real deal - def processSpaceCharactersPre(self, data): - #Sometimes (start of

<pre> blocks) we want to drop leading newlines
+    def processSpaceCharactersDropNewline(self, data):
+        # Sometimes (start of 
 and