Update to latest html5lib; move packaged dependencies to vendor directory
This commit is contained in:
parent
65e41f7b22
commit
fc90da7fc0
@ -1,5 +1,5 @@
|
||||
import sys
|
||||
from planet import html5lib
|
||||
import html5lib
|
||||
tree=html5lib.treebuilders.dom.TreeBuilder
|
||||
parser = html5lib.html5parser.HTMLParser(tree=tree)
|
||||
document = parser.parse(sys.stdin)
|
||||
|
@ -23,8 +23,9 @@ from xml.sax.saxutils import escape
|
||||
from htmlentitydefs import entitydefs
|
||||
|
||||
import planet
|
||||
from planet import config, feedparser
|
||||
from planet import config
|
||||
from planet.spider import filename
|
||||
import feedparser
|
||||
log = planet.logger
|
||||
options = config.filter_options(sys.argv[0])
|
||||
|
||||
|
@ -32,7 +32,9 @@ def getLogger(level, format):
|
||||
loggerParms = (level,format)
|
||||
return logger
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__),'vendor'))
|
||||
|
||||
# Configure feed parser
|
||||
from planet import feedparser
|
||||
import feedparser
|
||||
feedparser.SANITIZE_HTML=0
|
||||
feedparser.RESOLVE_RELATIVE_URIS=0
|
||||
|
@ -1,42 +0,0 @@
|
||||
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to do
|
||||
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
_base.treebuilders.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing Node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
|
||||
import os.path
|
||||
__path__.append(os.path.dirname(__path__[0]))
|
||||
|
||||
import dom
|
||||
import simpletree
|
||||
|
||||
try:
|
||||
import etree
|
||||
except:
|
||||
pass
|
@ -1,5 +0,0 @@
|
||||
import etreefull
|
||||
|
||||
class TreeBuilder(etreefull.TreeBuilder):
|
||||
def getDocument(self):
|
||||
return self.document._element.find("html")
|
@ -1,227 +0,0 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
try:
|
||||
from elementtree import ElementTree
|
||||
except:
|
||||
pass
|
||||
|
||||
import _base
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name):
|
||||
self._element = ElementTree.Element(name)
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _setName(self, name):
|
||||
self._element.tag = name
|
||||
|
||||
def _getName(self):
|
||||
return self._element.tag
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
#Delete existing attributes first
|
||||
#XXX - there may be a better way to do this...
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
self._element.set(key, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or self._element.getchildren())
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self._element.getchildren().index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._element.remove(node._element)
|
||||
node.parent=None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
#Insert the text as the tail of the last child element
|
||||
if not self._element[-1].tail:
|
||||
self._element[-1].tail = ""
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
#Insert the text before the specified node
|
||||
children = self._element.getchildren()
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
if not self._element[index-1].tail:
|
||||
self._element[index-1].tail = ""
|
||||
self._element[index-1].tail += data
|
||||
else:
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = Element(self.name)
|
||||
element.attributes = self.attributes
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
if not newParent._element.text:
|
||||
newParent._element.text = ""
|
||||
if self._element.text is not None:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
#Use the superclass constructor to set all properties on the
|
||||
#wrapper element
|
||||
Element.__init__(self, None)
|
||||
self._element = ElementTree.Comment(data)
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name):
|
||||
Element.__init__(self, DocumentType)
|
||||
self._element.text = name
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, Document)
|
||||
|
||||
class DocumentFragment(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, DocumentFragment)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element, indent=0):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||
elif element.tag is Document:
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
elif element.tag is ElementTree.Comment:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag is Document:
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
elif element.tag is ElementTree.Comment:
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.document._element
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
@ -16,7 +16,8 @@ Todo:
|
||||
import re, time, md5, sgmllib
|
||||
from xml.sax.saxutils import escape
|
||||
from xml.dom import minidom, Node
|
||||
from planet.html5lib import liberalxmlparser, treebuilders
|
||||
from html5lib import liberalxmlparser
|
||||
from html5lib.treebuilders import dom
|
||||
import planet, config
|
||||
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -154,7 +155,7 @@ def content(xentry, name, detail, bozo):
|
||||
data = minidom.parseString(xdiv % detail.value).documentElement
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
else:
|
||||
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
||||
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
|
||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||
for body in html.documentElement.childNodes:
|
||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||
|
@ -1,6 +1,7 @@
|
||||
from xml.sax.saxutils import escape
|
||||
import sgmllib, time, os, sys, new, urlparse, re
|
||||
from planet import config, feedparser, htmltmpl
|
||||
from planet import config, feedparser
|
||||
import htmltmpl
|
||||
|
||||
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
|
||||
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
|
||||
|
@ -340,7 +340,7 @@ def spiderPlanet(only_if_new = False):
|
||||
log.info("Socket timeout set to %d seconds", timeout)
|
||||
except:
|
||||
try:
|
||||
from planet import timeoutsocket
|
||||
import timeoutsocket
|
||||
timeoutsocket.setDefaultSocketTimeout(float(timeout))
|
||||
log.info("Socket timeout set to %d seconds", timeout)
|
||||
except:
|
||||
|
@ -119,8 +119,8 @@ spaceCharacters = frozenset((
|
||||
tableInsertModeElements = frozenset((
|
||||
"table",
|
||||
"tbody",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"tr"
|
||||
))
|
||||
|
||||
@ -133,7 +133,7 @@ hexDigits = frozenset(string.hexdigits)
|
||||
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
|
||||
for c in string.ascii_uppercase])
|
||||
|
||||
# Heading elements need to be ordered
|
||||
# Heading elements need to be ordered
|
||||
headingElements = (
|
||||
"h1",
|
||||
"h2",
|
||||
@ -158,6 +158,38 @@ voidElements = frozenset((
|
||||
"input"
|
||||
))
|
||||
|
||||
cdataElements = frozenset(('title', 'textarea'))
|
||||
|
||||
rcdataElements = frozenset((
|
||||
'style',
|
||||
'script',
|
||||
'xmp',
|
||||
'iframe',
|
||||
'noembed',
|
||||
'noframes',
|
||||
'noscript'
|
||||
))
|
||||
|
||||
booleanAttributes = {
|
||||
"": frozenset(("irrelevant",)),
|
||||
"style": frozenset(("scoped",)),
|
||||
"img": frozenset(("ismap",)),
|
||||
"audio": frozenset(("autoplay","controls")),
|
||||
"video": frozenset(("autoplay","controls")),
|
||||
"script": frozenset(("defer", "async")),
|
||||
"details": frozenset(("open",)),
|
||||
"datagrid": frozenset(("multiple", "disabled")),
|
||||
"command": frozenset(("hidden", "disabled", "checked", "default")),
|
||||
"menu": frozenset(("autosubmit",)),
|
||||
"fieldset": frozenset(("disabled", "readonly")),
|
||||
"option": frozenset(("disabled", "readonly", "selected")),
|
||||
"optgroup": frozenset(("disabled", "readonly")),
|
||||
"button": frozenset(("disabled", "autofocus")),
|
||||
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
|
||||
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
|
||||
"output": frozenset(("disabled", "readonly")),
|
||||
}
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||
# therefore can't be a frozenset.
|
||||
entitiesWindows1252 = (
|
||||
@ -196,265 +228,372 @@ entitiesWindows1252 = (
|
||||
)
|
||||
|
||||
entities = {
|
||||
"AElig;": u"\u00C6",
|
||||
"AElig": u"\u00C6",
|
||||
"Aacute": u"\u00C1",
|
||||
"Acirc": u"\u00C2",
|
||||
"Agrave": u"\u00C0",
|
||||
"Alpha": u"\u0391",
|
||||
"Aring": u"\u00C5",
|
||||
"Atilde": u"\u00C3",
|
||||
"Auml": u"\u00C4",
|
||||
"Beta": u"\u0392",
|
||||
"Ccedil": u"\u00C7",
|
||||
"Chi": u"\u03A7",
|
||||
"Dagger": u"\u2021",
|
||||
"Delta": u"\u0394",
|
||||
"ETH": u"\u00D0",
|
||||
"Eacute": u"\u00C9",
|
||||
"Ecirc": u"\u00CA",
|
||||
"Egrave": u"\u00C8",
|
||||
"Epsilon": u"\u0395",
|
||||
"Eta": u"\u0397",
|
||||
"Euml": u"\u00CB",
|
||||
"Gamma": u"\u0393",
|
||||
"Iacute": u"\u00CD",
|
||||
"Icirc": u"\u00CE",
|
||||
"Igrave": u"\u00CC",
|
||||
"Iota": u"\u0399",
|
||||
"Iuml": u"\u00CF",
|
||||
"Kappa": u"\u039A",
|
||||
"Lambda": u"\u039B",
|
||||
"Mu": u"\u039C",
|
||||
"Ntilde": u"\u00D1",
|
||||
"Nu": u"\u039D",
|
||||
"OElig": u"\u0152",
|
||||
"Oacute": u"\u00D3",
|
||||
"Ocirc": u"\u00D4",
|
||||
"Ograve": u"\u00D2",
|
||||
"Omega": u"\u03A9",
|
||||
"Omicron": u"\u039F",
|
||||
"Oslash": u"\u00D8",
|
||||
"Otilde": u"\u00D5",
|
||||
"Ouml": u"\u00D6",
|
||||
"Phi": u"\u03A6",
|
||||
"Pi": u"\u03A0",
|
||||
"Prime": u"\u2033",
|
||||
"Psi": u"\u03A8",
|
||||
"Rho": u"\u03A1",
|
||||
"Scaron": u"\u0160",
|
||||
"Sigma": u"\u03A3",
|
||||
"THORN": u"\u00DE",
|
||||
"Tau": u"\u03A4",
|
||||
"Theta": u"\u0398",
|
||||
"Uacute": u"\u00DA",
|
||||
"Ucirc": u"\u00DB",
|
||||
"Ugrave": u"\u00D9",
|
||||
"Upsilon": u"\u03A5",
|
||||
"Uuml": u"\u00DC",
|
||||
"Xi": u"\u039E",
|
||||
"Yacute": u"\u00DD",
|
||||
"Yuml": u"\u0178",
|
||||
"Zeta": u"\u0396",
|
||||
"aacute": u"\u00E1",
|
||||
"acirc": u"\u00E2",
|
||||
"acute": u"\u00B4",
|
||||
"aelig": u"\u00E6",
|
||||
"agrave": u"\u00E0",
|
||||
"alefsym": u"\u2135",
|
||||
"alpha": u"\u03B1",
|
||||
"amp": u"\u0026",
|
||||
"AMP;": u"\u0026",
|
||||
"AMP": u"\u0026",
|
||||
"and": u"\u2227",
|
||||
"ang": u"\u2220",
|
||||
"apos": u"\u0027",
|
||||
"aring": u"\u00E5",
|
||||
"asymp": u"\u2248",
|
||||
"atilde": u"\u00E3",
|
||||
"auml": u"\u00E4",
|
||||
"bdquo": u"\u201E",
|
||||
"beta": u"\u03B2",
|
||||
"brvbar": u"\u00A6",
|
||||
"bull": u"\u2022",
|
||||
"cap": u"\u2229",
|
||||
"ccedil": u"\u00E7",
|
||||
"cedil": u"\u00B8",
|
||||
"cent": u"\u00A2",
|
||||
"chi": u"\u03C7",
|
||||
"circ": u"\u02C6",
|
||||
"clubs": u"\u2663",
|
||||
"cong": u"\u2245",
|
||||
"copy": u"\u00A9",
|
||||
"Aacute;": u"\u00C1",
|
||||
"Aacute": u"\u00C1",
|
||||
"Acirc;": u"\u00C2",
|
||||
"Acirc": u"\u00C2",
|
||||
"Agrave;": u"\u00C0",
|
||||
"Agrave": u"\u00C0",
|
||||
"Alpha;": u"\u0391",
|
||||
"Aring;": u"\u00C5",
|
||||
"Aring": u"\u00C5",
|
||||
"Atilde;": u"\u00C3",
|
||||
"Atilde": u"\u00C3",
|
||||
"Auml;": u"\u00C4",
|
||||
"Auml": u"\u00C4",
|
||||
"Beta;": u"\u0392",
|
||||
"COPY;": u"\u00A9",
|
||||
"COPY": u"\u00A9",
|
||||
"crarr": u"\u21B5",
|
||||
"cup": u"\u222A",
|
||||
"curren": u"\u00A4",
|
||||
"dArr": u"\u21D3",
|
||||
"dagger": u"\u2020",
|
||||
"darr": u"\u2193",
|
||||
"deg": u"\u00B0",
|
||||
"delta": u"\u03B4",
|
||||
"diams": u"\u2666",
|
||||
"divide": u"\u00F7",
|
||||
"eacute": u"\u00E9",
|
||||
"ecirc": u"\u00EA",
|
||||
"egrave": u"\u00E8",
|
||||
"empty": u"\u2205",
|
||||
"emsp": u"\u2003",
|
||||
"ensp": u"\u2002",
|
||||
"epsilon": u"\u03B5",
|
||||
"equiv": u"\u2261",
|
||||
"eta": u"\u03B7",
|
||||
"eth": u"\u00F0",
|
||||
"euml": u"\u00EB",
|
||||
"euro": u"\u20AC",
|
||||
"exist": u"\u2203",
|
||||
"fnof": u"\u0192",
|
||||
"forall": u"\u2200",
|
||||
"frac12": u"\u00BD",
|
||||
"frac14": u"\u00BC",
|
||||
"frac34": u"\u00BE",
|
||||
"frasl": u"\u2044",
|
||||
"gamma": u"\u03B3",
|
||||
"ge": u"\u2265",
|
||||
"gt": u"\u003E",
|
||||
"Ccedil;": u"\u00C7",
|
||||
"Ccedil": u"\u00C7",
|
||||
"Chi;": u"\u03A7",
|
||||
"Dagger;": u"\u2021",
|
||||
"Delta;": u"\u0394",
|
||||
"ETH;": u"\u00D0",
|
||||
"ETH": u"\u00D0",
|
||||
"Eacute;": u"\u00C9",
|
||||
"Eacute": u"\u00C9",
|
||||
"Ecirc;": u"\u00CA",
|
||||
"Ecirc": u"\u00CA",
|
||||
"Egrave;": u"\u00C8",
|
||||
"Egrave": u"\u00C8",
|
||||
"Epsilon;": u"\u0395",
|
||||
"Eta;": u"\u0397",
|
||||
"Euml;": u"\u00CB",
|
||||
"Euml": u"\u00CB",
|
||||
"GT;": u"\u003E",
|
||||
"GT": u"\u003E",
|
||||
"hArr": u"\u21D4",
|
||||
"harr": u"\u2194",
|
||||
"hearts": u"\u2665",
|
||||
"hellip": u"\u2026",
|
||||
"iacute": u"\u00ED",
|
||||
"icirc": u"\u00EE",
|
||||
"iexcl": u"\u00A1",
|
||||
"igrave": u"\u00EC",
|
||||
"image": u"\u2111",
|
||||
"infin": u"\u221E",
|
||||
"int": u"\u222B",
|
||||
"iota": u"\u03B9",
|
||||
"iquest": u"\u00BF",
|
||||
"isin": u"\u2208",
|
||||
"iuml": u"\u00EF",
|
||||
"kappa": u"\u03BA",
|
||||
"lArr": u"\u21D0",
|
||||
"lambda": u"\u03BB",
|
||||
"lang": u"\u2329",
|
||||
"laquo": u"\u00AB",
|
||||
"larr": u"\u2190",
|
||||
"lceil": u"\u2308",
|
||||
"ldquo": u"\u201C",
|
||||
"le": u"\u2264",
|
||||
"lfloor": u"\u230A",
|
||||
"lowast": u"\u2217",
|
||||
"loz": u"\u25CA",
|
||||
"lrm": u"\u200E",
|
||||
"lsaquo": u"\u2039",
|
||||
"lsquo": u"\u2018",
|
||||
"lt": u"\u003C",
|
||||
"Gamma;": u"\u0393",
|
||||
"Iacute;": u"\u00CD",
|
||||
"Iacute": u"\u00CD",
|
||||
"Icirc;": u"\u00CE",
|
||||
"Icirc": u"\u00CE",
|
||||
"Igrave;": u"\u00CC",
|
||||
"Igrave": u"\u00CC",
|
||||
"Iota;": u"\u0399",
|
||||
"Iuml;": u"\u00CF",
|
||||
"Iuml": u"\u00CF",
|
||||
"Kappa;": u"\u039A",
|
||||
"LT;": u"\u003C",
|
||||
"LT": u"\u003C",
|
||||
"macr": u"\u00AF",
|
||||
"mdash": u"\u2014",
|
||||
"micro": u"\u00B5",
|
||||
"middot": u"\u00B7",
|
||||
"minus": u"\u2212",
|
||||
"mu": u"\u03BC",
|
||||
"nabla": u"\u2207",
|
||||
"nbsp": u"\u00A0",
|
||||
"ndash": u"\u2013",
|
||||
"ne": u"\u2260",
|
||||
"ni": u"\u220B",
|
||||
"not": u"\u00AC",
|
||||
"notin": u"\u2209",
|
||||
"nsub": u"\u2284",
|
||||
"ntilde": u"\u00F1",
|
||||
"nu": u"\u03BD",
|
||||
"oacute": u"\u00F3",
|
||||
"ocirc": u"\u00F4",
|
||||
"oelig": u"\u0153",
|
||||
"ograve": u"\u00F2",
|
||||
"oline": u"\u203E",
|
||||
"omega": u"\u03C9",
|
||||
"omicron": u"\u03BF",
|
||||
"oplus": u"\u2295",
|
||||
"or": u"\u2228",
|
||||
"ordf": u"\u00AA",
|
||||
"ordm": u"\u00BA",
|
||||
"oslash": u"\u00F8",
|
||||
"otilde": u"\u00F5",
|
||||
"otimes": u"\u2297",
|
||||
"ouml": u"\u00F6",
|
||||
"para": u"\u00B6",
|
||||
"part": u"\u2202",
|
||||
"permil": u"\u2030",
|
||||
"perp": u"\u22A5",
|
||||
"phi": u"\u03C6",
|
||||
"pi": u"\u03C0",
|
||||
"piv": u"\u03D6",
|
||||
"plusmn": u"\u00B1",
|
||||
"pound": u"\u00A3",
|
||||
"prime": u"\u2032",
|
||||
"prod": u"\u220F",
|
||||
"prop": u"\u221D",
|
||||
"psi": u"\u03C8",
|
||||
"quot": u"\u0022",
|
||||
"Lambda;": u"\u039B",
|
||||
"Mu;": u"\u039C",
|
||||
"Ntilde;": u"\u00D1",
|
||||
"Ntilde": u"\u00D1",
|
||||
"Nu;": u"\u039D",
|
||||
"OElig;": u"\u0152",
|
||||
"Oacute;": u"\u00D3",
|
||||
"Oacute": u"\u00D3",
|
||||
"Ocirc;": u"\u00D4",
|
||||
"Ocirc": u"\u00D4",
|
||||
"Ograve;": u"\u00D2",
|
||||
"Ograve": u"\u00D2",
|
||||
"Omega;": u"\u03A9",
|
||||
"Omicron;": u"\u039F",
|
||||
"Oslash;": u"\u00D8",
|
||||
"Oslash": u"\u00D8",
|
||||
"Otilde;": u"\u00D5",
|
||||
"Otilde": u"\u00D5",
|
||||
"Ouml;": u"\u00D6",
|
||||
"Ouml": u"\u00D6",
|
||||
"Phi;": u"\u03A6",
|
||||
"Pi;": u"\u03A0",
|
||||
"Prime;": u"\u2033",
|
||||
"Psi;": u"\u03A8",
|
||||
"QUOT;": u"\u0022",
|
||||
"QUOT": u"\u0022",
|
||||
"rArr": u"\u21D2",
|
||||
"radic": u"\u221A",
|
||||
"rang": u"\u232A",
|
||||
"raquo": u"\u00BB",
|
||||
"rarr": u"\u2192",
|
||||
"rceil": u"\u2309",
|
||||
"rdquo": u"\u201D",
|
||||
"real": u"\u211C",
|
||||
"reg": u"\u00AE",
|
||||
"REG;": u"\u00AE",
|
||||
"REG": u"\u00AE",
|
||||
"rfloor": u"\u230B",
|
||||
"rho": u"\u03C1",
|
||||
"rlm": u"\u200F",
|
||||
"rsaquo": u"\u203A",
|
||||
"rsquo": u"\u2019",
|
||||
"sbquo": u"\u201A",
|
||||
"scaron": u"\u0161",
|
||||
"sdot": u"\u22C5",
|
||||
"Rho;": u"\u03A1",
|
||||
"Scaron;": u"\u0160",
|
||||
"Sigma;": u"\u03A3",
|
||||
"THORN;": u"\u00DE",
|
||||
"THORN": u"\u00DE",
|
||||
"TRADE;": u"\u2122",
|
||||
"Tau;": u"\u03A4",
|
||||
"Theta;": u"\u0398",
|
||||
"Uacute;": u"\u00DA",
|
||||
"Uacute": u"\u00DA",
|
||||
"Ucirc;": u"\u00DB",
|
||||
"Ucirc": u"\u00DB",
|
||||
"Ugrave;": u"\u00D9",
|
||||
"Ugrave": u"\u00D9",
|
||||
"Upsilon;": u"\u03A5",
|
||||
"Uuml;": u"\u00DC",
|
||||
"Uuml": u"\u00DC",
|
||||
"Xi;": u"\u039E",
|
||||
"Yacute;": u"\u00DD",
|
||||
"Yacute": u"\u00DD",
|
||||
"Yuml;": u"\u0178",
|
||||
"Zeta;": u"\u0396",
|
||||
"aacute;": u"\u00E1",
|
||||
"aacute": u"\u00E1",
|
||||
"acirc;": u"\u00E2",
|
||||
"acirc": u"\u00E2",
|
||||
"acute;": u"\u00B4",
|
||||
"acute": u"\u00B4",
|
||||
"aelig;": u"\u00E6",
|
||||
"aelig": u"\u00E6",
|
||||
"agrave;": u"\u00E0",
|
||||
"agrave": u"\u00E0",
|
||||
"alefsym;": u"\u2135",
|
||||
"alpha;": u"\u03B1",
|
||||
"amp;": u"\u0026",
|
||||
"amp": u"\u0026",
|
||||
"and;": u"\u2227",
|
||||
"ang;": u"\u2220",
|
||||
"apos;": u"\u0027",
|
||||
"aring;": u"\u00E5",
|
||||
"aring": u"\u00E5",
|
||||
"asymp;": u"\u2248",
|
||||
"atilde;": u"\u00E3",
|
||||
"atilde": u"\u00E3",
|
||||
"auml;": u"\u00E4",
|
||||
"auml": u"\u00E4",
|
||||
"bdquo;": u"\u201E",
|
||||
"beta;": u"\u03B2",
|
||||
"brvbar;": u"\u00A6",
|
||||
"brvbar": u"\u00A6",
|
||||
"bull;": u"\u2022",
|
||||
"cap;": u"\u2229",
|
||||
"ccedil;": u"\u00E7",
|
||||
"ccedil": u"\u00E7",
|
||||
"cedil;": u"\u00B8",
|
||||
"cedil": u"\u00B8",
|
||||
"cent;": u"\u00A2",
|
||||
"cent": u"\u00A2",
|
||||
"chi;": u"\u03C7",
|
||||
"circ;": u"\u02C6",
|
||||
"clubs;": u"\u2663",
|
||||
"cong;": u"\u2245",
|
||||
"copy;": u"\u00A9",
|
||||
"copy": u"\u00A9",
|
||||
"crarr;": u"\u21B5",
|
||||
"cup;": u"\u222A",
|
||||
"curren;": u"\u00A4",
|
||||
"curren": u"\u00A4",
|
||||
"dArr;": u"\u21D3",
|
||||
"dagger;": u"\u2020",
|
||||
"darr;": u"\u2193",
|
||||
"deg;": u"\u00B0",
|
||||
"deg": u"\u00B0",
|
||||
"delta;": u"\u03B4",
|
||||
"diams;": u"\u2666",
|
||||
"divide;": u"\u00F7",
|
||||
"divide": u"\u00F7",
|
||||
"eacute;": u"\u00E9",
|
||||
"eacute": u"\u00E9",
|
||||
"ecirc;": u"\u00EA",
|
||||
"ecirc": u"\u00EA",
|
||||
"egrave;": u"\u00E8",
|
||||
"egrave": u"\u00E8",
|
||||
"empty;": u"\u2205",
|
||||
"emsp;": u"\u2003",
|
||||
"ensp;": u"\u2002",
|
||||
"epsilon;": u"\u03B5",
|
||||
"equiv;": u"\u2261",
|
||||
"eta;": u"\u03B7",
|
||||
"eth;": u"\u00F0",
|
||||
"eth": u"\u00F0",
|
||||
"euml;": u"\u00EB",
|
||||
"euml": u"\u00EB",
|
||||
"euro;": u"\u20AC",
|
||||
"exist;": u"\u2203",
|
||||
"fnof;": u"\u0192",
|
||||
"forall;": u"\u2200",
|
||||
"frac12;": u"\u00BD",
|
||||
"frac12": u"\u00BD",
|
||||
"frac14;": u"\u00BC",
|
||||
"frac14": u"\u00BC",
|
||||
"frac34;": u"\u00BE",
|
||||
"frac34": u"\u00BE",
|
||||
"frasl;": u"\u2044",
|
||||
"gamma;": u"\u03B3",
|
||||
"ge;": u"\u2265",
|
||||
"gt;": u"\u003E",
|
||||
"gt": u"\u003E",
|
||||
"hArr;": u"\u21D4",
|
||||
"harr;": u"\u2194",
|
||||
"hearts;": u"\u2665",
|
||||
"hellip;": u"\u2026",
|
||||
"iacute;": u"\u00ED",
|
||||
"iacute": u"\u00ED",
|
||||
"icirc;": u"\u00EE",
|
||||
"icirc": u"\u00EE",
|
||||
"iexcl;": u"\u00A1",
|
||||
"iexcl": u"\u00A1",
|
||||
"igrave;": u"\u00EC",
|
||||
"igrave": u"\u00EC",
|
||||
"image;": u"\u2111",
|
||||
"infin;": u"\u221E",
|
||||
"int;": u"\u222B",
|
||||
"iota;": u"\u03B9",
|
||||
"iquest;": u"\u00BF",
|
||||
"iquest": u"\u00BF",
|
||||
"isin;": u"\u2208",
|
||||
"iuml;": u"\u00EF",
|
||||
"iuml": u"\u00EF",
|
||||
"kappa;": u"\u03BA",
|
||||
"lArr;": u"\u21D0",
|
||||
"lambda;": u"\u03BB",
|
||||
"lang;": u"\u3008",
|
||||
"laquo;": u"\u00AB",
|
||||
"laquo": u"\u00AB",
|
||||
"larr;": u"\u2190",
|
||||
"lceil;": u"\u2308",
|
||||
"ldquo;": u"\u201C",
|
||||
"le;": u"\u2264",
|
||||
"lfloor;": u"\u230A",
|
||||
"lowast;": u"\u2217",
|
||||
"loz;": u"\u25CA",
|
||||
"lrm;": u"\u200E",
|
||||
"lsaquo;": u"\u2039",
|
||||
"lsquo;": u"\u2018",
|
||||
"lt;": u"\u003C",
|
||||
"lt": u"\u003C",
|
||||
"macr;": u"\u00AF",
|
||||
"macr": u"\u00AF",
|
||||
"mdash;": u"\u2014",
|
||||
"micro;": u"\u00B5",
|
||||
"micro": u"\u00B5",
|
||||
"middot;": u"\u00B7",
|
||||
"middot": u"\u00B7",
|
||||
"minus;": u"\u2212",
|
||||
"mu;": u"\u03BC",
|
||||
"nabla;": u"\u2207",
|
||||
"nbsp;": u"\u00A0",
|
||||
"nbsp": u"\u00A0",
|
||||
"ndash;": u"\u2013",
|
||||
"ne;": u"\u2260",
|
||||
"ni;": u"\u220B",
|
||||
"not;": u"\u00AC",
|
||||
"not": u"\u00AC",
|
||||
"notin;": u"\u2209",
|
||||
"nsub;": u"\u2284",
|
||||
"ntilde;": u"\u00F1",
|
||||
"ntilde": u"\u00F1",
|
||||
"nu;": u"\u03BD",
|
||||
"oacute;": u"\u00F3",
|
||||
"oacute": u"\u00F3",
|
||||
"ocirc;": u"\u00F4",
|
||||
"ocirc": u"\u00F4",
|
||||
"oelig;": u"\u0153",
|
||||
"ograve;": u"\u00F2",
|
||||
"ograve": u"\u00F2",
|
||||
"oline;": u"\u203E",
|
||||
"omega;": u"\u03C9",
|
||||
"omicron;": u"\u03BF",
|
||||
"oplus;": u"\u2295",
|
||||
"or;": u"\u2228",
|
||||
"ordf;": u"\u00AA",
|
||||
"ordf": u"\u00AA",
|
||||
"ordm;": u"\u00BA",
|
||||
"ordm": u"\u00BA",
|
||||
"oslash;": u"\u00F8",
|
||||
"oslash": u"\u00F8",
|
||||
"otilde;": u"\u00F5",
|
||||
"otilde": u"\u00F5",
|
||||
"otimes;": u"\u2297",
|
||||
"ouml;": u"\u00F6",
|
||||
"ouml": u"\u00F6",
|
||||
"para;": u"\u00B6",
|
||||
"para": u"\u00B6",
|
||||
"part;": u"\u2202",
|
||||
"permil;": u"\u2030",
|
||||
"perp;": u"\u22A5",
|
||||
"phi;": u"\u03C6",
|
||||
"pi;": u"\u03C0",
|
||||
"piv;": u"\u03D6",
|
||||
"plusmn;": u"\u00B1",
|
||||
"plusmn": u"\u00B1",
|
||||
"pound;": u"\u00A3",
|
||||
"pound": u"\u00A3",
|
||||
"prime;": u"\u2032",
|
||||
"prod;": u"\u220F",
|
||||
"prop;": u"\u221D",
|
||||
"psi;": u"\u03C8",
|
||||
"quot;": u"\u0022",
|
||||
"quot": u"\u0022",
|
||||
"rArr;": u"\u21D2",
|
||||
"radic;": u"\u221A",
|
||||
"rang;": u"\u3009",
|
||||
"raquo;": u"\u00BB",
|
||||
"raquo": u"\u00BB",
|
||||
"rarr;": u"\u2192",
|
||||
"rceil;": u"\u2309",
|
||||
"rdquo;": u"\u201D",
|
||||
"real;": u"\u211C",
|
||||
"reg;": u"\u00AE",
|
||||
"reg": u"\u00AE",
|
||||
"rfloor;": u"\u230B",
|
||||
"rho;": u"\u03C1",
|
||||
"rlm;": u"\u200F",
|
||||
"rsaquo;": u"\u203A",
|
||||
"rsquo;": u"\u2019",
|
||||
"sbquo;": u"\u201A",
|
||||
"scaron;": u"\u0161",
|
||||
"sdot;": u"\u22C5",
|
||||
"sect;": u"\u00A7",
|
||||
"sect": u"\u00A7",
|
||||
"shy;": u"\u00AD",
|
||||
"shy": u"\u00AD",
|
||||
"sigma": u"\u03C3",
|
||||
"sigmaf": u"\u03C2",
|
||||
"sim": u"\u223C",
|
||||
"spades": u"\u2660",
|
||||
"sub": u"\u2282",
|
||||
"sube": u"\u2286",
|
||||
"sum": u"\u2211",
|
||||
"sup": u"\u2283",
|
||||
"sigma;": u"\u03C3",
|
||||
"sigmaf;": u"\u03C2",
|
||||
"sim;": u"\u223C",
|
||||
"spades;": u"\u2660",
|
||||
"sub;": u"\u2282",
|
||||
"sube;": u"\u2286",
|
||||
"sum;": u"\u2211",
|
||||
"sup1;": u"\u00B9",
|
||||
"sup1": u"\u00B9",
|
||||
"sup2;": u"\u00B2",
|
||||
"sup2": u"\u00B2",
|
||||
"sup3;": u"\u00B3",
|
||||
"sup3": u"\u00B3",
|
||||
"supe": u"\u2287",
|
||||
"sup;": u"\u2283",
|
||||
"supe;": u"\u2287",
|
||||
"szlig;": u"\u00DF",
|
||||
"szlig": u"\u00DF",
|
||||
"tau": u"\u03C4",
|
||||
"there4": u"\u2234",
|
||||
"theta": u"\u03B8",
|
||||
"thetasym": u"\u03D1",
|
||||
"thinsp": u"\u2009",
|
||||
"tau;": u"\u03C4",
|
||||
"there4;": u"\u2234",
|
||||
"theta;": u"\u03B8",
|
||||
"thetasym;": u"\u03D1",
|
||||
"thinsp;": u"\u2009",
|
||||
"thorn;": u"\u00FE",
|
||||
"thorn": u"\u00FE",
|
||||
"tilde": u"\u02DC",
|
||||
"tilde;": u"\u02DC",
|
||||
"times;": u"\u00D7",
|
||||
"times": u"\u00D7",
|
||||
"trade": u"\u2122",
|
||||
"uArr": u"\u21D1",
|
||||
"trade;": u"\u2122",
|
||||
"uArr;": u"\u21D1",
|
||||
"uacute;": u"\u00FA",
|
||||
"uacute": u"\u00FA",
|
||||
"uarr": u"\u2191",
|
||||
"uarr;": u"\u2191",
|
||||
"ucirc;": u"\u00FB",
|
||||
"ucirc": u"\u00FB",
|
||||
"ugrave;": u"\u00F9",
|
||||
"ugrave": u"\u00F9",
|
||||
"uml;": u"\u00A8",
|
||||
"uml": u"\u00A8",
|
||||
"upsih": u"\u03D2",
|
||||
"upsilon": u"\u03C5",
|
||||
"upsih;": u"\u03D2",
|
||||
"upsilon;": u"\u03C5",
|
||||
"uuml;": u"\u00FC",
|
||||
"uuml": u"\u00FC",
|
||||
"weierp": u"\u2118",
|
||||
"xi": u"\u03BE",
|
||||
"weierp;": u"\u2118",
|
||||
"xi;": u"\u03BE",
|
||||
"yacute;": u"\u00FD",
|
||||
"yacute": u"\u00FD",
|
||||
"yen;": u"\u00A5",
|
||||
"yen": u"\u00A5",
|
||||
"yuml;": u"\u00FF",
|
||||
"yuml": u"\u00FF",
|
||||
"zeta": u"\u03B6",
|
||||
"zwj": u"\u200D",
|
||||
"zwnj": u"\u200C"
|
||||
"zeta;": u"\u03B6",
|
||||
"zwj;": u"\u200D",
|
||||
"zwnj;": u"\u200C"
|
||||
}
|
||||
|
||||
encodings = frozenset((
|
0
planet/vendor/html5lib/filters/__init__.py
vendored
Normal file
0
planet/vendor/html5lib/filters/__init__.py
vendored
Normal file
10
planet/vendor/html5lib/filters/_base.py
vendored
Normal file
10
planet/vendor/html5lib/filters/_base.py
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
|
||||
class Filter(object):
|
||||
def __init__(self, source):
|
||||
self.source = source
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.source)
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.source, name)
|
63
planet/vendor/html5lib/filters/inject_meta_charset.py
vendored
Normal file
63
planet/vendor/html5lib/filters/inject_meta_charset.py
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __init__(self, source, encoding):
|
||||
_base.Filter.__init__(self, source)
|
||||
self.encoding = encoding
|
||||
|
||||
def __iter__(self):
|
||||
state = "pre_head"
|
||||
meta_found = (self.encoding is None)
|
||||
pending = []
|
||||
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["name"].lower() == "head":
|
||||
state = "in_head"
|
||||
|
||||
elif type == "EmptyTag":
|
||||
if token["name"].lower() == "meta":
|
||||
# replace charset with actual encoding
|
||||
has_http_equiv_content_type = False
|
||||
content_index = -1
|
||||
for i,(name,value) in enumerate(token["data"]):
|
||||
if name.lower() == 'charset':
|
||||
token["data"][i] = (u'charset', self.encoding)
|
||||
meta_found = True
|
||||
break
|
||||
elif name == 'http-equiv' and value.lower() == 'content-type':
|
||||
has_http_equiv_content_type = True
|
||||
elif name == 'content':
|
||||
content_index = i
|
||||
else:
|
||||
if has_http_equiv_content_type and content_index >= 0:
|
||||
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
|
||||
meta_found = True
|
||||
|
||||
elif token["name"].lower() == "head" and not meta_found:
|
||||
# insert meta into empty head
|
||||
yield {"type": "StartTag", "name": "head",
|
||||
"data": token["data"]}
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": [["charset", self.encoding]]}
|
||||
yield {"type": "EndTag", "name": "head"}
|
||||
meta_found = True
|
||||
continue
|
||||
|
||||
elif type == "EndTag":
|
||||
if token["name"].lower() == "head" and pending:
|
||||
# insert meta into head (if necessary) and flush pending queue
|
||||
yield pending.pop(0)
|
||||
if not meta_found:
|
||||
yield {"type": "EmptyTag", "name": "meta",
|
||||
"data": [["charset", self.encoding]]}
|
||||
while pending:
|
||||
yield pending.pop(0)
|
||||
meta_found = True
|
||||
state = "post_head"
|
||||
|
||||
if state == "in_head":
|
||||
pending.append(token)
|
||||
else:
|
||||
yield token
|
90
planet/vendor/html5lib/filters/lint.py
vendored
Normal file
90
planet/vendor/html5lib/filters/lint.py
vendored
Normal file
@ -0,0 +1,90 @@
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
import _base
|
||||
from html5lib.constants import cdataElements, rcdataElements, voidElements
|
||||
|
||||
from html5lib.constants import spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class LintError(Exception): pass
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __iter__(self):
|
||||
open_elements = []
|
||||
contentModelFlag = "PCDATA"
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if type == "StartTag" and name in voidElements:
|
||||
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
|
||||
elif type == "EmptyTag" and name not in voidElements:
|
||||
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
|
||||
if type == "StartTag":
|
||||
open_elements.append(name)
|
||||
for name, value in token["data"]:
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty attribute name"))
|
||||
if not isinstance(value, unicode):
|
||||
raise LintError(_("Attribute value is not a string: %r") % value)
|
||||
if name in cdataElements:
|
||||
contentModelFlag = "CDATA"
|
||||
elif name in rcdataElements:
|
||||
contentModelFlag = "RCDATA"
|
||||
elif name == "plaintext":
|
||||
contentModelFlag = "PLAINTEXT"
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
if name in voidElements:
|
||||
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
|
||||
start_name = open_elements.pop()
|
||||
if start_name != name:
|
||||
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
|
||||
contentModelFlag = "PCDATA"
|
||||
|
||||
elif type == "Comment":
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Comment not in PCDATA content model flag"))
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
if not isinstance(data, unicode):
|
||||
raise LintError(_("Attribute name is not a string: %r") % data)
|
||||
if not data:
|
||||
raise LintError(_(u"%s token with empty data") % type)
|
||||
if type == "SpaceCharacters":
|
||||
data = data.strip(spaceCharacters)
|
||||
if data:
|
||||
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
# XXX: what to do with token["data"] ?
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
pass
|
||||
|
||||
else:
|
||||
raise LintError(_(u"Unknown token type: %s") % type)
|
||||
|
||||
yield token
|
175
planet/vendor/html5lib/filters/optionaltags.py
vendored
Normal file
175
planet/vendor/html5lib/filters/optionaltags.py
vendored
Normal file
@ -0,0 +1,175 @@
|
||||
import _base
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def slider(self):
|
||||
previous1 = previous2 = None
|
||||
for token in self.source:
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, token
|
||||
previous2 = previous1
|
||||
previous1 = token
|
||||
yield previous2, previous1, None
|
||||
|
||||
def __iter__(self):
|
||||
for previous, token, next in self.slider():
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["data"] or not self.is_optional_start(token["name"], previous, next):
|
||||
yield token
|
||||
elif type == "EndTag":
|
||||
if not self.is_optional_end(token["name"], next):
|
||||
yield token
|
||||
else:
|
||||
yield token
|
||||
|
||||
def is_optional_start(self, tagname, previous, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in 'html':
|
||||
# An html element's start tag may be omitted if the first thing
|
||||
# inside the html element is not a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname == 'head':
|
||||
# A head element's start tag may be omitted if the first thing
|
||||
# inside the head element is an element.
|
||||
return type == "StartTag"
|
||||
elif tagname == 'body':
|
||||
# A body element's start tag may be omitted if the first thing
|
||||
# inside the body element is not a space character or a comment,
|
||||
# except if the first thing inside the body element is a script
|
||||
# or style element and the node immediately preceding the body
|
||||
# element is a head element whose end tag has been omitted.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we do not look at the preceding event, so we never omit
|
||||
# the body element's start tag if it's followed by a script or
|
||||
# a style element.
|
||||
return next["name"] not in ('script', 'style')
|
||||
else:
|
||||
return True
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's start tag may be omitted if the first thing
|
||||
# inside the colgroup element is a col element, and if the element
|
||||
# is not immediately preceeded by another colgroup element whose
|
||||
# end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
# XXX: we do not look at the preceding event, so instead we never
|
||||
# omit the colgroup element's end tag when it is immediately
|
||||
# followed by another colgroup element. See is_optional_end.
|
||||
return next["name"] == "col"
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tbody':
|
||||
# A tbody element's start tag may be omitted if the first thing
|
||||
# inside the tbody element is a tr element, and if the element is
|
||||
# not immediately preceeded by a tbody, thead, or tfoot element
|
||||
# whose end tag has been omitted.
|
||||
if type == "StartTag":
|
||||
# omit the thead and tfoot elements' end tag when they are
|
||||
# immediately followed by a tbody element. See is_optional_end.
|
||||
if previous and previous['type'] == 'EndTag' and \
|
||||
previous['name'] in ('tbody','thead','tfoot'):
|
||||
return False
|
||||
return next["name"] == 'tr'
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
def is_optional_end(self, tagname, next):
|
||||
type = next and next["type"] or None
|
||||
if tagname in ('html', 'head', 'body'):
|
||||
# An html element's end tag may be omitted if the html element
|
||||
# is not immediately followed by a space character or a comment.
|
||||
return type not in ("Comment", "SpaceCharacters")
|
||||
elif tagname in ('li', 'optgroup', 'option', 'tr'):
|
||||
# A li element's end tag may be omitted if the li element is
|
||||
# immediately followed by another li element or if there is
|
||||
# no more content in the parent element.
|
||||
# An optgroup element's end tag may be omitted if the optgroup
|
||||
# element is immediately followed by another optgroup element,
|
||||
# or if there is no more content in the parent element.
|
||||
# An option element's end tag may be omitted if the option
|
||||
# element is immediately followed by another option element,
|
||||
# or if there is no more content in the parent element.
|
||||
# A tr element's end tag may be omitted if the tr element is
|
||||
# immediately followed by another tr element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] == tagname
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('dt', 'dd'):
|
||||
# A dt element's end tag may be omitted if the dt element is
|
||||
# immediately followed by another dt element or a dd element.
|
||||
# A dd element's end tag may be omitted if the dd element is
|
||||
# immediately followed by another dd element or a dt element,
|
||||
# or if there is no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('dt', 'dd')
|
||||
elif tagname == 'dd':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'p':
|
||||
# A p element's end tag may be omitted if the p element is
|
||||
# immediately followed by an address, blockquote, dl, fieldset,
|
||||
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
|
||||
# or ul element, or if there is no more content in the parent
|
||||
# element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('address', 'blockquote', \
|
||||
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
|
||||
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname == 'colgroup':
|
||||
# A colgroup element's end tag may be omitted if the colgroup
|
||||
# element is not immediately followed by a space character or
|
||||
# a comment.
|
||||
if type in ("Comment", "SpaceCharacters"):
|
||||
return False
|
||||
elif type == "StartTag":
|
||||
# XXX: we also look for an immediately following colgroup
|
||||
# element. See is_optional_start.
|
||||
return next["name"] != 'colgroup'
|
||||
else:
|
||||
return True
|
||||
elif tagname in ('thead', 'tbody'):
|
||||
# A thead element's end tag may be omitted if the thead element
|
||||
# is immediately followed by a tbody or tfoot element.
|
||||
# A tbody element's end tag may be omitted if the tbody element
|
||||
# is immediately followed by a tbody or tfoot element, or if
|
||||
# there is no more content in the parent element.
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ['tbody', 'tfoot']
|
||||
elif tagname == 'tbody':
|
||||
return type == "EndTag" or type is None
|
||||
else:
|
||||
return False
|
||||
elif tagname == 'tfoot':
|
||||
# A tfoot element's end tag may be omitted if the tfoot element
|
||||
# is immediately followed by a tbody element, or if there is no
|
||||
# more content in the parent element.
|
||||
# XXX: we never omit the end tag when the following element is
|
||||
# a tbody. See is_optional_start.
|
||||
if type == "StartTag":
|
||||
return next["name"] == 'tbody'
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
elif tagname in ('td', 'th'):
|
||||
# A td element's end tag may be omitted if the td element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
# A th element's end tag may be omitted if the th element is
|
||||
# immediately followed by a td or th element, or if there is
|
||||
# no more content in the parent element.
|
||||
if type == "StartTag":
|
||||
return next["name"] in ('td', 'th')
|
||||
else:
|
||||
return type == "EndTag" or type is None
|
||||
return False
|
38
planet/vendor/html5lib/filters/whitespace.py
vendored
Normal file
38
planet/vendor/html5lib/filters/whitespace.py
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import re
|
||||
|
||||
import _base
|
||||
from html5lib.constants import rcdataElements, spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class Filter(_base.Filter):
|
||||
|
||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||
|
||||
def __iter__(self):
|
||||
preserve = 0
|
||||
for token in _base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag" \
|
||||
and (preserve or token["name"] in self.spacePreserveElements):
|
||||
preserve += 1
|
||||
|
||||
elif type == "EndTag" and preserve:
|
||||
preserve -= 1
|
||||
|
||||
elif not preserve and type == "SpaceCharacters":
|
||||
continue
|
||||
|
||||
elif not preserve and type == "Characters":
|
||||
token["data"] = collapse_spaces(token["data"])
|
||||
|
||||
yield token
|
||||
|
||||
def collapse_spaces(text):
|
||||
return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text)
|
||||
|
@ -3,14 +3,14 @@
|
||||
# * Phases and insertion modes are one concept in parser.py.
|
||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||
# always exist.
|
||||
# * We also deal with content when there's no DOCTYPE.
|
||||
# It is expected that the specification will catch up with us in due course ;-)
|
||||
# * </br> creates a <br> element.
|
||||
#
|
||||
# We haven't updated DOCTYPE handling yet
|
||||
#
|
||||
# It should be trivial to add the following cases. However, we should probably
|
||||
# also look into comment handling and such then...
|
||||
# * A <p> element end tag creates an empty <p> element when there's no <p>
|
||||
# element in scope.
|
||||
# * A <br> element end tag creates an empty <br> element.
|
||||
|
||||
try:
|
||||
frozenset
|
||||
@ -20,6 +20,7 @@ except NameError:
|
||||
from sets import ImmutableSet as frozenset
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
import sys
|
||||
|
||||
import tokenizer
|
||||
|
||||
@ -30,27 +31,32 @@ from treebuilders import simpletree
|
||||
import utils
|
||||
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
|
||||
from constants import scopingElements, formattingElements, specialElements
|
||||
from constants import headingElements, tableInsertModeElements, voidElements
|
||||
from constants import headingElements, tableInsertModeElements
|
||||
from constants import cdataElements, rcdataElements, voidElements
|
||||
|
||||
class HTMLParser(object):
|
||||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
||||
malformed) HTML"""
|
||||
|
||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
|
||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
|
||||
"""
|
||||
strict - raise an exception when a parse error is encountered
|
||||
|
||||
tree - a treebuilder class controlling the type of tree that will be
|
||||
returned. This class is almost always a subclass of
|
||||
html5lib.treebuilders._base.TreeBuilder
|
||||
returned. Built in treebuilders can be accessed through
|
||||
html5lib.treebuilders.getTreeBuilder(treeType)
|
||||
"""
|
||||
|
||||
# Raise an exception on the first error encountered
|
||||
self.strict = strict
|
||||
|
||||
self.tree = tree()
|
||||
self.tokenizer_class = tokenizer
|
||||
self.errors = []
|
||||
|
||||
# "quirks" / "almost-standards" / "standards"
|
||||
self.quirksMode = "standards"
|
||||
|
||||
self.phases = {
|
||||
"initial": InitialPhase(self, self.tree),
|
||||
"rootElement": RootElementPhase(self, self.tree),
|
||||
@ -78,15 +84,15 @@ class HTMLParser(object):
|
||||
self.firstStartTag = False
|
||||
self.errors = []
|
||||
|
||||
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
|
||||
parseMeta=innerHTML)
|
||||
self.tokenizer = self.tokenizer_class(stream, encoding,
|
||||
parseMeta=not innerHTML)
|
||||
|
||||
if innerHTML:
|
||||
self.innerHTML = container.lower()
|
||||
|
||||
if self.innerHTML in ('title', 'textarea'):
|
||||
if self.innerHTML in cdataElements:
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
|
||||
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
|
||||
elif self.innerHTML in rcdataElements:
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
|
||||
elif self.innerHTML == 'plaintext':
|
||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
|
||||
@ -113,10 +119,12 @@ class HTMLParser(object):
|
||||
method = getattr(self.phase, "process%s" % type, None)
|
||||
if type in ("Characters", "SpaceCharacters", "Comment"):
|
||||
method(token["data"])
|
||||
elif type in ("StartTag", "Doctype"):
|
||||
elif type == "StartTag":
|
||||
method(token["name"], token["data"])
|
||||
elif type == "EndTag":
|
||||
method(token["name"])
|
||||
elif type == "Doctype":
|
||||
method(token["name"], token["publicId"], token["systemId"], token["correct"])
|
||||
else:
|
||||
self.parseError(token["data"])
|
||||
|
||||
@ -158,10 +166,6 @@ class HTMLParser(object):
|
||||
if self.strict:
|
||||
raise ParseError
|
||||
|
||||
def atheistParseError(self):
|
||||
"""This error is not an error"""
|
||||
pass
|
||||
|
||||
def normalizeToken(self, token):
|
||||
""" HTML5 specific normalizations to the token stream """
|
||||
|
||||
@ -171,9 +175,7 @@ class HTMLParser(object):
|
||||
# element. If it matches a void element atheists did the wrong
|
||||
# thing and if it doesn't it's wrong for everyone.
|
||||
|
||||
if token["name"] in voidElements:
|
||||
self.atheistParseError()
|
||||
else:
|
||||
if token["name"] not in voidElements:
|
||||
self.parseError(_("Solidus (/) incorrectly placed in tag."))
|
||||
|
||||
token["type"] = "StartTag"
|
||||
@ -283,7 +285,7 @@ class Phase(object):
|
||||
# overridden.
|
||||
self.tree.insertComment(data, self.tree.openElements[-1])
|
||||
|
||||
def processDoctype(self, name, error):
|
||||
def processDoctype(self, name, publicId, systemId, correct):
|
||||
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
@ -319,10 +321,101 @@ class InitialPhase(Phase):
|
||||
def processComment(self, data):
|
||||
self.tree.insertComment(data, self.tree.document)
|
||||
|
||||
def processDoctype(self, name, error):
|
||||
if error:
|
||||
def processDoctype(self, name, publicId, systemId, correct):
|
||||
nameLower = name.translate(asciiUpper2Lower)
|
||||
if nameLower != "html" or publicId != None or\
|
||||
systemId != None:
|
||||
self.parser.parseError(_("Erroneous DOCTYPE."))
|
||||
# XXX need to update DOCTYPE tokens
|
||||
self.tree.insertDoctype(name)
|
||||
|
||||
if publicId == None:
|
||||
publicId = ""
|
||||
if publicId != "":
|
||||
publicId = publicId.translate(asciiUpper2Lower)
|
||||
|
||||
if nameLower != "html":
|
||||
# XXX quirks mode
|
||||
pass
|
||||
else:
|
||||
if publicId in\
|
||||
("+//silmaril//dtd html pro v0r11 19970101//en",
|
||||
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
|
||||
"-//as//dtd html 3.0 aswedit + extensions//en",
|
||||
"-//ietf//dtd html 2.0 level 1//en",
|
||||
"-//ietf//dtd html 2.0 level 2//en",
|
||||
"-//ietf//dtd html 2.0 strict level 1//en",
|
||||
"-//ietf//dtd html 2.0 strict level 2//en",
|
||||
"-//ietf//dtd html 2.0 strict//en",
|
||||
"-//ietf//dtd html 2.0//en",
|
||||
"-//ietf//dtd html 2.1e//en",
|
||||
"-//ietf//dtd html 3.0//en",
|
||||
"-//ietf//dtd html 3.0//en//",
|
||||
"-//ietf//dtd html 3.2 final//en",
|
||||
"-//ietf//dtd html 3.2//en",
|
||||
"-//ietf//dtd html 3//en",
|
||||
"-//ietf//dtd html level 0//en",
|
||||
"-//ietf//dtd html level 0//en//2.0",
|
||||
"-//ietf//dtd html level 1//en",
|
||||
"-//ietf//dtd html level 1//en//2.0",
|
||||
"-//ietf//dtd html level 2//en",
|
||||
"-//ietf//dtd html level 2//en//2.0",
|
||||
"-//ietf//dtd html level 3//en",
|
||||
"-//ietf//dtd html level 3//en//3.0",
|
||||
"-//ietf//dtd html strict level 0//en",
|
||||
"-//ietf//dtd html strict level 0//en//2.0",
|
||||
"-//ietf//dtd html strict level 1//en",
|
||||
"-//ietf//dtd html strict level 1//en//2.0",
|
||||
"-//ietf//dtd html strict level 2//en",
|
||||
"-//ietf//dtd html strict level 2//en//2.0",
|
||||
"-//ietf//dtd html strict level 3//en",
|
||||
"-//ietf//dtd html strict level 3//en//3.0",
|
||||
"-//ietf//dtd html strict//en",
|
||||
"-//ietf//dtd html strict//en//2.0",
|
||||
"-//ietf//dtd html strict//en//3.0",
|
||||
"-//ietf//dtd html//en",
|
||||
"-//ietf//dtd html//en//2.0",
|
||||
"-//ietf//dtd html//en//3.0",
|
||||
"-//metrius//dtd metrius presentational//en",
|
||||
"-//microsoft//dtd internet explorer 2.0 html strict//en",
|
||||
"-//microsoft//dtd internet explorer 2.0 html//en",
|
||||
"-//microsoft//dtd internet explorer 2.0 tables//en",
|
||||
"-//microsoft//dtd internet explorer 3.0 html strict//en",
|
||||
"-//microsoft//dtd internet explorer 3.0 html//en",
|
||||
"-//microsoft//dtd internet explorer 3.0 tables//en",
|
||||
"-//netscape comm. corp.//dtd html//en",
|
||||
"-//netscape comm. corp.//dtd strict html//en",
|
||||
"-//o'reilly and associates//dtd html 2.0//en",
|
||||
"-//o'reilly and associates//dtd html extended 1.0//en",
|
||||
"-//spyglass//dtd html 2.0 extended//en",
|
||||
"-//sq//dtd html 2.0 hotmetal + extensions//en",
|
||||
"-//sun microsystems corp.//dtd hotjava html//en",
|
||||
"-//sun microsystems corp.//dtd hotjava strict html//en",
|
||||
"-//w3c//dtd html 3 1995-03-24//en",
|
||||
"-//w3c//dtd html 3.2 draft//en",
|
||||
"-//w3c//dtd html 3.2 final//en",
|
||||
"-//w3c//dtd html 3.2//en",
|
||||
"-//w3c//dtd html 3.2s draft//en",
|
||||
"-//w3c//dtd html 4.0 frameset//en",
|
||||
"-//w3c//dtd html 4.0 transitional//en",
|
||||
"-//w3c//dtd html experimental 19960712//en",
|
||||
"-//w3c//dtd html experimental 970421//en",
|
||||
"-//w3c//dtd w3 html//en",
|
||||
"-//w3o//dtd w3 html 3.0//en",
|
||||
"-//w3o//dtd w3 html 3.0//en//",
|
||||
"-//w3o//dtd w3 html strict 3.0//en//",
|
||||
"-//webtechs//dtd mozilla html 2.0//en",
|
||||
"-//webtechs//dtd mozilla html//en",
|
||||
"-/w3c/dtd html 4.0 transitional/en",
|
||||
"html")\
|
||||
or (publicId in\
|
||||
("-//w3c//dtd html 4.01 frameset//EN",
|
||||
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
|
||||
or (systemId != None and\
|
||||
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
|
||||
#XXX quirks mode
|
||||
pass
|
||||
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase):
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
("html", self.endTagHtml)
|
||||
(("html", "head", "body", "br"), self.endTagImplyHead)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase):
|
||||
self.startTagHead("head", {})
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def endTagHtml(self, name):
|
||||
def endTagImplyHead(self, name):
|
||||
self.startTagHead("head", {})
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@ -437,7 +530,7 @@ class InHeadPhase(Phase):
|
||||
|
||||
self. endTagHandler = utils.MethodDispatcher([
|
||||
("head", self.endTagHead),
|
||||
("html", self.endTagHtml),
|
||||
(("html", "body", "br"), self.endTagImplyAfterHead),
|
||||
(("title", "style", "script"), self.endTagTitleStyleScript)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
@ -499,7 +592,11 @@ class InHeadPhase(Phase):
|
||||
|
||||
def startTagBaseLinkMeta(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
self.appendToHead(element)
|
||||
if (self.tree.headPointer is not None and
|
||||
self.parser.phase == self.parser.phases["inHead"]):
|
||||
self.appendToHead(element)
|
||||
else:
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.anythingElse()
|
||||
@ -512,7 +609,7 @@ class InHeadPhase(Phase):
|
||||
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
|
||||
self.parser.phase = self.parser.phases["afterHead"]
|
||||
|
||||
def endTagHtml(self, name):
|
||||
def endTagImplyAfterHead(self, name):
|
||||
self.anythingElse()
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@ -592,9 +689,9 @@ class InBodyPhase(Phase):
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("script", "style"), self.startTagScriptStyle),
|
||||
(("base", "link", "meta", "title"),
|
||||
self.startTagFromHead),
|
||||
(("base", "link", "meta", "script", "style"),
|
||||
self.startTagProcessInHead),
|
||||
("title", self.startTagTitle),
|
||||
("body", self.startTagBody),
|
||||
(("address", "blockquote", "center", "dir", "div", "dl",
|
||||
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
|
||||
@ -604,8 +701,9 @@ class InBodyPhase(Phase):
|
||||
("plaintext",self.startTagPlaintext),
|
||||
(headingElements, self.startTagHeading),
|
||||
("a", self.startTagA),
|
||||
(("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
|
||||
"strong", "tt", "u"),self.startTagFormatting),
|
||||
(("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
|
||||
"tt", "u"),self.startTagFormatting),
|
||||
("nobr", self.startTagNobr),
|
||||
("button", self.startTagButton),
|
||||
(("marquee", "object"), self.startTagMarqueeObject),
|
||||
("xmp", self.startTagXmp),
|
||||
@ -642,7 +740,8 @@ class InBodyPhase(Phase):
|
||||
(("head", "frameset", "select", "optgroup", "option", "table",
|
||||
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
|
||||
"td", "th"), self.endTagMisplaced),
|
||||
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
|
||||
("br", self.endTagBr),
|
||||
(("area", "basefont", "bgsound", "embed", "hr", "image",
|
||||
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
|
||||
self.endTagNone),
|
||||
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
|
||||
@ -659,11 +758,13 @@ class InBodyPhase(Phase):
|
||||
self.tree.openElements[-1])
|
||||
|
||||
# the real deal
|
||||
def processSpaceCharactersPre(self, data):
|
||||
#Sometimes (start of <pre> blocks) we want to drop leading newlines
|
||||
def processSpaceCharactersDropNewline(self, data):
|
||||
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
|
||||
# leading newlines
|
||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
|
||||
and not self.tree.openElements[-1].hasContent()):
|
||||
if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
|
||||
or self.tree.openElements[-1].name == "textarea")
|
||||
and not self.tree.openElements[-1].hasContent()):
|
||||
data = data[1:]
|
||||
if data:
|
||||
self.tree.insertText(data)
|
||||
@ -675,10 +776,10 @@ class InBodyPhase(Phase):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(data)
|
||||
|
||||
def startTagScriptStyle(self, name, attributes):
|
||||
def startTagProcessInHead(self, name, attributes):
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
|
||||
def startTagFromHead(self, name, attributes):
|
||||
def startTagTitle(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
") that belongs in the head. Moved."))
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
@ -698,7 +799,7 @@ class InBodyPhase(Phase):
|
||||
self.endTagP("p")
|
||||
self.tree.insertElement(name, attributes)
|
||||
if name == "pre":
|
||||
self.processSpaceCharacters = self.processSpaceCharactersPre
|
||||
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
||||
|
||||
def startTagForm(self, name, attributes):
|
||||
if self.tree.formPointer:
|
||||
@ -717,9 +818,16 @@ class InBodyPhase(Phase):
|
||||
# AT Use reversed in Python 2.4...
|
||||
for i, node in enumerate(self.tree.openElements[::-1]):
|
||||
if node.name in stopName:
|
||||
poppedNodes = []
|
||||
for j in range(i+1):
|
||||
self.tree.openElements.pop()
|
||||
poppedNodes.append(self.tree.openElements.pop())
|
||||
if i >= 1:
|
||||
self.parser.parseError("Missing end tag%s (%s)"%
|
||||
(i > 1 and "s" or "",
|
||||
", ".join([item.name for item in
|
||||
poppedNodes[:-1]])))
|
||||
break
|
||||
|
||||
|
||||
# Phrasing elements are all non special, non scoping, non
|
||||
# formatting elements
|
||||
@ -738,14 +846,16 @@ class InBodyPhase(Phase):
|
||||
def startTagHeading(self, name, attributes):
|
||||
if self.tree.elementInScope("p"):
|
||||
self.endTagP("p")
|
||||
for item in headingElements:
|
||||
if self.tree.elementInScope(item):
|
||||
self.parser.parseError(_("Unexpected start tag (" + name +\
|
||||
")."))
|
||||
item = self.tree.openElements.pop()
|
||||
while item.name not in headingElements:
|
||||
item = self.tree.openElements.pop()
|
||||
break
|
||||
# Uncomment the following for IE7 behavior:
|
||||
#
|
||||
#for item in headingElements:
|
||||
# if self.tree.elementInScope(item):
|
||||
# self.parser.parseError(_("Unexpected start tag (" + name +\
|
||||
# ")."))
|
||||
# item = self.tree.openElements.pop()
|
||||
# while item.name not in headingElements:
|
||||
# item = self.tree.openElements.pop()
|
||||
# break
|
||||
self.tree.insertElement(name, attributes)
|
||||
|
||||
def startTagA(self, name, attributes):
|
||||
@ -765,6 +875,12 @@ class InBodyPhase(Phase):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.addFormattingElement(name, attributes)
|
||||
|
||||
def startTagNobr(self, name, attributes):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
if self.tree.elementInScope("nobr"):
|
||||
self.processEndTag("nobr")
|
||||
self.addFormattingElement(name, attributes)
|
||||
|
||||
def startTagButton(self, name, attributes):
|
||||
if self.tree.elementInScope("button"):
|
||||
self.parser.parseError(_("Unexpected start tag (button) implied "
|
||||
@ -840,6 +956,7 @@ class InBodyPhase(Phase):
|
||||
# XXX Form element pointer checking here as well...
|
||||
self.tree.insertElement(name, attributes)
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
|
||||
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
||||
|
||||
def startTagCdata(self, name, attributes):
|
||||
"""iframe, noembed noframes, noscript(if scripting enabled)"""
|
||||
@ -861,11 +978,13 @@ class InBodyPhase(Phase):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Ignored."))
|
||||
|
||||
def startTagNew(self, name, other):
|
||||
def startTagNew(self, name, attributes):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
"article", "aside", "header", "footer", "datagrid", "command"
|
||||
"""
|
||||
raise NotImplementedError
|
||||
sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
|
||||
self.startTagOther(name, attributes)
|
||||
#raise NotImplementedError
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
@ -1082,6 +1201,12 @@ class InBodyPhase(Phase):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u"). Ignored."))
|
||||
|
||||
def endTagBr(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertElement(name, {})
|
||||
self.tree.openElements.pop()
|
||||
|
||||
def endTagNone(self, name):
|
||||
# This handles elements with no end tag.
|
||||
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
|
||||
@ -1097,7 +1222,9 @@ class InBodyPhase(Phase):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
"article", "aside", "header", "footer", "datagrid", "command"
|
||||
"""
|
||||
raise NotImplementedError
|
||||
sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name)
|
||||
self.endTagOther(name)
|
||||
#raise NotImplementedError
|
||||
|
||||
def endTagOther(self, name):
|
||||
# XXX This logic should be moved into the treebuilder
|
||||
@ -1222,10 +1349,10 @@ class InTablePhase(Phase):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
|
||||
u"table context caused voodoo mode."))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.parser.insertFromTable = True
|
||||
self.tree.insertFromTable = True
|
||||
# Process the end tag in the "in body" mode
|
||||
self.parser.phases["inBody"].processEndTag(name)
|
||||
self.parser.insertFromTable = False
|
||||
self.tree.insertFromTable = False
|
||||
|
||||
|
||||
class InCaptionPhase(Phase):
|
||||
@ -1699,7 +1826,7 @@ class AfterBodyPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
# XXX We should prolly add a handler for "html" here as well...
|
||||
# XXX We should prolly add a handler for here as well...
|
||||
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
@ -31,15 +31,17 @@ class HTMLInputStream(object):
|
||||
|
||||
"""
|
||||
# List of where new lines occur
|
||||
self.newLines = []
|
||||
self.newLines = [0]
|
||||
|
||||
# Raw Stream
|
||||
# Raw Stream
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Encoding Information
|
||||
#Number of bytes to use when looking for a meta element with
|
||||
#encoding information
|
||||
self.numBytesMeta = 512
|
||||
#Number of bytes to use when using detecting encoding using chardet
|
||||
self.numBytesChardet = 100
|
||||
#Encoding to use if no other information can be found
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
@ -48,20 +50,12 @@ class HTMLInputStream(object):
|
||||
encoding = self.detectEncoding(parseMeta, chardet)
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||
|
||||
# Normalize new ipythonlines and null characters
|
||||
uString = re.sub('\r\n?', '\n', uString)
|
||||
uString = re.sub('\x00', u'\uFFFD', uString)
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
self.dataStream = uString
|
||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
||||
|
||||
self.queue = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
self.reset()
|
||||
self.line = self.col = 0
|
||||
self.lineLengths = []
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
@ -74,6 +68,8 @@ class HTMLInputStream(object):
|
||||
stream = source
|
||||
else:
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
if isinstance(source, unicode):
|
||||
source = source.encode('utf-8')
|
||||
import cStringIO
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
return stream
|
||||
@ -90,10 +86,18 @@ class HTMLInputStream(object):
|
||||
#Guess with chardet, if avaliable
|
||||
if encoding is None and chardet:
|
||||
try:
|
||||
import chardet
|
||||
buffer = self.rawStream.read()
|
||||
encoding = chardet.detect(buffer)['encoding']
|
||||
self.rawStream = self.openStream(buffer)
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
buffers = []
|
||||
detector = UniversalDetector()
|
||||
while not detector.done:
|
||||
buffer = self.rawStream.read(self.numBytesChardet)
|
||||
if not buffer:
|
||||
break
|
||||
buffers.append(buffer)
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = detector.result['encoding']
|
||||
self.seek("".join(buffers), 0)
|
||||
except ImportError:
|
||||
pass
|
||||
# If all else fails use the default encoding
|
||||
@ -119,60 +123,83 @@ class HTMLInputStream(object):
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
self.rawStream.seek(0)
|
||||
string = self.rawStream.read(4)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
# Need to detect UTF-32 before UTF-16
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
|
||||
#AT - move this to the caller?
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
self.seek(string, encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def seek(self, buffer, n):
|
||||
"""Unget buffer[n:]"""
|
||||
if hasattr(self.rawStream, 'unget'):
|
||||
self.rawStream.unget(buffer[n:])
|
||||
return
|
||||
|
||||
if hasattr(self.rawStream, 'seek'):
|
||||
try:
|
||||
self.rawStream.seek(n)
|
||||
return
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
class BufferedStream:
|
||||
def __init__(self, data, stream):
|
||||
self.data = data
|
||||
self.stream = stream
|
||||
def read(self, chars=-1):
|
||||
if chars == -1 or chars > len(self.data):
|
||||
result = self.data
|
||||
self.data = ''
|
||||
if chars == -1:
|
||||
return result + self.stream.read()
|
||||
else:
|
||||
return result + self.stream.read(chars-len(result))
|
||||
elif not self.data:
|
||||
return self.stream.read(chars)
|
||||
else:
|
||||
result = self.data[:chars]
|
||||
self.data = self.data[chars:]
|
||||
return result
|
||||
def unget(self, data):
|
||||
if self.data:
|
||||
self.data += data
|
||||
else:
|
||||
self.data = data
|
||||
|
||||
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
"""
|
||||
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
|
||||
self.rawStream.seek(0)
|
||||
buffer = self.rawStream.read(self.numBytesMeta)
|
||||
parser = EncodingParser(buffer)
|
||||
self.seek(buffer, 0)
|
||||
return parser.getEncoding()
|
||||
|
||||
def determineNewLines(self):
|
||||
# Looks through the stream to find where new lines occur so
|
||||
# the position method can tell where it is.
|
||||
self.newLines.append(0)
|
||||
for i in xrange(len(self.dataStream)):
|
||||
if self.dataStream[i] == u"\n":
|
||||
self.newLines.append(i)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
# Generate list of new lines first time around
|
||||
if not self.newLines:
|
||||
self.determineNewLines()
|
||||
|
||||
line = 0
|
||||
tell = self.tell
|
||||
for pos in self.newLines:
|
||||
if pos < tell:
|
||||
line += 1
|
||||
line, col = self.line, self.col
|
||||
for c in self.queue[::-1]:
|
||||
if c == '\n':
|
||||
line -= 1
|
||||
assert col == 0
|
||||
col = self.lineLengths[line]
|
||||
else:
|
||||
break
|
||||
col = tell - self.newLines[line-1] - 1
|
||||
return (line, col)
|
||||
|
||||
def reset(self):
|
||||
"""Resets the position in the stream back to the start."""
|
||||
self.tell = 0
|
||||
col -= 1
|
||||
return (line + 1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
@ -181,12 +208,28 @@ class HTMLInputStream(object):
|
||||
if self.queue:
|
||||
return self.queue.pop(0)
|
||||
else:
|
||||
try:
|
||||
self.tell += 1
|
||||
return self.dataStream[self.tell - 1]
|
||||
except:
|
||||
c = self.dataStream.read(1, 1)
|
||||
if not c:
|
||||
self.col += 1
|
||||
return EOF
|
||||
|
||||
# Normalize newlines and null characters
|
||||
if c == '\x00': c = u'\uFFFD'
|
||||
if c == '\r':
|
||||
c = self.dataStream.read(1, 1)
|
||||
if c != '\n':
|
||||
self.queue.insert(0, unicode(c))
|
||||
c = '\n'
|
||||
|
||||
# update position in stream
|
||||
if c == '\n':
|
||||
self.lineLengths.append(self.col)
|
||||
self.line += 1
|
||||
self.col = 0
|
||||
else:
|
||||
self.col += 1
|
||||
return unicode(c)
|
||||
|
||||
def charsUntil(self, characters, opposite = False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in characters or EOF. characters can be
|
||||
@ -194,23 +237,20 @@ class HTMLInputStream(object):
|
||||
"""
|
||||
charStack = [self.char()]
|
||||
|
||||
# First from the queue
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
||||
and self.queue:
|
||||
charStack.append(self.queue.pop(0))
|
||||
|
||||
# Then the rest
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||
try:
|
||||
self.tell += 1
|
||||
charStack.append(self.dataStream[self.tell - 1])
|
||||
except:
|
||||
charStack.append(EOF)
|
||||
charStack.append(self.char())
|
||||
|
||||
# Put the character stopped on back to the front of the queue
|
||||
# from where it came.
|
||||
self.queue.insert(0, charStack.pop())
|
||||
return "".join(charStack)
|
||||
c = charStack.pop()
|
||||
if c != EOF:
|
||||
self.queue.insert(0, c)
|
||||
|
||||
return u"".join(charStack)
|
||||
|
||||
def unget(self, chars):
|
||||
if chars:
|
||||
self.queue = list(chars) + self.queue
|
||||
|
||||
class EncodingBytes(str):
|
||||
"""String-like object with an assosiated position and various extra methods
|
@ -15,10 +15,13 @@ References:
|
||||
"""
|
||||
|
||||
import html5parser
|
||||
from constants import voidElements
|
||||
from constants import voidElements, contentModelFlags
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from xml.dom import XHTML_NAMESPACE
|
||||
from xml.sax.saxutils import unescape
|
||||
|
||||
class XMLParser(html5parser.HTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
@ -45,6 +48,11 @@ class XMLParser(html5parser.HTMLParser):
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
|
||||
elif token["type"] == "Characters":
|
||||
# un-escape rcdataElements (e.g. style, script)
|
||||
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
|
||||
token["data"] = unescape(token["data"])
|
||||
|
||||
elif token["type"] == "Comment":
|
||||
# Rescue CDATA from the comments
|
||||
if (token["data"].startswith("[CDATA[") and
|
||||
@ -66,16 +74,21 @@ class XHTMLParser(XMLParser):
|
||||
|
||||
# ensure that non-void XHTML elements have content so that separate
|
||||
# open and close tags are emitted
|
||||
if token["type"] == "EndTag" and \
|
||||
token["name"] not in voidElements and \
|
||||
token["name"] == self.tree.openElements[-1].name and \
|
||||
not self.tree.openElements[-1].hasContent():
|
||||
for e in self.tree.openElements:
|
||||
if 'xmlns' in e.attributes.keys():
|
||||
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
|
||||
break
|
||||
if token["type"] == "EndTag":
|
||||
if token["name"] in voidElements:
|
||||
if not self.tree.openElements or \
|
||||
self.tree.openElements[-1].name != token["name"]:
|
||||
token["type"] = "EmptyTag"
|
||||
if not token.has_key("data"): token["data"] = {}
|
||||
else:
|
||||
self.tree.insertText('')
|
||||
if token["name"] == self.tree.openElements[-1].name and \
|
||||
not self.tree.openElements[-1].hasContent():
|
||||
for e in self.tree.openElements:
|
||||
if 'xmlns' in e.attributes.keys():
|
||||
if e.attributes['xmlns'] != XHTML_NAMESPACE:
|
||||
break
|
||||
else:
|
||||
self.tree.insertText('')
|
||||
|
||||
return token
|
||||
|
189
planet/vendor/html5lib/sanitizer.py
vendored
Normal file
189
planet/vendor/html5lib/sanitizer.py
vendored
Normal file
@ -0,0 +1,189 @@
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
from tokenizer import HTMLTokenizer
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
||||
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
||||
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
||||
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
||||
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
||||
'ul', 'var']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
||||
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
||||
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
||||
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
||||
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
||||
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
||||
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
||||
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
||||
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
|
||||
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
|
||||
'xml:lang']
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||
'font-family', 'font-size', 'font-stretch', 'font-style',
|
||||
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
||||
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
||||
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
||||
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
|
||||
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
|
||||
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
|
||||
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
||||
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
|
||||
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
|
||||
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
|
||||
'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
|
||||
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
|
||||
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||
'xlink:href', 'xml:base']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
|
||||
allowed_css_properties = acceptable_css_properties
|
||||
allowed_css_keywords = acceptable_css_keywords
|
||||
allowed_svg_properties = acceptable_svg_properties
|
||||
allowed_protocols = acceptable_protocols
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||
if token["name"] in self.allowed_elements:
|
||||
if token.has_key("data"):
|
||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if not attrs.has_key(attr): continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||
del attrs[attr]
|
||||
if attrs.has_key('style'):
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||
yield token
|
||||
else:
|
||||
if token["type"] == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token["type"] == "EmptyTag":
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
token["type"] = "Characters"
|
||||
del token["name"]
|
||||
yield token
|
||||
elif token["type"] == "Comment":
|
||||
pass
|
||||
else:
|
||||
yield token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
|
||||
|
||||
# gauntlet
|
||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
||||
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
|
||||
|
||||
clean = []
|
||||
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
||||
if not value: continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
|
||||
for keyword in value.split():
|
||||
if not keyword in self.acceptable_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
||||
break
|
||||
else:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
|
3
planet/vendor/html5lib/serializer/__init__.py
vendored
Normal file
3
planet/vendor/html5lib/serializer/__init__.py
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
|
||||
from htmlserializer import HTMLSerializer
|
||||
from xhtmlserializer import XHTMLSerializer
|
216
planet/vendor/html5lib/serializer/htmlserializer.py
vendored
Normal file
216
planet/vendor/html5lib/serializer/htmlserializer.py
vendored
Normal file
@ -0,0 +1,216 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.filters.whitespace import Filter as WhitespaceFilter
|
||||
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
|
||||
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements
|
||||
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
try:
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
except ImportError:
|
||||
unicode_encode_errors = "strict"
|
||||
else:
|
||||
unicode_encode_errors = "htmlentityreplace"
|
||||
|
||||
from html5lib.constants import entities
|
||||
|
||||
encode_entity_map = {}
|
||||
for k, v in entities.items():
|
||||
if v != "&" and encode_entity_map.get(v) != k.lower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
encode_entity_map[v] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
for c in ex.object[exc.start:exc.end]:
|
||||
c = encode_entity_map.get(c)
|
||||
if c:
|
||||
res.append("&")
|
||||
res.append(c)
|
||||
res.append(";")
|
||||
else:
|
||||
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
||||
return (u"".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
|
||||
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||
|
||||
del register_error
|
||||
|
||||
def encode(text, encoding):
|
||||
return text.encode(encoding, unicode_encode_errors)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
quote_attr_values = False
|
||||
quote_char = '"'
|
||||
use_best_quote_char = True
|
||||
minimize_boolean_attributes = True
|
||||
|
||||
use_trailing_solidus = False
|
||||
space_before_trailing_solidus = True
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
|
||||
omit_optional_tags = True
|
||||
|
||||
strip_whitespace = False
|
||||
|
||||
inject_meta_charset = True
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||
"space_before_trailing_solidus", "omit_optional_tags",
|
||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||
"escape_rcdata")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs.has_key('quote_char'):
|
||||
self.use_best_quote_char = False
|
||||
for attr in self.options:
|
||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||
self.errors = []
|
||||
self.strict = False
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
if encoding and self.inject_meta_charset:
|
||||
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
|
||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiently of this latter filter
|
||||
if self.strip_whitespace:
|
||||
treewalker = WhitespaceFilter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
treewalker = OptionalTagFilter(treewalker)
|
||||
for token in treewalker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
doctype = u"<!DOCTYPE %s>" % token["name"]
|
||||
if encoding:
|
||||
yield doctype.encode(encoding)
|
||||
else:
|
||||
yield doctype
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
if type == "SpaceCharacters" or in_cdata:
|
||||
if in_cdata and token["data"].find("</") >= 0:
|
||||
self.serializeError(_("Unexpected </ in CDATA"))
|
||||
if encoding:
|
||||
yield token["data"].encode(encoding, "strict")
|
||||
else:
|
||||
yield token["data"]
|
||||
elif encoding:
|
||||
yield encode(escape(token["data"]), encoding)
|
||||
else:
|
||||
yield escape(token["data"])
|
||||
|
||||
elif type in ("StartTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
if name in rcdataElements and not self.escape_rcdata:
|
||||
in_cdata = True
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
attrs = token["data"]
|
||||
if hasattr(attrs, "items"):
|
||||
attrs = attrs.items()
|
||||
attrs.sort()
|
||||
attributes = []
|
||||
for k,v in attrs:
|
||||
if encoding:
|
||||
k = k.encode(encoding, "strict")
|
||||
attributes.append(' ')
|
||||
|
||||
attributes.append(k)
|
||||
if not self.minimize_boolean_attributes or \
|
||||
(k not in booleanAttributes.get(name, tuple()) \
|
||||
and k not in booleanAttributes.get("", tuple())):
|
||||
attributes.append("=")
|
||||
if self.quote_attr_values or not v:
|
||||
quote_attr = True
|
||||
else:
|
||||
quote_attr = reduce(lambda x,y: x or (y in v),
|
||||
spaceCharacters + "<>\"'", False)
|
||||
v = v.replace("&", "&")
|
||||
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
||||
if encoding:
|
||||
v = encode(v, encoding)
|
||||
if quote_attr:
|
||||
quote_char = self.quote_char
|
||||
if self.use_best_quote_char:
|
||||
if "'" in v and '"' not in v:
|
||||
quote_char = '"'
|
||||
elif '"' in v and "'" not in v:
|
||||
quote_char = "'"
|
||||
if quote_char == "'":
|
||||
v = v.replace("'", "'")
|
||||
else:
|
||||
v = v.replace('"', """)
|
||||
attributes.append(quote_char)
|
||||
attributes.append(v)
|
||||
attributes.append(quote_char)
|
||||
else:
|
||||
attributes.append(v)
|
||||
if name in voidElements and self.use_trailing_solidus:
|
||||
if self.space_before_trailing_solidus:
|
||||
attributes.append(" /")
|
||||
else:
|
||||
attributes.append("/")
|
||||
if encoding:
|
||||
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
|
||||
else:
|
||||
yield u"<%s%s>" % (name, u"".join(attributes))
|
||||
|
||||
elif type == "EndTag":
|
||||
name = token["name"]
|
||||
if name in rcdataElements:
|
||||
in_cdata = False
|
||||
elif in_cdata:
|
||||
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||
end_tag = u"</%s>" % name
|
||||
if encoding:
|
||||
end_tag = end_tag.encode(encoding, "strict")
|
||||
yield end_tag
|
||||
|
||||
elif type == "Comment":
|
||||
data = token["data"]
|
||||
if data.find("--") >= 0:
|
||||
self.serializeError(_("Comment contains --"))
|
||||
comment = u"<!--%s-->" % token["data"]
|
||||
if encoding:
|
||||
comment = comment.encode(encoding, unicode_encode_errors)
|
||||
yield comment
|
||||
|
||||
else:
|
||||
self.serializeError(token["data"])
|
||||
|
||||
def render(self, treewalker, encoding=None):
|
||||
if encoding:
|
||||
return "".join(list(self.serialize(treewalker, encoding)))
|
||||
else:
|
||||
return u"".join(list(self.serialize(treewalker)))
|
||||
|
||||
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||
# XXX The idea is to make data mandatory.
|
||||
self.errors.append(data)
|
||||
if self.strict:
|
||||
raise SerializeError
|
||||
|
||||
def SerializeError(Exception):
|
||||
"""Error in serialized tree"""
|
||||
pass
|
9
planet/vendor/html5lib/serializer/xhtmlserializer.py
vendored
Normal file
9
planet/vendor/html5lib/serializer/xhtmlserializer.py
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
from htmlserializer import HTMLSerializer
|
||||
|
||||
class XHTMLSerializer(HTMLSerializer):
|
||||
quote_attr_values = True
|
||||
minimize_boolean_attributes = False
|
||||
use_trailing_solidus = True
|
||||
escape_lt_in_attrs = True
|
||||
omit_optional_tags = False
|
||||
escape_rcdata = True
|
@ -9,7 +9,7 @@ _ = gettext.gettext
|
||||
|
||||
from constants import contentModelFlags, spaceCharacters
|
||||
from constants import entitiesWindows1252, entities
|
||||
from constants import asciiLowercase, asciiLetters
|
||||
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
|
||||
from constants import digits, hexDigits, EOF
|
||||
|
||||
from inputstream import HTMLInputStream
|
||||
@ -50,18 +50,30 @@ class HTMLTokenizer(object):
|
||||
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
|
||||
"bogusComment":self.bogusCommentState,
|
||||
"markupDeclarationOpen":self.markupDeclarationOpenState,
|
||||
"commentStart":self.commentStartState,
|
||||
"commentStartDash":self.commentStartDashState,
|
||||
"comment":self.commentState,
|
||||
"commentDash":self.commentDashState,
|
||||
"commentEndDash":self.commentEndDashState,
|
||||
"commentEnd":self.commentEndState,
|
||||
"doctype":self.doctypeState,
|
||||
"beforeDoctypeName":self.beforeDoctypeNameState,
|
||||
"doctypeName":self.doctypeNameState,
|
||||
"afterDoctypeName":self.afterDoctypeNameState,
|
||||
"beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
|
||||
"doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
|
||||
"doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
|
||||
"afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
|
||||
"beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
|
||||
"doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
|
||||
"doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
|
||||
"afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
|
||||
"bogusDoctype":self.bogusDoctypeState
|
||||
}
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
self.escapeFlag = False
|
||||
self.lastFourChars = []
|
||||
self.state = self.states["data"]
|
||||
|
||||
# The current token being created
|
||||
@ -77,7 +89,6 @@ class HTMLTokenizer(object):
|
||||
to return we yield the token which pauses processing until the next token
|
||||
is requested.
|
||||
"""
|
||||
self.stream.reset()
|
||||
self.tokenQueue = []
|
||||
# Start processing. When EOF is reached self.state will return False
|
||||
# instead of True and the loop will terminate.
|
||||
@ -102,7 +113,7 @@ class HTMLTokenizer(object):
|
||||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
self.stream.queue.append(data)
|
||||
self.stream.unget(data)
|
||||
|
||||
def consumeNumberEntity(self, isHex):
|
||||
"""This function returns either U+FFFD or the character based on the
|
||||
@ -132,70 +143,71 @@ class HTMLTokenizer(object):
|
||||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = int("".join(charStack), radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if 127 < charAsInt < 160:
|
||||
#XXX - removed parse error from windows 1252 entity for now
|
||||
#we may want to reenable this later
|
||||
#self.tokenQueue.append({"type": "ParseError", "data":
|
||||
# _("Entity used with illegal number (windows-1252 reference).")})
|
||||
if charAsInt == 13:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||
charAsInt = 10
|
||||
elif 127 < charAsInt < 160:
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||
# and smaller) we need to do the "windows trick".
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||
|
||||
# 0 is not a good number.
|
||||
if charAsInt == 0:
|
||||
charAsInt = 65533
|
||||
|
||||
try:
|
||||
# XXX We should have a separate function that does "int" to
|
||||
# "unicodestring" conversion since this doesn't always work
|
||||
# according to hsivonen. Also, unichr has a limitation of 65535
|
||||
char = unichr(charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
# 0 is not a good number, neither are illegal Unicode code points.
|
||||
if charAsInt > 0 and charAsInt <= 1114111:
|
||||
try:
|
||||
# XXX We should have a separate function that does "int" to
|
||||
# "unicodestring" conversion since this doesn't always work
|
||||
# according to hsivonen. Also, unichr has a limitation of 65535
|
||||
char = unichr(charAsInt)
|
||||
except:
|
||||
try:
|
||||
char = eval("u'\\U%08x'" % charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
if c != u";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
self.stream.queue.append(c)
|
||||
self.stream.unget(c)
|
||||
|
||||
return char
|
||||
|
||||
def consumeEntity(self):
|
||||
def consumeEntity(self, fromAttribute=False):
|
||||
char = None
|
||||
charStack = [self.stream.char()]
|
||||
if charStack[0] == u"#":
|
||||
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
|
||||
self.stream.unget(charStack)
|
||||
elif charStack[0] == u"#":
|
||||
# We might have a number entity here.
|
||||
charStack.extend([self.stream.char(), self.stream.char()])
|
||||
if EOF in charStack:
|
||||
# If we reach the end of the file put everything up to EOF
|
||||
# back in the queue
|
||||
charStack = charStack[:charStack.index(EOF)]
|
||||
self.stream.queue.extend(charStack)
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else:
|
||||
if charStack[1].lower() == u"x" \
|
||||
and charStack[2] in hexDigits:
|
||||
# Hexadecimal entity detected.
|
||||
self.stream.queue.append(charStack[2])
|
||||
self.stream.unget(charStack[2])
|
||||
char = self.consumeNumberEntity(True)
|
||||
elif charStack[1] in digits:
|
||||
# Decimal entity detected.
|
||||
self.stream.queue.extend(charStack[1:])
|
||||
self.stream.unget(charStack[1:])
|
||||
char = self.consumeNumberEntity(False)
|
||||
else:
|
||||
# No number entity detected.
|
||||
self.stream.queue.extend(charStack)
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected but none found.")})
|
||||
# Break out if we reach the end of the file
|
||||
elif charStack[0] == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Entity expected. Got end of file instead.")})
|
||||
else:
|
||||
# At this point in the process might have named entity. Entities
|
||||
# are stored in the global variable "entities".
|
||||
@ -216,7 +228,8 @@ class HTMLTokenizer(object):
|
||||
# that may match an entity
|
||||
entityName = None
|
||||
|
||||
# Try to find the longest entity the string will match
|
||||
# Try to find the longest entity the string will match to take care
|
||||
# of ¬i for instance.
|
||||
for entityLength in xrange(len(charStack)-1,1,-1):
|
||||
possibleEntityName = "".join(charStack[:entityLength])
|
||||
if possibleEntityName in entities:
|
||||
@ -224,24 +237,26 @@ class HTMLTokenizer(object):
|
||||
break
|
||||
|
||||
if entityName is not None:
|
||||
char = entities[entityName]
|
||||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";":
|
||||
if entityName[-1] != ";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity didn't end with ';'.")})
|
||||
self.stream.queue.extend(charStack[entityLength:])
|
||||
if entityName[-1] != ";" and fromAttribute and \
|
||||
(charStack[entityLength] in asciiLetters
|
||||
or charStack[entityLength] in digits):
|
||||
self.stream.unget(charStack)
|
||||
else:
|
||||
char = entities[entityName]
|
||||
self.stream.unget(charStack[entityLength:])
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity expected. Got none.")})
|
||||
self.stream.queue.extend(charStack)
|
||||
self.stream.unget(charStack)
|
||||
return char
|
||||
|
||||
def processEntityInAttribute(self):
|
||||
"""This method replaces the need for "entityInAttributeValueState".
|
||||
"""
|
||||
entity = self.consumeEntity()
|
||||
entity = self.consumeEntity(True)
|
||||
if entity:
|
||||
self.currentToken["data"][-1][1] += entity
|
||||
else:
|
||||
@ -266,12 +281,30 @@ class HTMLTokenizer(object):
|
||||
|
||||
def dataState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"&" and self.contentModelFlag in\
|
||||
if self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
|
||||
if len(self.lastFourChars) == 4:
|
||||
self.lastFourChars.pop(0)
|
||||
self.lastFourChars.append(data)
|
||||
if data == "&" and self.contentModelFlag in\
|
||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
|
||||
self.state = self.states["entityData"]
|
||||
elif data == u"<" and self.contentModelFlag !=\
|
||||
contentModelFlags["PLAINTEXT"]:
|
||||
elif data == "-" and self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||
self.escapeFlag == False and\
|
||||
"".join(self.lastFourChars) == "<!--":
|
||||
self.escapeFlag = True
|
||||
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||
elif data == "<" and (self.contentModelFlag ==\
|
||||
contentModelFlags["PCDATA"] or (self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||
self.escapeFlag == False)):
|
||||
self.state = self.states["tagOpen"]
|
||||
elif data == ">" and self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
|
||||
self.escapeFlag = False
|
||||
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||
elif data == EOF:
|
||||
# Tokenization ends.
|
||||
return False
|
||||
@ -285,7 +318,7 @@ class HTMLTokenizer(object):
|
||||
data + self.stream.charsUntil(spaceCharacters, True)})
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data":
|
||||
data + self.stream.charsUntil((u"&", u"<"))})
|
||||
data + self.stream.charsUntil(("&", "<", ">", "-"))})
|
||||
return True
|
||||
|
||||
def entityDataState(self):
|
||||
@ -321,14 +354,14 @@ class HTMLTokenizer(object):
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't "
|
||||
"support processing instructions).")})
|
||||
self.stream.queue.append(data)
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got something else instead")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.append(data)
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# We know the content model flag is set to either RCDATA or CDATA
|
||||
@ -338,7 +371,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["closeTagOpen"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.insert(0, data)
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["data"]
|
||||
return True
|
||||
|
||||
@ -361,7 +394,7 @@ class HTMLTokenizer(object):
|
||||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
self.stream.queue.extend(charStack)
|
||||
self.stream.unget(charStack)
|
||||
|
||||
if self.currentToken \
|
||||
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
||||
@ -372,8 +405,6 @@ class HTMLTokenizer(object):
|
||||
# emitting the end tag token.
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag after seeing '</'. None found.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
|
||||
@ -381,27 +412,25 @@ class HTMLTokenizer(object):
|
||||
# method to be walked through.
|
||||
return True
|
||||
|
||||
if self.contentModelFlag == contentModelFlags["PCDATA"]:
|
||||
data = self.stream.char()
|
||||
if data in asciiLetters:
|
||||
self.currentToken =\
|
||||
{"type": "EndTag", "name": data, "data": []}
|
||||
self.state = self.states["tagName"]
|
||||
elif data == u">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX data can be _'_...
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
data = self.stream.char()
|
||||
if data in asciiLetters:
|
||||
self.currentToken = {"type":"EndTag", "name":data, "data":[]}
|
||||
self.state = self.states["tagName"]
|
||||
elif data == u">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX data can be _'_...
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
|
||||
def tagNameState(self):
|
||||
@ -413,11 +442,6 @@ class HTMLTokenizer(object):
|
||||
self.stream.charsUntil(asciiLetters, True)
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character when getting the tag name.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in the tag name.")})
|
||||
@ -440,11 +464,6 @@ class HTMLTokenizer(object):
|
||||
self.emitCurrentToken()
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected attribute name instead.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute name instead.")})
|
||||
@ -473,12 +492,6 @@ class HTMLTokenizer(object):
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character in attribute name.")})
|
||||
self.emitCurrentToken()
|
||||
leavingThisState = False
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute name.")})
|
||||
@ -515,11 +528,6 @@ class HTMLTokenizer(object):
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected = or end of tag.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected = or end of tag.")})
|
||||
@ -537,16 +545,11 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["attributeValueDoubleQuoted"]
|
||||
elif data == u"&":
|
||||
self.state = self.states["attributeValueUnQuoted"]
|
||||
self.stream.queue.append(data);
|
||||
self.stream.unget(data);
|
||||
elif data == u"'":
|
||||
self.state = self.states["attributeValueSingleQuoted"]
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute value.")})
|
||||
@ -594,11 +597,6 @@ class HTMLTokenizer(object):
|
||||
self.processEntityInAttribute()
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character in attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value.")})
|
||||
@ -625,27 +623,66 @@ class HTMLTokenizer(object):
|
||||
charStack = [self.stream.char(), self.stream.char()]
|
||||
if charStack == [u"-", u"-"]:
|
||||
self.currentToken = {"type": "Comment", "data": ""}
|
||||
self.state = self.states["comment"]
|
||||
self.state = self.states["commentStart"]
|
||||
else:
|
||||
for x in xrange(5):
|
||||
charStack.append(self.stream.char())
|
||||
# Put in explicit EOF check
|
||||
if (not EOF in charStack and
|
||||
"".join(charStack).upper() == u"DOCTYPE"):
|
||||
self.currentToken =\
|
||||
{"type": "Doctype", "name": "", "data": True}
|
||||
self.currentToken = {"type":"Doctype", "name":"",
|
||||
"publicId":None, "systemId":None, "correct":True}
|
||||
self.state = self.states["doctype"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
self.stream.queue.extend(charStack)
|
||||
self.stream.unget(charStack)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
|
||||
def commentStartState(self):
|
||||
data = self.stream.char()
|
||||
if data == "-":
|
||||
self.state = self.states["commentStartDash"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
self.state = self.states["comment"]
|
||||
return True
|
||||
|
||||
def commentStartDashState(self):
|
||||
data = self.stream.char()
|
||||
if data == "-":
|
||||
self.state = self.states["commentEnd"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
self.state = self.states["comment"]
|
||||
return True
|
||||
|
||||
|
||||
def commentState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"-":
|
||||
self.state = self.states["commentDash"]
|
||||
self.state = self.states["commentEndDash"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
@ -655,7 +692,7 @@ class HTMLTokenizer(object):
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
return True
|
||||
|
||||
def commentDashState(self):
|
||||
def commentEndDashState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"-":
|
||||
self.state = self.states["commentEnd"]
|
||||
@ -702,7 +739,7 @@ class HTMLTokenizer(object):
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
self.stream.queue.append(data)
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["beforeDoctypeName"]
|
||||
return True
|
||||
|
||||
@ -710,19 +747,16 @@ class HTMLTokenizer(object):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data in asciiLowercase:
|
||||
self.currentToken["name"] = data.upper()
|
||||
self.state = self.states["doctypeName"]
|
||||
elif data == u">":
|
||||
# Character needs to be consumed per the specification so don't
|
||||
# invoke emitCurrentTokenWithParseError with "data" as argument.
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
@ -732,30 +766,19 @@ class HTMLTokenizer(object):
|
||||
|
||||
def doctypeNameState(self):
|
||||
data = self.stream.char()
|
||||
needsDoctypeCheck = False
|
||||
if data in spaceCharacters:
|
||||
self.state = self.states["afterDoctypeName"]
|
||||
needsDoctypeCheck = True
|
||||
elif data == u">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# We can't just uppercase everything that arrives here. For
|
||||
# instance, non-ASCII characters.
|
||||
if data in asciiLowercase:
|
||||
data = data.upper()
|
||||
self.currentToken["name"] += data
|
||||
needsDoctypeCheck = True
|
||||
|
||||
# After some iterations through this state it should eventually say
|
||||
# "HTML". Otherwise there's an error.
|
||||
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
|
||||
self.currentToken["data"] = False
|
||||
return True
|
||||
|
||||
def afterDoctypeNameState(self):
|
||||
@ -766,28 +789,194 @@ class HTMLTokenizer(object):
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.currentToken["data"] = True
|
||||
# XXX EMIT
|
||||
self.stream.queue.append(data)
|
||||
self.currentToken["correct"] = False
|
||||
self.stream.unget(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
charStack = [data]
|
||||
for x in xrange(5):
|
||||
charStack.append(self.stream.char())
|
||||
if EOF not in charStack and\
|
||||
"".join(charStack).translate(asciiUpper2Lower) == "public":
|
||||
self.state = self.states["beforeDoctypePublicIdentifier"]
|
||||
elif EOF not in charStack and\
|
||||
"".join(charStack).translate(asciiUpper2Lower) == "system":
|
||||
self.state = self.states["beforeDoctypeSystemIdentifier"]
|
||||
else:
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def beforeDoctypePublicIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["publicId"] = ""
|
||||
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["publicId"] = ""
|
||||
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
self.currentToken["data"] = True
|
||||
_("Unexpected end of DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def doctypePublicIdentifierDoubleQuotedState(self):
|
||||
data = self.stream.char()
|
||||
if data == "\"":
|
||||
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["publicId"] += data
|
||||
return True
|
||||
|
||||
def doctypePublicIdentifierSingleQuotedState(self):
|
||||
data = self.stream.char()
|
||||
if data == "'":
|
||||
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["publicId"] += data
|
||||
return True
|
||||
|
||||
def afterDoctypePublicIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def beforeDoctypeSystemIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def doctypeSystemIdentifierDoubleQuotedState(self):
|
||||
data = self.stream.char()
|
||||
if data == "\"":
|
||||
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["systemId"] += data
|
||||
return True
|
||||
|
||||
def doctypeSystemIdentifierSingleQuotedState(self):
|
||||
data = self.stream.char()
|
||||
if data == "'":
|
||||
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["systemId"] += data
|
||||
return True
|
||||
|
||||
def afterDoctypeSystemIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == ">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def bogusDoctypeState(self):
|
||||
data = self.stream.char()
|
||||
self.currentToken["correct"] = False
|
||||
if data == u">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.stream.queue.append(data)
|
||||
self.stream.unget(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
64
planet/vendor/html5lib/treebuilders/__init__.py
vendored
Executable file
64
planet/vendor/html5lib/treebuilders/__init__.py
vendored
Executable file
@ -0,0 +1,64 @@
|
||||
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to do
|
||||
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
_base.treebuilders.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing Node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
|
||||
treeBuilderCache = {}
|
||||
|
||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
||||
|
||||
treeType - the name of the tree type required (case-insensitive). Supported
|
||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
lxml.etree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeBuilderCache:
|
||||
if treeType in ("dom", "simpletree"):
|
||||
mod = __import__(treeType, globals())
|
||||
treeBuilderCache[treeType] = mod.TreeBuilder
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
treeBuilderCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
return treeBuilderCache.get(treeType)
|
@ -1,4 +1,4 @@
|
||||
from constants import scopingElements, tableInsertModeElements
|
||||
from html5lib.constants import scopingElements, tableInsertModeElements
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
20
planet/html5lib/treebuilders/dom.py → planet/vendor/html5lib/treebuilders/dom.py
vendored
Executable file → Normal file
20
planet/html5lib/treebuilders/dom.py → planet/vendor/html5lib/treebuilders/dom.py
vendored
Executable file → Normal file
@ -2,7 +2,7 @@ import _base
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
import new
|
||||
from xml.sax.saxutils import escape
|
||||
from constants import voidElements
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -80,9 +80,11 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
setattr(self.dom, 'hilite', method)
|
||||
return self
|
||||
|
||||
def doctypeClass(self,name):
|
||||
def insertDoctype(self, name):
|
||||
domimpl = minidom.getDOMImplementation()
|
||||
return NodeBuilder(domimpl.createDocumentType(name,None,None))
|
||||
doctype = domimpl.createDocumentType(name,None,None)
|
||||
self.document.appendChild(NodeBuilder(doctype))
|
||||
doctype.ownerDocument = self.dom
|
||||
|
||||
def elementClass(self, name):
|
||||
return NodeBuilder(self.dom.createElement(name))
|
||||
@ -126,8 +128,8 @@ def testSerializer(element):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
rv.append("#document-fragment")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||
@ -215,10 +217,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
elif node.nodeType == Node.DOCUMENT_NODE:
|
||||
handler.startDocument()
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endDocument()
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
handler.endDocument()
|
||||
|
||||
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
for child in node.childNodes: dom2sax(child, handler, nsmap)
|
||||
|
||||
else:
|
||||
# ATTRIBUTE_NODE
|
249
planet/vendor/html5lib/treebuilders/etree.py
vendored
Executable file
249
planet/vendor/html5lib/treebuilders/etree.py
vendored
Executable file
@ -0,0 +1,249 @@
|
||||
import _base
|
||||
import new
|
||||
import copy
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getETreeModule(ElementTreeImplementation, fullTree=False):
|
||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||
if name in moduleCache:
|
||||
return moduleCache[name]
|
||||
else:
|
||||
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
||||
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
|
||||
mod.__dict__.update(objs)
|
||||
moduleCache[name] = mod
|
||||
return mod
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
ElementTree = ElementTreeImplementation
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name):
|
||||
self._element = ElementTree.Element(name)
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _setName(self, name):
|
||||
self._element.tag = name
|
||||
|
||||
def _getName(self):
|
||||
return self._element.tag
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
#Delete existing attributes first
|
||||
#XXX - there may be a better way to do this...
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
self._element.set(key, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or self._element.getchildren())
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self._element.getchildren().index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._element.remove(node._element)
|
||||
node.parent=None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
#Insert the text as the tail of the last child element
|
||||
if not self._element[-1].tail:
|
||||
self._element[-1].tail = ""
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
#Insert the text before the specified node
|
||||
children = self._element.getchildren()
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
if not self._element[index-1].tail:
|
||||
self._element[index-1].tail = ""
|
||||
self._element[index-1].tail += data
|
||||
else:
|
||||
if not self._element.text:
|
||||
self._element.text = ""
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = Element(self.name)
|
||||
for name, value in self.attributes.iteritems():
|
||||
element.attributes[name] = value
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
if not newParent._element.text:
|
||||
newParent._element.text = ""
|
||||
if self._element.text is not None:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
#Use the superclass constructor to set all properties on the
|
||||
#wrapper element
|
||||
self._element = ElementTree.Comment(data)
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name):
|
||||
Element.__init__(self, "<!DOCTYPE>")
|
||||
self._element.text = name
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "<DOCUMENT_ROOT>")
|
||||
|
||||
class DocumentFragment(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element, indent=0):
|
||||
if not(hasattr(element, "tag")):
|
||||
element = element.getroot()
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
elif type(element.tag) == type(ElementTree.Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element):
|
||||
if type(element) == type(ElementTree.ElementTree):
|
||||
element = element.getroot()
|
||||
|
||||
if element.tag == "<!DOCTYPE>":
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag == "<DOCUMENT_ROOT>":
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
elif type(element.tag) == type(ElementTree.Comment):
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
fragmentClass = DocumentFragment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
if fullTree:
|
||||
return self.document._element
|
||||
else:
|
||||
return self.document._element.find("html")
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
|
||||
return locals()
|
@ -1,5 +1,5 @@
|
||||
import _base
|
||||
from constants import voidElements
|
||||
from html5lib.constants import voidElements
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
162
planet/vendor/html5lib/treebuilders/soup.py
vendored
Normal file
162
planet/vendor/html5lib/treebuilders/soup.py
vendored
Normal file
@ -0,0 +1,162 @@
|
||||
|
||||
import sys
|
||||
import copy
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||
|
||||
import _base
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
def __iter__(self):
|
||||
return self.attrs.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
"set attr", name, value
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return self.attrs.items()
|
||||
def keys(self):
|
||||
return self.attrs.keys()
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
def __contains__(self, name):
|
||||
return name in self.attrs.keys()
|
||||
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, element, soup):
|
||||
_base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup=soup
|
||||
|
||||
def appendChild(self, node):
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
newNode = TextNode(NavigableString(
|
||||
self.element.contents[-1]+node.element), self.soup)
|
||||
self.element.contents[-1].extract()
|
||||
self.appendChild(newNode)
|
||||
else:
|
||||
self.element.insert(len(self.element.contents), node.element)
|
||||
node.parent = self
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in attributes.items():
|
||||
self.element[name] = value
|
||||
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(NavigableString(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.contents.index(refNode.element)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
newNode = TextNode(NavigableString(
|
||||
self.element.contents[index-1]+node.element), self.soup)
|
||||
self.element.contents[index-1].extract()
|
||||
self.insertBefore(newNode, refNode)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
node.element.extract()
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.contents:
|
||||
child = self.element.contents[0]
|
||||
child.extract()
|
||||
if isinstance(child, Tag):
|
||||
newParent.appendChild(Element(child, self.soup))
|
||||
else:
|
||||
newParent.appendChild(TextNode(child, self.soup))
|
||||
|
||||
def cloneNode(self):
|
||||
node = Element(Tag(self.soup, self.element.name), self.soup)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
_base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup=soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
return Element(self.soup, self.soup)
|
||||
|
||||
def insertDoctype(self, name):
|
||||
self.soup.insert(0, Declaration(name))
|
||||
|
||||
def elementClass(self, name):
|
||||
return Element(Tag(self.soup, name), self.soup)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup)
|
||||
|
||||
def appendChild(self, node):
|
||||
self.soup.insert(len(self.soup.contents), node.element)
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.soup
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, Declaration):
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
|
||||
elif isinstance(element, BeautifulSoup):
|
||||
if element.name == "[document_fragment]":
|
||||
rv.append("#document-fragment")
|
||||
else:
|
||||
rv.append("#document")
|
||||
|
||||
elif isinstance(element, Comment):
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
|
||||
elif isinstance(element, unicode):
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.name))
|
||||
if element.attrs:
|
||||
for name, value in element.attrs:
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
if hasattr(element, "contents"):
|
||||
for child in element.contents:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
47
planet/vendor/html5lib/treewalkers/__init__.py
vendored
Normal file
47
planet/vendor/html5lib/treewalkers/__init__.py
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
"""A collection of modules for iterating through different kinds of
|
||||
tree, generating tokens identical to those produced by the tokenizer
|
||||
module.
|
||||
|
||||
To create a tree walker for a new type of tree, you need to do
|
||||
implement a tree walker object (called TreeWalker by convention) that
|
||||
implements a 'serialize' method taking a tree as sole argument and
|
||||
returning an iterator generating tokens.
|
||||
"""
|
||||
|
||||
treeWalkerCache = {}
|
||||
|
||||
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeWalker class for various types of tree with built-in support
|
||||
|
||||
treeType - the name of the tree type required (case-insensitive). Supported
|
||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"pulldom" - The xml.dom.pulldom event stream
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
"genshi" - a Genshi stream
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
lxml.etree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeWalkerCache:
|
||||
if treeType in ("dom", "pulldom", "simpletree"):
|
||||
mod = __import__(treeType, globals())
|
||||
treeWalkerCache[treeType] = mod.TreeWalker
|
||||
elif treeType == "genshi":
|
||||
import genshistream
|
||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeWalkerCache[treeType] = soup.TreeWalker
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
return treeWalkerCache.get(treeType)
|
151
planet/vendor/html5lib/treewalkers/_base.py
vendored
Normal file
151
planet/vendor/html5lib/treewalkers/_base.py
vendored
Normal file
@ -0,0 +1,151 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
class TreeWalker(object):
|
||||
def __init__(self, tree):
|
||||
self.tree = tree
|
||||
|
||||
def __iter__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def error(self, msg):
|
||||
return {"type": "SerializeError", "data": msg}
|
||||
|
||||
def normalizeAttrs(self, attrs):
|
||||
if not attrs:
|
||||
attrs = []
|
||||
elif hasattr(attrs, 'items'):
|
||||
attrs = attrs.items()
|
||||
return [(unicode(name),unicode(value)) for name,value in attrs]
|
||||
|
||||
def emptyTag(self, name, attrs, hasChildren=False):
|
||||
yield {"type": "EmptyTag", "name": unicode(name), \
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
if hasChildren:
|
||||
yield self.error(_("Void element has children"))
|
||||
|
||||
def startTag(self, name, attrs):
|
||||
return {"type": "StartTag", "name": unicode(name), \
|
||||
"data": self.normalizeAttrs(attrs)}
|
||||
|
||||
def endTag(self, name):
|
||||
return {"type": "EndTag", "name": unicode(name), "data": []}
|
||||
|
||||
def text(self, data):
|
||||
data = unicode(data)
|
||||
middle = data.lstrip(spaceCharacters)
|
||||
left = data[:len(data)-len(middle)]
|
||||
if left:
|
||||
yield {"type": "SpaceCharacters", "data": left}
|
||||
data = middle
|
||||
middle = data.rstrip(spaceCharacters)
|
||||
right = data[len(middle):]
|
||||
if middle:
|
||||
yield {"type": "Characters", "data": middle}
|
||||
if right:
|
||||
yield {"type": "SpaceCharacters", "data": right}
|
||||
|
||||
def comment(self, data):
|
||||
return {"type": "Comment", "data": unicode(data)}
|
||||
|
||||
def doctype(self, name):
|
||||
return {"type": "Doctype", "name": unicode(name), "data": name.upper() == "HTML"}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error(_("Unknown node type: ") + nodeType)
|
||||
|
||||
class RecursiveTreeWalker(TreeWalker):
|
||||
def walkChildren(self, node):
|
||||
raise NodeImplementedError
|
||||
|
||||
def element(self, node, name, attrs, hasChildren):
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(name, attrs, hasChildren):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(name, attrs)
|
||||
if hasChildren:
|
||||
for token in self.walkChildren(node):
|
||||
yield token
|
||||
yield self.endTag(name)
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
DOCUMENT = Node.DOCUMENT_NODE
|
||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||
TEXT = Node.TEXT_NODE
|
||||
ELEMENT = Node.ELEMENT_NODE
|
||||
COMMENT = Node.COMMENT_NODE
|
||||
UNKNOWN = "<#UNKNOWN#>"
|
||||
|
||||
class NonRecursiveTreeWalker(TreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getFirstChild(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getNextSibling(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def getParentNode(self, node):
|
||||
raise NotImplementedError
|
||||
|
||||
def __iter__(self):
|
||||
currentNode = self.tree
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
hasChildren = False
|
||||
|
||||
if type == DOCTYPE:
|
||||
yield self.doctype(*details)
|
||||
|
||||
elif type == TEXT:
|
||||
for token in self.text(*details):
|
||||
yield token
|
||||
|
||||
elif type == ELEMENT:
|
||||
name, attributes, hasChildren = details
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(name, attributes, hasChildren):
|
||||
yield token
|
||||
hasChildren = False
|
||||
else:
|
||||
yield self.startTag(name, attributes)
|
||||
|
||||
elif type == COMMENT:
|
||||
yield self.comment(details[0])
|
||||
|
||||
elif type == DOCUMENT:
|
||||
hasChildren = True
|
||||
|
||||
else:
|
||||
yield self.unknown(details[0])
|
||||
|
||||
if hasChildren:
|
||||
firstChild = self.getFirstChild(currentNode)
|
||||
else:
|
||||
firstChild = None
|
||||
|
||||
if firstChild is not None:
|
||||
currentNode = firstChild
|
||||
else:
|
||||
while currentNode is not None:
|
||||
details = self.getNodeDetails(currentNode)
|
||||
type, details = details[0], details[1:]
|
||||
if type == ELEMENT:
|
||||
name, attributes, hasChildren = details
|
||||
if name not in voidElements:
|
||||
yield self.endTag(name)
|
||||
nextSibling = self.getNextSibling(currentNode)
|
||||
if nextSibling is not None:
|
||||
currentNode = nextSibling
|
||||
break
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
||||
else:
|
||||
currentNode = self.getParentNode(currentNode)
|
37
planet/vendor/html5lib/treewalkers/dom.py
vendored
Normal file
37
planet/vendor/html5lib/treewalkers/dom.py
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
from xml.dom import Node
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
return _base.DOCTYPE, node.nodeName
|
||||
|
||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||
return _base.TEXT, node.nodeValue
|
||||
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
|
||||
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
return _base.COMMENT, node.nodeValue
|
||||
|
||||
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
else:
|
||||
return _base.UNKNOWN, node.nodeType
|
||||
|
||||
def getFirstChild(self, node):
|
||||
return node.firstChild
|
||||
|
||||
def getNextSibling(self, node):
|
||||
return node.nextSibling
|
||||
|
||||
def getParentNode(self, node):
|
||||
return node.parentNode
|
112
planet/vendor/html5lib/treewalkers/etree.py
vendored
Normal file
112
planet/vendor/html5lib/treewalkers/etree.py
vendored
Normal file
@ -0,0 +1,112 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import new
|
||||
import copy
|
||||
|
||||
import _base
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
def getETreeModule(ElementTreeImplementation):
|
||||
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||
if name in moduleCache:
|
||||
return moduleCache[name]
|
||||
else:
|
||||
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
||||
objs = getETreeBuilder(ElementTreeImplementation)
|
||||
mod.__dict__.update(objs)
|
||||
moduleCache[name] = mod
|
||||
return mod
|
||||
|
||||
def getETreeBuilder(ElementTreeImplementation):
|
||||
ElementTree = ElementTreeImplementation
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
"""Given the particular ElementTree representation, this implementation,
|
||||
to avoid using recursion, returns "nodes" as tuples with the following
|
||||
content:
|
||||
|
||||
1. An Element node serving as *context* (it cannot be called the parent
|
||||
node due to the particular ``tail`` text nodes.
|
||||
|
||||
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
|
||||
|
||||
3. A list used as a stack of all ancestor *context nodes*. It is a
|
||||
pair tuple whose first item is an Element and second item is a child
|
||||
index.
|
||||
"""
|
||||
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents = node
|
||||
if key in ("text", "tail"):
|
||||
return _base.TEXT, getattr(elt, key)
|
||||
else:
|
||||
node = elt[int(key)]
|
||||
|
||||
if not(hasattr(node, "tag")):
|
||||
node = node.getroot()
|
||||
|
||||
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif node.tag == "<!DOCTYPE>":
|
||||
return _base.DOCTYPE, node.text
|
||||
|
||||
elif type(node.tag) == type(ElementTree.Comment):
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
|
||||
|
||||
def getFirstChild(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents = node
|
||||
assert key not in ("text", "tail"), "Text nodes have no children"
|
||||
parents.append((elt, int(key)))
|
||||
node = elt[int(key)]
|
||||
else:
|
||||
parents = []
|
||||
|
||||
assert len(node) or node.text, "Node has no children"
|
||||
if node.text:
|
||||
return (node, "text", parents)
|
||||
else:
|
||||
return (node, 0, parents)
|
||||
|
||||
def getNextSibling(self, node):
|
||||
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
||||
|
||||
elt, key, parents = node
|
||||
if key == "text":
|
||||
key = -1
|
||||
elif key == "tail":
|
||||
elt, key = parents.pop()
|
||||
else:
|
||||
# Look for "tail" of the "revisited" node
|
||||
child = elt[key]
|
||||
if child.tail:
|
||||
parents.append((elt, key))
|
||||
return (child, "tail", parents)
|
||||
|
||||
# case where key were "text" or "tail" or elt[key] had a tail
|
||||
key += 1
|
||||
if len(elt) > key:
|
||||
return (elt, key, parents)
|
||||
else:
|
||||
return None
|
||||
|
||||
def getParentNode(self, node):
    """Pop the ancestor stack and return the parent tuple; None at the root."""
    assert isinstance(node, tuple)
    elt, key, parents = node
    if not parents:
        # HACK: We could return ``elt`` but None will stop the algorithm the same way
        return None
    elt, key = parents.pop()
    return elt, key, parents
|
||||
|
||||
return locals()
|
67
planet/vendor/html5lib/treewalkers/genshistream.py
vendored
Normal file
67
planet/vendor/html5lib/treewalkers/genshistream.py
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
|
||||
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
from genshi.output import NamespaceFlattener
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.TreeWalker):
    """html5lib tree walker over a Genshi markup stream (self.tree)."""

    def __iter__(self):
        # Current element depth relative to the stream root; used to detect
        # when an ignored void element's subtree has been fully consumed.
        depth = 0
        # While not None, events are suppressed until depth returns to this
        # value (a void element's contents are not emitted).
        ignore_until = None
        # Events are processed one step behind the loop so each handler can
        # peek at the *following* event (needed for empty-tag detection).
        previous = None
        for event in NamespaceFlattener(prefixes={
           'http://www.w3.org/1999/xhtml': ''
          })(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                # (Python 2: None compares less than any int, so when
                # ignore_until is None this is a harmless re-clear.)
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            # Suppress everything until we leave this element
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        # Flush the final buffered event (no lookahead available).
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one Genshi *event* into html5lib walker token(s).

        *next* is the following event (or None at end of stream); it is
        used to decide whether a void start tag was explicitly closed.
        """
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                # Third argument flags a void element that is not
                # immediately followed by its own END event.
                for token in self.emptyTag(tag, list(attrib), \
                  not next or next[0] != END or next[1] != tag):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            # data is a tuple; only the doctype name is emitted.
            yield self.doctype(data[0])

        # NOTE(review): DOCTYPE here is unreachable -- it is already handled
        # by the branch above; the remaining kinds are deliberately dropped.
        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
          START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
|
52
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
Normal file
52
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
||||
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.TreeWalker):
    """html5lib tree walker over an xml.dom.pulldom event stream (self.tree)."""

    def __iter__(self):
        # While set to a DOM node, all events are suppressed until that
        # node's END_ELEMENT arrives (a void element's contents are dropped).
        ignore_until = None
        # Events are processed one step behind the loop so each handler can
        # peek at the *following* event (needed for empty-tag detection).
        previous = None
        for event in self.tree:
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    # Reached the END_ELEMENT of the ignored void element
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        # Flush the final buffered event (no lookahead available).
        # NOTE(review): if the stream is empty, previous is None here and
        # previous[1] would raise -- presumably pulldom streams always
        # contain at least one event; verify against callers.
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one pulldom *event* into html5lib walker token(s).

        *next* is the following event (or None at end of stream); it is
        used to decide whether a void start tag was explicitly closed.
        """
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            if name in voidElements:
                # Third argument flags a void element whose END_ELEMENT is
                # not the immediately following event.
                for token in self.emptyTag(name, \
                  node.attributes.items(), not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            if name not in voidElements:
                yield self.endTag(name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)
|
72
planet/vendor/html5lib/treewalkers/simpletree.py
vendored
Normal file
72
planet/vendor/html5lib/treewalkers/simpletree.py
vendored
Normal file
@ -0,0 +1,72 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Return the walker's details tuple for *node* (which may be a
        (parent, index, parents) tuple wrapping the actual simpletree node)."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name

        elif node.type == 4: # TextNode
            return _base.TEXT, node.value

        elif node.type == 5: # Element
            return _base.ELEMENT, node.name, \
                node.attributes.items(), node.hasContent()

        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data

        else:
            # BUG FIX: was ``_node.UNKNOWN`` -- ``_node`` is not defined
            # anywhere, so any unrecognised node type raised NameError
            # instead of being reported as UNKNOWN.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        """Return the first child of *node* as a (parent, index, parents) tuple."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            # Remember where we descended from so the walker can climb back up
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        """Return the sibling following *node*, or None if it is the last child."""
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        """Pop the ancestor stack and return the parent tuple; None at the root."""
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None
|
36
planet/vendor/html5lib/treewalkers/soup.py
vendored
Normal file
36
planet/vendor/html5lib/treewalkers/soup.py
vendored
Normal file
@ -0,0 +1,36 @@
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
||||
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
    """html5lib tree walker over a BeautifulSoup parse tree."""

    def getNodeDetails(self, node):
        """Classify a BeautifulSoup node and return the walker details tuple.

        NOTE(review): the order of these isinstance checks looks significant
        given BeautifulSoup's class hierarchy (e.g. the document object also
        behaves like a tag) -- do not reorder without confirming.
        """
        if isinstance(node, BeautifulSoup):
            # Document or DocumentFragment
            return (_base.DOCUMENT,)

        if isinstance(node, Declaration):
            # DocumentType
            #Slice needed to remove markup added during unicode conversion
            return _base.DOCTYPE, unicode(node.string)[2:-1]

        if isinstance(node, Comment):
            # Strip the "<!--" / "-->" added during unicode conversion
            return _base.COMMENT, unicode(node.string)[4:-3]

        if isinstance(node, unicode):
            # TextNode
            return _base.TEXT, node

        if isinstance(node, Tag):
            # Element
            attrs = dict(node.attrs).items()
            return _base.ELEMENT, node.name, attrs, node.contents

        return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        """Return the node's first child."""
        return node.contents[0]

    def getNextSibling(self, node):
        """Return the node's next sibling, or None."""
        return node.nextSibling

    def getParentNode(self, node):
        """Return the node's parent, or None at the root."""
        return node.parent
|
Loading…
x
Reference in New Issue
Block a user