Update to latest html5lib; move packaged dependencies to vendor directory
This commit is contained in:
parent
65e41f7b22
commit
fc90da7fc0
@ -1,5 +1,5 @@
|
|||||||
import sys
|
import sys
|
||||||
from planet import html5lib
|
import html5lib
|
||||||
tree=html5lib.treebuilders.dom.TreeBuilder
|
tree=html5lib.treebuilders.dom.TreeBuilder
|
||||||
parser = html5lib.html5parser.HTMLParser(tree=tree)
|
parser = html5lib.html5parser.HTMLParser(tree=tree)
|
||||||
document = parser.parse(sys.stdin)
|
document = parser.parse(sys.stdin)
|
||||||
|
@ -23,8 +23,9 @@ from xml.sax.saxutils import escape
|
|||||||
from htmlentitydefs import entitydefs
|
from htmlentitydefs import entitydefs
|
||||||
|
|
||||||
import planet
|
import planet
|
||||||
from planet import config, feedparser
|
from planet import config
|
||||||
from planet.spider import filename
|
from planet.spider import filename
|
||||||
|
import feedparser
|
||||||
log = planet.logger
|
log = planet.logger
|
||||||
options = config.filter_options(sys.argv[0])
|
options = config.filter_options(sys.argv[0])
|
||||||
|
|
||||||
|
@ -32,7 +32,9 @@ def getLogger(level, format):
|
|||||||
loggerParms = (level,format)
|
loggerParms = (level,format)
|
||||||
return logger
|
return logger
|
||||||
|
|
||||||
|
sys.path.append(os.path.join(os.path.dirname(__file__),'vendor'))
|
||||||
|
|
||||||
# Configure feed parser
|
# Configure feed parser
|
||||||
from planet import feedparser
|
import feedparser
|
||||||
feedparser.SANITIZE_HTML=0
|
feedparser.SANITIZE_HTML=0
|
||||||
feedparser.RESOLVE_RELATIVE_URIS=0
|
feedparser.RESOLVE_RELATIVE_URIS=0
|
||||||
|
@ -1,42 +0,0 @@
|
|||||||
"""A collection of modules for building different kinds of tree from
|
|
||||||
HTML documents.
|
|
||||||
|
|
||||||
To create a treebuilder for a new type of tree, you need to do
|
|
||||||
implement several things:
|
|
||||||
|
|
||||||
1) A set of classes for various types of elements: Document, Doctype,
|
|
||||||
Comment, Element. These must implement the interface of
|
|
||||||
_base.treebuilders.Node (although comment nodes have a different
|
|
||||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
|
||||||
Textual content may also be implemented as another node type, or not, as
|
|
||||||
your tree implementation requires.
|
|
||||||
|
|
||||||
2) A treebuilder object (called TreeBuilder by convention) that
|
|
||||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
|
||||||
documentClass - the class to use for the bottommost node of a document
|
|
||||||
elementClass - the class to use for HTML Elements
|
|
||||||
commentClass - the class to use for comments
|
|
||||||
doctypeClass - the class to use for doctypes
|
|
||||||
It also has one required method:
|
|
||||||
getDocument - Returns the root node of the complete document tree
|
|
||||||
|
|
||||||
3) If you wish to run the unit tests, you must also create a
|
|
||||||
testSerializer method on your treebuilder which accepts a node and
|
|
||||||
returns a string containing Node and its children serialized according
|
|
||||||
to the format used in the unittests
|
|
||||||
|
|
||||||
The supplied simpletree module provides a python-only implementation
|
|
||||||
of a full treebuilder and is a useful reference for the semantics of
|
|
||||||
the various methods.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os.path
|
|
||||||
__path__.append(os.path.dirname(__path__[0]))
|
|
||||||
|
|
||||||
import dom
|
|
||||||
import simpletree
|
|
||||||
|
|
||||||
try:
|
|
||||||
import etree
|
|
||||||
except:
|
|
||||||
pass
|
|
@ -1,5 +0,0 @@
|
|||||||
import etreefull
|
|
||||||
|
|
||||||
class TreeBuilder(etreefull.TreeBuilder):
|
|
||||||
def getDocument(self):
|
|
||||||
return self.document._element.find("html")
|
|
@ -1,227 +0,0 @@
|
|||||||
try:
|
|
||||||
from xml.etree import ElementTree
|
|
||||||
except ImportError:
|
|
||||||
try:
|
|
||||||
from elementtree import ElementTree
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
import _base
|
|
||||||
|
|
||||||
class Element(_base.Node):
|
|
||||||
def __init__(self, name):
|
|
||||||
self._element = ElementTree.Element(name)
|
|
||||||
self.name = name
|
|
||||||
self.parent = None
|
|
||||||
self._childNodes = []
|
|
||||||
self._flags = []
|
|
||||||
|
|
||||||
def _setName(self, name):
|
|
||||||
self._element.tag = name
|
|
||||||
|
|
||||||
def _getName(self):
|
|
||||||
return self._element.tag
|
|
||||||
|
|
||||||
name = property(_getName, _setName)
|
|
||||||
|
|
||||||
def _getAttributes(self):
|
|
||||||
return self._element.attrib
|
|
||||||
|
|
||||||
def _setAttributes(self, attributes):
|
|
||||||
#Delete existing attributes first
|
|
||||||
#XXX - there may be a better way to do this...
|
|
||||||
for key in self._element.attrib.keys():
|
|
||||||
del self._element.attrib[key]
|
|
||||||
for key, value in attributes.iteritems():
|
|
||||||
self._element.set(key, value)
|
|
||||||
|
|
||||||
attributes = property(_getAttributes, _setAttributes)
|
|
||||||
|
|
||||||
def _getChildNodes(self):
|
|
||||||
return self._childNodes
|
|
||||||
|
|
||||||
def _setChildNodes(self, value):
|
|
||||||
del self._element[:]
|
|
||||||
self._childNodes = []
|
|
||||||
for element in value:
|
|
||||||
self.insertChild(element)
|
|
||||||
|
|
||||||
childNodes = property(_getChildNodes, _setChildNodes)
|
|
||||||
|
|
||||||
def hasContent(self):
|
|
||||||
"""Return true if the node has children or text"""
|
|
||||||
return bool(self._element.text or self._element.getchildren())
|
|
||||||
|
|
||||||
def appendChild(self, node):
|
|
||||||
self._childNodes.append(node)
|
|
||||||
self._element.append(node._element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
|
||||||
index = self._element.getchildren().index(refNode._element)
|
|
||||||
self._element.insert(index, node._element)
|
|
||||||
node.parent = self
|
|
||||||
|
|
||||||
def removeChild(self, node):
|
|
||||||
self._element.remove(node._element)
|
|
||||||
node.parent=None
|
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
|
||||||
if not(len(self._element)):
|
|
||||||
if not self._element.text:
|
|
||||||
self._element.text = ""
|
|
||||||
self._element.text += data
|
|
||||||
elif insertBefore is None:
|
|
||||||
#Insert the text as the tail of the last child element
|
|
||||||
if not self._element[-1].tail:
|
|
||||||
self._element[-1].tail = ""
|
|
||||||
self._element[-1].tail += data
|
|
||||||
else:
|
|
||||||
#Insert the text before the specified node
|
|
||||||
children = self._element.getchildren()
|
|
||||||
index = children.index(insertBefore._element)
|
|
||||||
if index > 0:
|
|
||||||
if not self._element[index-1].tail:
|
|
||||||
self._element[index-1].tail = ""
|
|
||||||
self._element[index-1].tail += data
|
|
||||||
else:
|
|
||||||
if not self._element.text:
|
|
||||||
self._element.text = ""
|
|
||||||
self._element.text += data
|
|
||||||
|
|
||||||
def cloneNode(self):
|
|
||||||
element = Element(self.name)
|
|
||||||
element.attributes = self.attributes
|
|
||||||
return element
|
|
||||||
|
|
||||||
def reparentChildren(self, newParent):
|
|
||||||
if newParent.childNodes:
|
|
||||||
newParent.childNodes[-1]._element.tail += self._element.text
|
|
||||||
else:
|
|
||||||
if not newParent._element.text:
|
|
||||||
newParent._element.text = ""
|
|
||||||
if self._element.text is not None:
|
|
||||||
newParent._element.text += self._element.text
|
|
||||||
self._element.text = ""
|
|
||||||
_base.Node.reparentChildren(self, newParent)
|
|
||||||
|
|
||||||
class Comment(Element):
|
|
||||||
def __init__(self, data):
|
|
||||||
#Use the superclass constructor to set all properties on the
|
|
||||||
#wrapper element
|
|
||||||
Element.__init__(self, None)
|
|
||||||
self._element = ElementTree.Comment(data)
|
|
||||||
|
|
||||||
def _getData(self):
|
|
||||||
return self._element.text
|
|
||||||
|
|
||||||
def _setData(self, value):
|
|
||||||
self._element.text = value
|
|
||||||
|
|
||||||
data = property(_getData, _setData)
|
|
||||||
|
|
||||||
class DocumentType(Element):
|
|
||||||
def __init__(self, name):
|
|
||||||
Element.__init__(self, DocumentType)
|
|
||||||
self._element.text = name
|
|
||||||
|
|
||||||
class Document(Element):
|
|
||||||
def __init__(self):
|
|
||||||
Element.__init__(self, Document)
|
|
||||||
|
|
||||||
class DocumentFragment(Element):
|
|
||||||
def __init__(self):
|
|
||||||
Element.__init__(self, DocumentFragment)
|
|
||||||
|
|
||||||
def testSerializer(element):
|
|
||||||
rv = []
|
|
||||||
finalText = None
|
|
||||||
def serializeElement(element, indent=0):
|
|
||||||
if element.tag is DocumentType:
|
|
||||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
|
||||||
elif element.tag is Document:
|
|
||||||
rv.append("#document")
|
|
||||||
if element.text:
|
|
||||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
|
||||||
if element.tail:
|
|
||||||
finalText = element.tail
|
|
||||||
elif element.tag is ElementTree.Comment:
|
|
||||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
|
||||||
else:
|
|
||||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
|
||||||
if hasattr(element, "attrib"):
|
|
||||||
for name, value in element.attrib.iteritems():
|
|
||||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
|
||||||
if element.text:
|
|
||||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
|
||||||
indent += 2
|
|
||||||
for child in element.getchildren():
|
|
||||||
serializeElement(child, indent)
|
|
||||||
if element.tail:
|
|
||||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
|
||||||
serializeElement(element, 0)
|
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
|
||||||
|
|
||||||
return "\n".join(rv)
|
|
||||||
|
|
||||||
def tostring(element):
|
|
||||||
"""Serialize an element and its child nodes to a string"""
|
|
||||||
rv = []
|
|
||||||
finalText = None
|
|
||||||
def serializeElement(element):
|
|
||||||
if element.tag is DocumentType:
|
|
||||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
|
||||||
elif element.tag is Document:
|
|
||||||
if element.text:
|
|
||||||
rv.append(element.text)
|
|
||||||
if element.tail:
|
|
||||||
finalText = element.tail
|
|
||||||
|
|
||||||
for child in element.getchildren():
|
|
||||||
serializeElement(child)
|
|
||||||
|
|
||||||
elif element.tag is ElementTree.Comment:
|
|
||||||
rv.append("<!--%s-->"%(element.text,))
|
|
||||||
else:
|
|
||||||
#This is assumed to be an ordinary element
|
|
||||||
if not element.attrib:
|
|
||||||
rv.append("<%s>"%(element.tag,))
|
|
||||||
else:
|
|
||||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
|
||||||
for name, value in element.attrib.iteritems()])
|
|
||||||
rv.append("<%s %s>"%(element.tag, attr))
|
|
||||||
if element.text:
|
|
||||||
rv.append(element.text)
|
|
||||||
|
|
||||||
for child in element.getchildren():
|
|
||||||
serializeElement(child)
|
|
||||||
|
|
||||||
rv.append("</%s>"%(element.tag,))
|
|
||||||
|
|
||||||
if element.tail:
|
|
||||||
rv.append(element.tail)
|
|
||||||
|
|
||||||
serializeElement(element)
|
|
||||||
|
|
||||||
if finalText is not None:
|
|
||||||
rv.append("%s\""%(' '*2, finalText))
|
|
||||||
|
|
||||||
return "".join(rv)
|
|
||||||
|
|
||||||
class TreeBuilder(_base.TreeBuilder):
|
|
||||||
documentClass = Document
|
|
||||||
doctypeClass = DocumentType
|
|
||||||
elementClass = Element
|
|
||||||
commentClass = Comment
|
|
||||||
fragmentClass = DocumentFragment
|
|
||||||
|
|
||||||
def testSerializer(self, element):
|
|
||||||
return testSerializer(element)
|
|
||||||
|
|
||||||
def getDocument(self):
|
|
||||||
return self.document._element
|
|
||||||
|
|
||||||
def getFragment(self):
|
|
||||||
return _base.TreeBuilder.getFragment(self)._element
|
|
@ -16,7 +16,8 @@ Todo:
|
|||||||
import re, time, md5, sgmllib
|
import re, time, md5, sgmllib
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from xml.dom import minidom, Node
|
from xml.dom import minidom, Node
|
||||||
from planet.html5lib import liberalxmlparser, treebuilders
|
from html5lib import liberalxmlparser
|
||||||
|
from html5lib.treebuilders import dom
|
||||||
import planet, config
|
import planet, config
|
||||||
|
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
@ -154,7 +155,7 @@ def content(xentry, name, detail, bozo):
|
|||||||
data = minidom.parseString(xdiv % detail.value).documentElement
|
data = minidom.parseString(xdiv % detail.value).documentElement
|
||||||
xcontent.setAttribute('type', 'xhtml')
|
xcontent.setAttribute('type', 'xhtml')
|
||||||
else:
|
else:
|
||||||
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
|
||||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||||
for body in html.documentElement.childNodes:
|
for body in html.documentElement.childNodes:
|
||||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
import sgmllib, time, os, sys, new, urlparse, re
|
import sgmllib, time, os, sys, new, urlparse, re
|
||||||
from planet import config, feedparser, htmltmpl
|
from planet import config, feedparser
|
||||||
|
import htmltmpl
|
||||||
|
|
||||||
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
|
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
|
||||||
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
|
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
|
||||||
|
@ -340,7 +340,7 @@ def spiderPlanet(only_if_new = False):
|
|||||||
log.info("Socket timeout set to %d seconds", timeout)
|
log.info("Socket timeout set to %d seconds", timeout)
|
||||||
except:
|
except:
|
||||||
try:
|
try:
|
||||||
from planet import timeoutsocket
|
import timeoutsocket
|
||||||
timeoutsocket.setDefaultSocketTimeout(float(timeout))
|
timeoutsocket.setDefaultSocketTimeout(float(timeout))
|
||||||
log.info("Socket timeout set to %d seconds", timeout)
|
log.info("Socket timeout set to %d seconds", timeout)
|
||||||
except:
|
except:
|
||||||
|
@ -158,6 +158,38 @@ voidElements = frozenset((
|
|||||||
"input"
|
"input"
|
||||||
))
|
))
|
||||||
|
|
||||||
|
cdataElements = frozenset(('title', 'textarea'))
|
||||||
|
|
||||||
|
rcdataElements = frozenset((
|
||||||
|
'style',
|
||||||
|
'script',
|
||||||
|
'xmp',
|
||||||
|
'iframe',
|
||||||
|
'noembed',
|
||||||
|
'noframes',
|
||||||
|
'noscript'
|
||||||
|
))
|
||||||
|
|
||||||
|
booleanAttributes = {
|
||||||
|
"": frozenset(("irrelevant",)),
|
||||||
|
"style": frozenset(("scoped",)),
|
||||||
|
"img": frozenset(("ismap",)),
|
||||||
|
"audio": frozenset(("autoplay","controls")),
|
||||||
|
"video": frozenset(("autoplay","controls")),
|
||||||
|
"script": frozenset(("defer", "async")),
|
||||||
|
"details": frozenset(("open",)),
|
||||||
|
"datagrid": frozenset(("multiple", "disabled")),
|
||||||
|
"command": frozenset(("hidden", "disabled", "checked", "default")),
|
||||||
|
"menu": frozenset(("autosubmit",)),
|
||||||
|
"fieldset": frozenset(("disabled", "readonly")),
|
||||||
|
"option": frozenset(("disabled", "readonly", "selected")),
|
||||||
|
"optgroup": frozenset(("disabled", "readonly")),
|
||||||
|
"button": frozenset(("disabled", "autofocus")),
|
||||||
|
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
|
||||||
|
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
|
||||||
|
"output": frozenset(("disabled", "readonly")),
|
||||||
|
}
|
||||||
|
|
||||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||||
# therefore can't be a frozenset.
|
# therefore can't be a frozenset.
|
||||||
entitiesWindows1252 = (
|
entitiesWindows1252 = (
|
||||||
@ -196,265 +228,372 @@ entitiesWindows1252 = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
entities = {
|
entities = {
|
||||||
|
"AElig;": u"\u00C6",
|
||||||
"AElig": u"\u00C6",
|
"AElig": u"\u00C6",
|
||||||
"Aacute": u"\u00C1",
|
"AMP;": u"\u0026",
|
||||||
"Acirc": u"\u00C2",
|
|
||||||
"Agrave": u"\u00C0",
|
|
||||||
"Alpha": u"\u0391",
|
|
||||||
"Aring": u"\u00C5",
|
|
||||||
"Atilde": u"\u00C3",
|
|
||||||
"Auml": u"\u00C4",
|
|
||||||
"Beta": u"\u0392",
|
|
||||||
"Ccedil": u"\u00C7",
|
|
||||||
"Chi": u"\u03A7",
|
|
||||||
"Dagger": u"\u2021",
|
|
||||||
"Delta": u"\u0394",
|
|
||||||
"ETH": u"\u00D0",
|
|
||||||
"Eacute": u"\u00C9",
|
|
||||||
"Ecirc": u"\u00CA",
|
|
||||||
"Egrave": u"\u00C8",
|
|
||||||
"Epsilon": u"\u0395",
|
|
||||||
"Eta": u"\u0397",
|
|
||||||
"Euml": u"\u00CB",
|
|
||||||
"Gamma": u"\u0393",
|
|
||||||
"Iacute": u"\u00CD",
|
|
||||||
"Icirc": u"\u00CE",
|
|
||||||
"Igrave": u"\u00CC",
|
|
||||||
"Iota": u"\u0399",
|
|
||||||
"Iuml": u"\u00CF",
|
|
||||||
"Kappa": u"\u039A",
|
|
||||||
"Lambda": u"\u039B",
|
|
||||||
"Mu": u"\u039C",
|
|
||||||
"Ntilde": u"\u00D1",
|
|
||||||
"Nu": u"\u039D",
|
|
||||||
"OElig": u"\u0152",
|
|
||||||
"Oacute": u"\u00D3",
|
|
||||||
"Ocirc": u"\u00D4",
|
|
||||||
"Ograve": u"\u00D2",
|
|
||||||
"Omega": u"\u03A9",
|
|
||||||
"Omicron": u"\u039F",
|
|
||||||
"Oslash": u"\u00D8",
|
|
||||||
"Otilde": u"\u00D5",
|
|
||||||
"Ouml": u"\u00D6",
|
|
||||||
"Phi": u"\u03A6",
|
|
||||||
"Pi": u"\u03A0",
|
|
||||||
"Prime": u"\u2033",
|
|
||||||
"Psi": u"\u03A8",
|
|
||||||
"Rho": u"\u03A1",
|
|
||||||
"Scaron": u"\u0160",
|
|
||||||
"Sigma": u"\u03A3",
|
|
||||||
"THORN": u"\u00DE",
|
|
||||||
"Tau": u"\u03A4",
|
|
||||||
"Theta": u"\u0398",
|
|
||||||
"Uacute": u"\u00DA",
|
|
||||||
"Ucirc": u"\u00DB",
|
|
||||||
"Ugrave": u"\u00D9",
|
|
||||||
"Upsilon": u"\u03A5",
|
|
||||||
"Uuml": u"\u00DC",
|
|
||||||
"Xi": u"\u039E",
|
|
||||||
"Yacute": u"\u00DD",
|
|
||||||
"Yuml": u"\u0178",
|
|
||||||
"Zeta": u"\u0396",
|
|
||||||
"aacute": u"\u00E1",
|
|
||||||
"acirc": u"\u00E2",
|
|
||||||
"acute": u"\u00B4",
|
|
||||||
"aelig": u"\u00E6",
|
|
||||||
"agrave": u"\u00E0",
|
|
||||||
"alefsym": u"\u2135",
|
|
||||||
"alpha": u"\u03B1",
|
|
||||||
"amp": u"\u0026",
|
|
||||||
"AMP": u"\u0026",
|
"AMP": u"\u0026",
|
||||||
"and": u"\u2227",
|
"Aacute;": u"\u00C1",
|
||||||
"ang": u"\u2220",
|
"Aacute": u"\u00C1",
|
||||||
"apos": u"\u0027",
|
"Acirc;": u"\u00C2",
|
||||||
"aring": u"\u00E5",
|
"Acirc": u"\u00C2",
|
||||||
"asymp": u"\u2248",
|
"Agrave;": u"\u00C0",
|
||||||
"atilde": u"\u00E3",
|
"Agrave": u"\u00C0",
|
||||||
"auml": u"\u00E4",
|
"Alpha;": u"\u0391",
|
||||||
"bdquo": u"\u201E",
|
"Aring;": u"\u00C5",
|
||||||
"beta": u"\u03B2",
|
"Aring": u"\u00C5",
|
||||||
"brvbar": u"\u00A6",
|
"Atilde;": u"\u00C3",
|
||||||
"bull": u"\u2022",
|
"Atilde": u"\u00C3",
|
||||||
"cap": u"\u2229",
|
"Auml;": u"\u00C4",
|
||||||
"ccedil": u"\u00E7",
|
"Auml": u"\u00C4",
|
||||||
"cedil": u"\u00B8",
|
"Beta;": u"\u0392",
|
||||||
"cent": u"\u00A2",
|
"COPY;": u"\u00A9",
|
||||||
"chi": u"\u03C7",
|
|
||||||
"circ": u"\u02C6",
|
|
||||||
"clubs": u"\u2663",
|
|
||||||
"cong": u"\u2245",
|
|
||||||
"copy": u"\u00A9",
|
|
||||||
"COPY": u"\u00A9",
|
"COPY": u"\u00A9",
|
||||||
"crarr": u"\u21B5",
|
"Ccedil;": u"\u00C7",
|
||||||
"cup": u"\u222A",
|
"Ccedil": u"\u00C7",
|
||||||
"curren": u"\u00A4",
|
"Chi;": u"\u03A7",
|
||||||
"dArr": u"\u21D3",
|
"Dagger;": u"\u2021",
|
||||||
"dagger": u"\u2020",
|
"Delta;": u"\u0394",
|
||||||
"darr": u"\u2193",
|
"ETH;": u"\u00D0",
|
||||||
"deg": u"\u00B0",
|
"ETH": u"\u00D0",
|
||||||
"delta": u"\u03B4",
|
"Eacute;": u"\u00C9",
|
||||||
"diams": u"\u2666",
|
"Eacute": u"\u00C9",
|
||||||
"divide": u"\u00F7",
|
"Ecirc;": u"\u00CA",
|
||||||
"eacute": u"\u00E9",
|
"Ecirc": u"\u00CA",
|
||||||
"ecirc": u"\u00EA",
|
"Egrave;": u"\u00C8",
|
||||||
"egrave": u"\u00E8",
|
"Egrave": u"\u00C8",
|
||||||
"empty": u"\u2205",
|
"Epsilon;": u"\u0395",
|
||||||
"emsp": u"\u2003",
|
"Eta;": u"\u0397",
|
||||||
"ensp": u"\u2002",
|
"Euml;": u"\u00CB",
|
||||||
"epsilon": u"\u03B5",
|
"Euml": u"\u00CB",
|
||||||
"equiv": u"\u2261",
|
"GT;": u"\u003E",
|
||||||
"eta": u"\u03B7",
|
|
||||||
"eth": u"\u00F0",
|
|
||||||
"euml": u"\u00EB",
|
|
||||||
"euro": u"\u20AC",
|
|
||||||
"exist": u"\u2203",
|
|
||||||
"fnof": u"\u0192",
|
|
||||||
"forall": u"\u2200",
|
|
||||||
"frac12": u"\u00BD",
|
|
||||||
"frac14": u"\u00BC",
|
|
||||||
"frac34": u"\u00BE",
|
|
||||||
"frasl": u"\u2044",
|
|
||||||
"gamma": u"\u03B3",
|
|
||||||
"ge": u"\u2265",
|
|
||||||
"gt": u"\u003E",
|
|
||||||
"GT": u"\u003E",
|
"GT": u"\u003E",
|
||||||
"hArr": u"\u21D4",
|
"Gamma;": u"\u0393",
|
||||||
"harr": u"\u2194",
|
"Iacute;": u"\u00CD",
|
||||||
"hearts": u"\u2665",
|
"Iacute": u"\u00CD",
|
||||||
"hellip": u"\u2026",
|
"Icirc;": u"\u00CE",
|
||||||
"iacute": u"\u00ED",
|
"Icirc": u"\u00CE",
|
||||||
"icirc": u"\u00EE",
|
"Igrave;": u"\u00CC",
|
||||||
"iexcl": u"\u00A1",
|
"Igrave": u"\u00CC",
|
||||||
"igrave": u"\u00EC",
|
"Iota;": u"\u0399",
|
||||||
"image": u"\u2111",
|
"Iuml;": u"\u00CF",
|
||||||
"infin": u"\u221E",
|
"Iuml": u"\u00CF",
|
||||||
"int": u"\u222B",
|
"Kappa;": u"\u039A",
|
||||||
"iota": u"\u03B9",
|
"LT;": u"\u003C",
|
||||||
"iquest": u"\u00BF",
|
|
||||||
"isin": u"\u2208",
|
|
||||||
"iuml": u"\u00EF",
|
|
||||||
"kappa": u"\u03BA",
|
|
||||||
"lArr": u"\u21D0",
|
|
||||||
"lambda": u"\u03BB",
|
|
||||||
"lang": u"\u2329",
|
|
||||||
"laquo": u"\u00AB",
|
|
||||||
"larr": u"\u2190",
|
|
||||||
"lceil": u"\u2308",
|
|
||||||
"ldquo": u"\u201C",
|
|
||||||
"le": u"\u2264",
|
|
||||||
"lfloor": u"\u230A",
|
|
||||||
"lowast": u"\u2217",
|
|
||||||
"loz": u"\u25CA",
|
|
||||||
"lrm": u"\u200E",
|
|
||||||
"lsaquo": u"\u2039",
|
|
||||||
"lsquo": u"\u2018",
|
|
||||||
"lt": u"\u003C",
|
|
||||||
"LT": u"\u003C",
|
"LT": u"\u003C",
|
||||||
"macr": u"\u00AF",
|
"Lambda;": u"\u039B",
|
||||||
"mdash": u"\u2014",
|
"Mu;": u"\u039C",
|
||||||
"micro": u"\u00B5",
|
"Ntilde;": u"\u00D1",
|
||||||
"middot": u"\u00B7",
|
"Ntilde": u"\u00D1",
|
||||||
"minus": u"\u2212",
|
"Nu;": u"\u039D",
|
||||||
"mu": u"\u03BC",
|
"OElig;": u"\u0152",
|
||||||
"nabla": u"\u2207",
|
"Oacute;": u"\u00D3",
|
||||||
"nbsp": u"\u00A0",
|
"Oacute": u"\u00D3",
|
||||||
"ndash": u"\u2013",
|
"Ocirc;": u"\u00D4",
|
||||||
"ne": u"\u2260",
|
"Ocirc": u"\u00D4",
|
||||||
"ni": u"\u220B",
|
"Ograve;": u"\u00D2",
|
||||||
"not": u"\u00AC",
|
"Ograve": u"\u00D2",
|
||||||
"notin": u"\u2209",
|
"Omega;": u"\u03A9",
|
||||||
"nsub": u"\u2284",
|
"Omicron;": u"\u039F",
|
||||||
"ntilde": u"\u00F1",
|
"Oslash;": u"\u00D8",
|
||||||
"nu": u"\u03BD",
|
"Oslash": u"\u00D8",
|
||||||
"oacute": u"\u00F3",
|
"Otilde;": u"\u00D5",
|
||||||
"ocirc": u"\u00F4",
|
"Otilde": u"\u00D5",
|
||||||
"oelig": u"\u0153",
|
"Ouml;": u"\u00D6",
|
||||||
"ograve": u"\u00F2",
|
"Ouml": u"\u00D6",
|
||||||
"oline": u"\u203E",
|
"Phi;": u"\u03A6",
|
||||||
"omega": u"\u03C9",
|
"Pi;": u"\u03A0",
|
||||||
"omicron": u"\u03BF",
|
"Prime;": u"\u2033",
|
||||||
"oplus": u"\u2295",
|
"Psi;": u"\u03A8",
|
||||||
"or": u"\u2228",
|
"QUOT;": u"\u0022",
|
||||||
"ordf": u"\u00AA",
|
|
||||||
"ordm": u"\u00BA",
|
|
||||||
"oslash": u"\u00F8",
|
|
||||||
"otilde": u"\u00F5",
|
|
||||||
"otimes": u"\u2297",
|
|
||||||
"ouml": u"\u00F6",
|
|
||||||
"para": u"\u00B6",
|
|
||||||
"part": u"\u2202",
|
|
||||||
"permil": u"\u2030",
|
|
||||||
"perp": u"\u22A5",
|
|
||||||
"phi": u"\u03C6",
|
|
||||||
"pi": u"\u03C0",
|
|
||||||
"piv": u"\u03D6",
|
|
||||||
"plusmn": u"\u00B1",
|
|
||||||
"pound": u"\u00A3",
|
|
||||||
"prime": u"\u2032",
|
|
||||||
"prod": u"\u220F",
|
|
||||||
"prop": u"\u221D",
|
|
||||||
"psi": u"\u03C8",
|
|
||||||
"quot": u"\u0022",
|
|
||||||
"QUOT": u"\u0022",
|
"QUOT": u"\u0022",
|
||||||
"rArr": u"\u21D2",
|
"REG;": u"\u00AE",
|
||||||
"radic": u"\u221A",
|
|
||||||
"rang": u"\u232A",
|
|
||||||
"raquo": u"\u00BB",
|
|
||||||
"rarr": u"\u2192",
|
|
||||||
"rceil": u"\u2309",
|
|
||||||
"rdquo": u"\u201D",
|
|
||||||
"real": u"\u211C",
|
|
||||||
"reg": u"\u00AE",
|
|
||||||
"REG": u"\u00AE",
|
"REG": u"\u00AE",
|
||||||
"rfloor": u"\u230B",
|
"Rho;": u"\u03A1",
|
||||||
"rho": u"\u03C1",
|
"Scaron;": u"\u0160",
|
||||||
"rlm": u"\u200F",
|
"Sigma;": u"\u03A3",
|
||||||
"rsaquo": u"\u203A",
|
"THORN;": u"\u00DE",
|
||||||
"rsquo": u"\u2019",
|
"THORN": u"\u00DE",
|
||||||
"sbquo": u"\u201A",
|
"TRADE;": u"\u2122",
|
||||||
"scaron": u"\u0161",
|
"Tau;": u"\u03A4",
|
||||||
"sdot": u"\u22C5",
|
"Theta;": u"\u0398",
|
||||||
|
"Uacute;": u"\u00DA",
|
||||||
|
"Uacute": u"\u00DA",
|
||||||
|
"Ucirc;": u"\u00DB",
|
||||||
|
"Ucirc": u"\u00DB",
|
||||||
|
"Ugrave;": u"\u00D9",
|
||||||
|
"Ugrave": u"\u00D9",
|
||||||
|
"Upsilon;": u"\u03A5",
|
||||||
|
"Uuml;": u"\u00DC",
|
||||||
|
"Uuml": u"\u00DC",
|
||||||
|
"Xi;": u"\u039E",
|
||||||
|
"Yacute;": u"\u00DD",
|
||||||
|
"Yacute": u"\u00DD",
|
||||||
|
"Yuml;": u"\u0178",
|
||||||
|
"Zeta;": u"\u0396",
|
||||||
|
"aacute;": u"\u00E1",
|
||||||
|
"aacute": u"\u00E1",
|
||||||
|
"acirc;": u"\u00E2",
|
||||||
|
"acirc": u"\u00E2",
|
||||||
|
"acute;": u"\u00B4",
|
||||||
|
"acute": u"\u00B4",
|
||||||
|
"aelig;": u"\u00E6",
|
||||||
|
"aelig": u"\u00E6",
|
||||||
|
"agrave;": u"\u00E0",
|
||||||
|
"agrave": u"\u00E0",
|
||||||
|
"alefsym;": u"\u2135",
|
||||||
|
"alpha;": u"\u03B1",
|
||||||
|
"amp;": u"\u0026",
|
||||||
|
"amp": u"\u0026",
|
||||||
|
"and;": u"\u2227",
|
||||||
|
"ang;": u"\u2220",
|
||||||
|
"apos;": u"\u0027",
|
||||||
|
"aring;": u"\u00E5",
|
||||||
|
"aring": u"\u00E5",
|
||||||
|
"asymp;": u"\u2248",
|
||||||
|
"atilde;": u"\u00E3",
|
||||||
|
"atilde": u"\u00E3",
|
||||||
|
"auml;": u"\u00E4",
|
||||||
|
"auml": u"\u00E4",
|
||||||
|
"bdquo;": u"\u201E",
|
||||||
|
"beta;": u"\u03B2",
|
||||||
|
"brvbar;": u"\u00A6",
|
||||||
|
"brvbar": u"\u00A6",
|
||||||
|
"bull;": u"\u2022",
|
||||||
|
"cap;": u"\u2229",
|
||||||
|
"ccedil;": u"\u00E7",
|
||||||
|
"ccedil": u"\u00E7",
|
||||||
|
"cedil;": u"\u00B8",
|
||||||
|
"cedil": u"\u00B8",
|
||||||
|
"cent;": u"\u00A2",
|
||||||
|
"cent": u"\u00A2",
|
||||||
|
"chi;": u"\u03C7",
|
||||||
|
"circ;": u"\u02C6",
|
||||||
|
"clubs;": u"\u2663",
|
||||||
|
"cong;": u"\u2245",
|
||||||
|
"copy;": u"\u00A9",
|
||||||
|
"copy": u"\u00A9",
|
||||||
|
"crarr;": u"\u21B5",
|
||||||
|
"cup;": u"\u222A",
|
||||||
|
"curren;": u"\u00A4",
|
||||||
|
"curren": u"\u00A4",
|
||||||
|
"dArr;": u"\u21D3",
|
||||||
|
"dagger;": u"\u2020",
|
||||||
|
"darr;": u"\u2193",
|
||||||
|
"deg;": u"\u00B0",
|
||||||
|
"deg": u"\u00B0",
|
||||||
|
"delta;": u"\u03B4",
|
||||||
|
"diams;": u"\u2666",
|
||||||
|
"divide;": u"\u00F7",
|
||||||
|
"divide": u"\u00F7",
|
||||||
|
"eacute;": u"\u00E9",
|
||||||
|
"eacute": u"\u00E9",
|
||||||
|
"ecirc;": u"\u00EA",
|
||||||
|
"ecirc": u"\u00EA",
|
||||||
|
"egrave;": u"\u00E8",
|
||||||
|
"egrave": u"\u00E8",
|
||||||
|
"empty;": u"\u2205",
|
||||||
|
"emsp;": u"\u2003",
|
||||||
|
"ensp;": u"\u2002",
|
||||||
|
"epsilon;": u"\u03B5",
|
||||||
|
"equiv;": u"\u2261",
|
||||||
|
"eta;": u"\u03B7",
|
||||||
|
"eth;": u"\u00F0",
|
||||||
|
"eth": u"\u00F0",
|
||||||
|
"euml;": u"\u00EB",
|
||||||
|
"euml": u"\u00EB",
|
||||||
|
"euro;": u"\u20AC",
|
||||||
|
"exist;": u"\u2203",
|
||||||
|
"fnof;": u"\u0192",
|
||||||
|
"forall;": u"\u2200",
|
||||||
|
"frac12;": u"\u00BD",
|
||||||
|
"frac12": u"\u00BD",
|
||||||
|
"frac14;": u"\u00BC",
|
||||||
|
"frac14": u"\u00BC",
|
||||||
|
"frac34;": u"\u00BE",
|
||||||
|
"frac34": u"\u00BE",
|
||||||
|
"frasl;": u"\u2044",
|
||||||
|
"gamma;": u"\u03B3",
|
||||||
|
"ge;": u"\u2265",
|
||||||
|
"gt;": u"\u003E",
|
||||||
|
"gt": u"\u003E",
|
||||||
|
"hArr;": u"\u21D4",
|
||||||
|
"harr;": u"\u2194",
|
||||||
|
"hearts;": u"\u2665",
|
||||||
|
"hellip;": u"\u2026",
|
||||||
|
"iacute;": u"\u00ED",
|
||||||
|
"iacute": u"\u00ED",
|
||||||
|
"icirc;": u"\u00EE",
|
||||||
|
"icirc": u"\u00EE",
|
||||||
|
"iexcl;": u"\u00A1",
|
||||||
|
"iexcl": u"\u00A1",
|
||||||
|
"igrave;": u"\u00EC",
|
||||||
|
"igrave": u"\u00EC",
|
||||||
|
"image;": u"\u2111",
|
||||||
|
"infin;": u"\u221E",
|
||||||
|
"int;": u"\u222B",
|
||||||
|
"iota;": u"\u03B9",
|
||||||
|
"iquest;": u"\u00BF",
|
||||||
|
"iquest": u"\u00BF",
|
||||||
|
"isin;": u"\u2208",
|
||||||
|
"iuml;": u"\u00EF",
|
||||||
|
"iuml": u"\u00EF",
|
||||||
|
"kappa;": u"\u03BA",
|
||||||
|
"lArr;": u"\u21D0",
|
||||||
|
"lambda;": u"\u03BB",
|
||||||
|
"lang;": u"\u3008",
|
||||||
|
"laquo;": u"\u00AB",
|
||||||
|
"laquo": u"\u00AB",
|
||||||
|
"larr;": u"\u2190",
|
||||||
|
"lceil;": u"\u2308",
|
||||||
|
"ldquo;": u"\u201C",
|
||||||
|
"le;": u"\u2264",
|
||||||
|
"lfloor;": u"\u230A",
|
||||||
|
"lowast;": u"\u2217",
|
||||||
|
"loz;": u"\u25CA",
|
||||||
|
"lrm;": u"\u200E",
|
||||||
|
"lsaquo;": u"\u2039",
|
||||||
|
"lsquo;": u"\u2018",
|
||||||
|
"lt;": u"\u003C",
|
||||||
|
"lt": u"\u003C",
|
||||||
|
"macr;": u"\u00AF",
|
||||||
|
"macr": u"\u00AF",
|
||||||
|
"mdash;": u"\u2014",
|
||||||
|
"micro;": u"\u00B5",
|
||||||
|
"micro": u"\u00B5",
|
||||||
|
"middot;": u"\u00B7",
|
||||||
|
"middot": u"\u00B7",
|
||||||
|
"minus;": u"\u2212",
|
||||||
|
"mu;": u"\u03BC",
|
||||||
|
"nabla;": u"\u2207",
|
||||||
|
"nbsp;": u"\u00A0",
|
||||||
|
"nbsp": u"\u00A0",
|
||||||
|
"ndash;": u"\u2013",
|
||||||
|
"ne;": u"\u2260",
|
||||||
|
"ni;": u"\u220B",
|
||||||
|
"not;": u"\u00AC",
|
||||||
|
"not": u"\u00AC",
|
||||||
|
"notin;": u"\u2209",
|
||||||
|
"nsub;": u"\u2284",
|
||||||
|
"ntilde;": u"\u00F1",
|
||||||
|
"ntilde": u"\u00F1",
|
||||||
|
"nu;": u"\u03BD",
|
||||||
|
"oacute;": u"\u00F3",
|
||||||
|
"oacute": u"\u00F3",
|
||||||
|
"ocirc;": u"\u00F4",
|
||||||
|
"ocirc": u"\u00F4",
|
||||||
|
"oelig;": u"\u0153",
|
||||||
|
"ograve;": u"\u00F2",
|
||||||
|
"ograve": u"\u00F2",
|
||||||
|
"oline;": u"\u203E",
|
||||||
|
"omega;": u"\u03C9",
|
||||||
|
"omicron;": u"\u03BF",
|
||||||
|
"oplus;": u"\u2295",
|
||||||
|
"or;": u"\u2228",
|
||||||
|
"ordf;": u"\u00AA",
|
||||||
|
"ordf": u"\u00AA",
|
||||||
|
"ordm;": u"\u00BA",
|
||||||
|
"ordm": u"\u00BA",
|
||||||
|
"oslash;": u"\u00F8",
|
||||||
|
"oslash": u"\u00F8",
|
||||||
|
"otilde;": u"\u00F5",
|
||||||
|
"otilde": u"\u00F5",
|
||||||
|
"otimes;": u"\u2297",
|
||||||
|
"ouml;": u"\u00F6",
|
||||||
|
"ouml": u"\u00F6",
|
||||||
|
"para;": u"\u00B6",
|
||||||
|
"para": u"\u00B6",
|
||||||
|
"part;": u"\u2202",
|
||||||
|
"permil;": u"\u2030",
|
||||||
|
"perp;": u"\u22A5",
|
||||||
|
"phi;": u"\u03C6",
|
||||||
|
"pi;": u"\u03C0",
|
||||||
|
"piv;": u"\u03D6",
|
||||||
|
"plusmn;": u"\u00B1",
|
||||||
|
"plusmn": u"\u00B1",
|
||||||
|
"pound;": u"\u00A3",
|
||||||
|
"pound": u"\u00A3",
|
||||||
|
"prime;": u"\u2032",
|
||||||
|
"prod;": u"\u220F",
|
||||||
|
"prop;": u"\u221D",
|
||||||
|
"psi;": u"\u03C8",
|
||||||
|
"quot;": u"\u0022",
|
||||||
|
"quot": u"\u0022",
|
||||||
|
"rArr;": u"\u21D2",
|
||||||
|
"radic;": u"\u221A",
|
||||||
|
"rang;": u"\u3009",
|
||||||
|
"raquo;": u"\u00BB",
|
||||||
|
"raquo": u"\u00BB",
|
||||||
|
"rarr;": u"\u2192",
|
||||||
|
"rceil;": u"\u2309",
|
||||||
|
"rdquo;": u"\u201D",
|
||||||
|
"real;": u"\u211C",
|
||||||
|
"reg;": u"\u00AE",
|
||||||
|
"reg": u"\u00AE",
|
||||||
|
"rfloor;": u"\u230B",
|
||||||
|
"rho;": u"\u03C1",
|
||||||
|
"rlm;": u"\u200F",
|
||||||
|
"rsaquo;": u"\u203A",
|
||||||
|
"rsquo;": u"\u2019",
|
||||||
|
"sbquo;": u"\u201A",
|
||||||
|
"scaron;": u"\u0161",
|
||||||
|
"sdot;": u"\u22C5",
|
||||||
|
"sect;": u"\u00A7",
|
||||||
"sect": u"\u00A7",
|
"sect": u"\u00A7",
|
||||||
|
"shy;": u"\u00AD",
|
||||||
"shy": u"\u00AD",
|
"shy": u"\u00AD",
|
||||||
"sigma": u"\u03C3",
|
"sigma;": u"\u03C3",
|
||||||
"sigmaf": u"\u03C2",
|
"sigmaf;": u"\u03C2",
|
||||||
"sim": u"\u223C",
|
"sim;": u"\u223C",
|
||||||
"spades": u"\u2660",
|
"spades;": u"\u2660",
|
||||||
"sub": u"\u2282",
|
"sub;": u"\u2282",
|
||||||
"sube": u"\u2286",
|
"sube;": u"\u2286",
|
||||||
"sum": u"\u2211",
|
"sum;": u"\u2211",
|
||||||
"sup": u"\u2283",
|
"sup1;": u"\u00B9",
|
||||||
"sup1": u"\u00B9",
|
"sup1": u"\u00B9",
|
||||||
|
"sup2;": u"\u00B2",
|
||||||
"sup2": u"\u00B2",
|
"sup2": u"\u00B2",
|
||||||
|
"sup3;": u"\u00B3",
|
||||||
"sup3": u"\u00B3",
|
"sup3": u"\u00B3",
|
||||||
"supe": u"\u2287",
|
"sup;": u"\u2283",
|
||||||
|
"supe;": u"\u2287",
|
||||||
|
"szlig;": u"\u00DF",
|
||||||
"szlig": u"\u00DF",
|
"szlig": u"\u00DF",
|
||||||
"tau": u"\u03C4",
|
"tau;": u"\u03C4",
|
||||||
"there4": u"\u2234",
|
"there4;": u"\u2234",
|
||||||
"theta": u"\u03B8",
|
"theta;": u"\u03B8",
|
||||||
"thetasym": u"\u03D1",
|
"thetasym;": u"\u03D1",
|
||||||
"thinsp": u"\u2009",
|
"thinsp;": u"\u2009",
|
||||||
|
"thorn;": u"\u00FE",
|
||||||
"thorn": u"\u00FE",
|
"thorn": u"\u00FE",
|
||||||
"tilde": u"\u02DC",
|
"tilde;": u"\u02DC",
|
||||||
|
"times;": u"\u00D7",
|
||||||
"times": u"\u00D7",
|
"times": u"\u00D7",
|
||||||
"trade": u"\u2122",
|
"trade;": u"\u2122",
|
||||||
"uArr": u"\u21D1",
|
"uArr;": u"\u21D1",
|
||||||
|
"uacute;": u"\u00FA",
|
||||||
"uacute": u"\u00FA",
|
"uacute": u"\u00FA",
|
||||||
"uarr": u"\u2191",
|
"uarr;": u"\u2191",
|
||||||
|
"ucirc;": u"\u00FB",
|
||||||
"ucirc": u"\u00FB",
|
"ucirc": u"\u00FB",
|
||||||
|
"ugrave;": u"\u00F9",
|
||||||
"ugrave": u"\u00F9",
|
"ugrave": u"\u00F9",
|
||||||
|
"uml;": u"\u00A8",
|
||||||
"uml": u"\u00A8",
|
"uml": u"\u00A8",
|
||||||
"upsih": u"\u03D2",
|
"upsih;": u"\u03D2",
|
||||||
"upsilon": u"\u03C5",
|
"upsilon;": u"\u03C5",
|
||||||
|
"uuml;": u"\u00FC",
|
||||||
"uuml": u"\u00FC",
|
"uuml": u"\u00FC",
|
||||||
"weierp": u"\u2118",
|
"weierp;": u"\u2118",
|
||||||
"xi": u"\u03BE",
|
"xi;": u"\u03BE",
|
||||||
|
"yacute;": u"\u00FD",
|
||||||
"yacute": u"\u00FD",
|
"yacute": u"\u00FD",
|
||||||
|
"yen;": u"\u00A5",
|
||||||
"yen": u"\u00A5",
|
"yen": u"\u00A5",
|
||||||
|
"yuml;": u"\u00FF",
|
||||||
"yuml": u"\u00FF",
|
"yuml": u"\u00FF",
|
||||||
"zeta": u"\u03B6",
|
"zeta;": u"\u03B6",
|
||||||
"zwj": u"\u200D",
|
"zwj;": u"\u200D",
|
||||||
"zwnj": u"\u200C"
|
"zwnj;": u"\u200C"
|
||||||
}
|
}
|
||||||
|
|
||||||
encodings = frozenset((
|
encodings = frozenset((
|
0
planet/vendor/html5lib/filters/__init__.py
vendored
Normal file
0
planet/vendor/html5lib/filters/__init__.py
vendored
Normal file
10
planet/vendor/html5lib/filters/_base.py
vendored
Normal file
10
planet/vendor/html5lib/filters/_base.py
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
|
||||||
|
class Filter(object):
|
||||||
|
def __init__(self, source):
|
||||||
|
self.source = source
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter(self.source)
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
return getattr(self.source, name)
|
63
planet/vendor/html5lib/filters/inject_meta_charset.py
vendored
Normal file
63
planet/vendor/html5lib/filters/inject_meta_charset.py
vendored
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
import _base
|
||||||
|
|
||||||
|
class Filter(_base.Filter):
|
||||||
|
def __init__(self, source, encoding):
|
||||||
|
_base.Filter.__init__(self, source)
|
||||||
|
self.encoding = encoding
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
state = "pre_head"
|
||||||
|
meta_found = (self.encoding is None)
|
||||||
|
pending = []
|
||||||
|
|
||||||
|
for token in _base.Filter.__iter__(self):
|
||||||
|
type = token["type"]
|
||||||
|
if type == "StartTag":
|
||||||
|
if token["name"].lower() == "head":
|
||||||
|
state = "in_head"
|
||||||
|
|
||||||
|
elif type == "EmptyTag":
|
||||||
|
if token["name"].lower() == "meta":
|
||||||
|
# replace charset with actual encoding
|
||||||
|
has_http_equiv_content_type = False
|
||||||
|
content_index = -1
|
||||||
|
for i,(name,value) in enumerate(token["data"]):
|
||||||
|
if name.lower() == 'charset':
|
||||||
|
token["data"][i] = (u'charset', self.encoding)
|
||||||
|
meta_found = True
|
||||||
|
break
|
||||||
|
elif name == 'http-equiv' and value.lower() == 'content-type':
|
||||||
|
has_http_equiv_content_type = True
|
||||||
|
elif name == 'content':
|
||||||
|
content_index = i
|
||||||
|
else:
|
||||||
|
if has_http_equiv_content_type and content_index >= 0:
|
||||||
|
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
|
||||||
|
meta_found = True
|
||||||
|
|
||||||
|
elif token["name"].lower() == "head" and not meta_found:
|
||||||
|
# insert meta into empty head
|
||||||
|
yield {"type": "StartTag", "name": "head",
|
||||||
|
"data": token["data"]}
|
||||||
|
yield {"type": "EmptyTag", "name": "meta",
|
||||||
|
"data": [["charset", self.encoding]]}
|
||||||
|
yield {"type": "EndTag", "name": "head"}
|
||||||
|
meta_found = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif type == "EndTag":
|
||||||
|
if token["name"].lower() == "head" and pending:
|
||||||
|
# insert meta into head (if necessary) and flush pending queue
|
||||||
|
yield pending.pop(0)
|
||||||
|
if not meta_found:
|
||||||
|
yield {"type": "EmptyTag", "name": "meta",
|
||||||
|
"data": [["charset", self.encoding]]}
|
||||||
|
while pending:
|
||||||
|
yield pending.pop(0)
|
||||||
|
meta_found = True
|
||||||
|
state = "post_head"
|
||||||
|
|
||||||
|
if state == "in_head":
|
||||||
|
pending.append(token)
|
||||||
|
else:
|
||||||
|
yield token
|
90
planet/vendor/html5lib/filters/lint.py
vendored
Normal file
90
planet/vendor/html5lib/filters/lint.py
vendored
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
from gettext import gettext
|
||||||
|
_ = gettext
|
||||||
|
|
||||||
|
import _base
|
||||||
|
from html5lib.constants import cdataElements, rcdataElements, voidElements
|
||||||
|
|
||||||
|
from html5lib.constants import spaceCharacters
|
||||||
|
spaceCharacters = u"".join(spaceCharacters)
|
||||||
|
|
||||||
|
class LintError(Exception): pass
|
||||||
|
|
||||||
|
class Filter(_base.Filter):
|
||||||
|
def __iter__(self):
|
||||||
|
open_elements = []
|
||||||
|
contentModelFlag = "PCDATA"
|
||||||
|
for token in _base.Filter.__iter__(self):
|
||||||
|
type = token["type"]
|
||||||
|
if type in ("StartTag", "EmptyTag"):
|
||||||
|
name = token["name"]
|
||||||
|
if contentModelFlag != "PCDATA":
|
||||||
|
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
|
||||||
|
if not isinstance(name, unicode):
|
||||||
|
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||||
|
if not name:
|
||||||
|
raise LintError(_(u"Empty tag name"))
|
||||||
|
if type == "StartTag" and name in voidElements:
|
||||||
|
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
|
||||||
|
elif type == "EmptyTag" and name not in voidElements:
|
||||||
|
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
|
||||||
|
if type == "StartTag":
|
||||||
|
open_elements.append(name)
|
||||||
|
for name, value in token["data"]:
|
||||||
|
if not isinstance(name, unicode):
|
||||||
|
raise LintError(_("Attribute name is not a string: %r") % name)
|
||||||
|
if not name:
|
||||||
|
raise LintError(_(u"Empty attribute name"))
|
||||||
|
if not isinstance(value, unicode):
|
||||||
|
raise LintError(_("Attribute value is not a string: %r") % value)
|
||||||
|
if name in cdataElements:
|
||||||
|
contentModelFlag = "CDATA"
|
||||||
|
elif name in rcdataElements:
|
||||||
|
contentModelFlag = "RCDATA"
|
||||||
|
elif name == "plaintext":
|
||||||
|
contentModelFlag = "PLAINTEXT"
|
||||||
|
|
||||||
|
elif type == "EndTag":
|
||||||
|
name = token["name"]
|
||||||
|
if not isinstance(name, unicode):
|
||||||
|
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||||
|
if not name:
|
||||||
|
raise LintError(_(u"Empty tag name"))
|
||||||
|
if name in voidElements:
|
||||||
|
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
|
||||||
|
start_name = open_elements.pop()
|
||||||
|
if start_name != name:
|
||||||
|
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
|
||||||
|
contentModelFlag = "PCDATA"
|
||||||
|
|
||||||
|
elif type == "Comment":
|
||||||
|
if contentModelFlag != "PCDATA":
|
||||||
|
raise LintError(_("Comment not in PCDATA content model flag"))
|
||||||
|
|
||||||
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
|
data = token["data"]
|
||||||
|
if not isinstance(data, unicode):
|
||||||
|
raise LintError(_("Attribute name is not a string: %r") % data)
|
||||||
|
if not data:
|
||||||
|
raise LintError(_(u"%s token with empty data") % type)
|
||||||
|
if type == "SpaceCharacters":
|
||||||
|
data = data.strip(spaceCharacters)
|
||||||
|
if data:
|
||||||
|
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
|
||||||
|
|
||||||
|
elif type == "Doctype":
|
||||||
|
name = token["name"]
|
||||||
|
if contentModelFlag != "PCDATA":
|
||||||
|
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
||||||
|
if not isinstance(name, unicode):
|
||||||
|
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||||
|
if not name:
|
||||||
|
raise LintError(_(u"Empty tag name"))
|
||||||
|
# XXX: what to do with token["data"] ?
|
||||||
|
|
||||||
|
elif type in ("ParseError", "SerializeError"):
|
||||||
|
pass
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise LintError(_(u"Unknown token type: %s") % type)
|
||||||
|
|
||||||
|
yield token
|
175
planet/vendor/html5lib/filters/optionaltags.py
vendored
Normal file
175
planet/vendor/html5lib/filters/optionaltags.py
vendored
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
import _base
|
||||||
|
|
||||||
|
class Filter(_base.Filter):
|
||||||
|
def slider(self):
|
||||||
|
previous1 = previous2 = None
|
||||||
|
for token in self.source:
|
||||||
|
if previous1 is not None:
|
||||||
|
yield previous2, previous1, token
|
||||||
|
previous2 = previous1
|
||||||
|
previous1 = token
|
||||||
|
yield previous2, previous1, None
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
for previous, token, next in self.slider():
|
||||||
|
type = token["type"]
|
||||||
|
if type == "StartTag":
|
||||||
|
if token["data"] or not self.is_optional_start(token["name"], previous, next):
|
||||||
|
yield token
|
||||||
|
elif type == "EndTag":
|
||||||
|
if not self.is_optional_end(token["name"], next):
|
||||||
|
yield token
|
||||||
|
else:
|
||||||
|
yield token
|
||||||
|
|
||||||
|
def is_optional_start(self, tagname, previous, next):
|
||||||
|
type = next and next["type"] or None
|
||||||
|
if tagname in 'html':
|
||||||
|
# An html element's start tag may be omitted if the first thing
|
||||||
|
# inside the html element is not a space character or a comment.
|
||||||
|
return type not in ("Comment", "SpaceCharacters")
|
||||||
|
elif tagname == 'head':
|
||||||
|
# A head element's start tag may be omitted if the first thing
|
||||||
|
# inside the head element is an element.
|
||||||
|
return type == "StartTag"
|
||||||
|
elif tagname == 'body':
|
||||||
|
# A body element's start tag may be omitted if the first thing
|
||||||
|
# inside the body element is not a space character or a comment,
|
||||||
|
# except if the first thing inside the body element is a script
|
||||||
|
# or style element and the node immediately preceding the body
|
||||||
|
# element is a head element whose end tag has been omitted.
|
||||||
|
if type in ("Comment", "SpaceCharacters"):
|
||||||
|
return False
|
||||||
|
elif type == "StartTag":
|
||||||
|
# XXX: we do not look at the preceding event, so we never omit
|
||||||
|
# the body element's start tag if it's followed by a script or
|
||||||
|
# a style element.
|
||||||
|
return next["name"] not in ('script', 'style')
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
elif tagname == 'colgroup':
|
||||||
|
# A colgroup element's start tag may be omitted if the first thing
|
||||||
|
# inside the colgroup element is a col element, and if the element
|
||||||
|
# is not immediately preceeded by another colgroup element whose
|
||||||
|
# end tag has been omitted.
|
||||||
|
if type == "StartTag":
|
||||||
|
# XXX: we do not look at the preceding event, so instead we never
|
||||||
|
# omit the colgroup element's end tag when it is immediately
|
||||||
|
# followed by another colgroup element. See is_optional_end.
|
||||||
|
return next["name"] == "col"
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
elif tagname == 'tbody':
|
||||||
|
# A tbody element's start tag may be omitted if the first thing
|
||||||
|
# inside the tbody element is a tr element, and if the element is
|
||||||
|
# not immediately preceeded by a tbody, thead, or tfoot element
|
||||||
|
# whose end tag has been omitted.
|
||||||
|
if type == "StartTag":
|
||||||
|
# omit the thead and tfoot elements' end tag when they are
|
||||||
|
# immediately followed by a tbody element. See is_optional_end.
|
||||||
|
if previous and previous['type'] == 'EndTag' and \
|
||||||
|
previous['name'] in ('tbody','thead','tfoot'):
|
||||||
|
return False
|
||||||
|
return next["name"] == 'tr'
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_optional_end(self, tagname, next):
|
||||||
|
type = next and next["type"] or None
|
||||||
|
if tagname in ('html', 'head', 'body'):
|
||||||
|
# An html element's end tag may be omitted if the html element
|
||||||
|
# is not immediately followed by a space character or a comment.
|
||||||
|
return type not in ("Comment", "SpaceCharacters")
|
||||||
|
elif tagname in ('li', 'optgroup', 'option', 'tr'):
|
||||||
|
# A li element's end tag may be omitted if the li element is
|
||||||
|
# immediately followed by another li element or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
# An optgroup element's end tag may be omitted if the optgroup
|
||||||
|
# element is immediately followed by another optgroup element,
|
||||||
|
# or if there is no more content in the parent element.
|
||||||
|
# An option element's end tag may be omitted if the option
|
||||||
|
# element is immediately followed by another option element,
|
||||||
|
# or if there is no more content in the parent element.
|
||||||
|
# A tr element's end tag may be omitted if the tr element is
|
||||||
|
# immediately followed by another tr element, or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] == tagname
|
||||||
|
else:
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
elif tagname in ('dt', 'dd'):
|
||||||
|
# A dt element's end tag may be omitted if the dt element is
|
||||||
|
# immediately followed by another dt element or a dd element.
|
||||||
|
# A dd element's end tag may be omitted if the dd element is
|
||||||
|
# immediately followed by another dd element or a dt element,
|
||||||
|
# or if there is no more content in the parent element.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] in ('dt', 'dd')
|
||||||
|
elif tagname == 'dd':
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
elif tagname == 'p':
|
||||||
|
# A p element's end tag may be omitted if the p element is
|
||||||
|
# immediately followed by an address, blockquote, dl, fieldset,
|
||||||
|
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
|
||||||
|
# or ul element, or if there is no more content in the parent
|
||||||
|
# element.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] in ('address', 'blockquote', \
|
||||||
|
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
|
||||||
|
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
|
||||||
|
else:
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
elif tagname == 'colgroup':
|
||||||
|
# A colgroup element's end tag may be omitted if the colgroup
|
||||||
|
# element is not immediately followed by a space character or
|
||||||
|
# a comment.
|
||||||
|
if type in ("Comment", "SpaceCharacters"):
|
||||||
|
return False
|
||||||
|
elif type == "StartTag":
|
||||||
|
# XXX: we also look for an immediately following colgroup
|
||||||
|
# element. See is_optional_start.
|
||||||
|
return next["name"] != 'colgroup'
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
elif tagname in ('thead', 'tbody'):
|
||||||
|
# A thead element's end tag may be omitted if the thead element
|
||||||
|
# is immediately followed by a tbody or tfoot element.
|
||||||
|
# A tbody element's end tag may be omitted if the tbody element
|
||||||
|
# is immediately followed by a tbody or tfoot element, or if
|
||||||
|
# there is no more content in the parent element.
|
||||||
|
# A tfoot element's end tag may be omitted if the tfoot element
|
||||||
|
# is immediately followed by a tbody element, or if there is no
|
||||||
|
# more content in the parent element.
|
||||||
|
# XXX: we never omit the end tag when the following element is
|
||||||
|
# a tbody. See is_optional_start.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] in ['tbody', 'tfoot']
|
||||||
|
elif tagname == 'tbody':
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
elif tagname == 'tfoot':
|
||||||
|
# A tfoot element's end tag may be omitted if the tfoot element
|
||||||
|
# is immediately followed by a tbody element, or if there is no
|
||||||
|
# more content in the parent element.
|
||||||
|
# XXX: we never omit the end tag when the following element is
|
||||||
|
# a tbody. See is_optional_start.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] == 'tbody'
|
||||||
|
else:
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
elif tagname in ('td', 'th'):
|
||||||
|
# A td element's end tag may be omitted if the td element is
|
||||||
|
# immediately followed by a td or th element, or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
# A th element's end tag may be omitted if the th element is
|
||||||
|
# immediately followed by a td or th element, or if there is
|
||||||
|
# no more content in the parent element.
|
||||||
|
if type == "StartTag":
|
||||||
|
return next["name"] in ('td', 'th')
|
||||||
|
else:
|
||||||
|
return type == "EndTag" or type is None
|
||||||
|
return False
|
38
planet/vendor/html5lib/filters/whitespace.py
vendored
Normal file
38
planet/vendor/html5lib/filters/whitespace.py
vendored
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import _base
|
||||||
|
from html5lib.constants import rcdataElements, spaceCharacters
|
||||||
|
spaceCharacters = u"".join(spaceCharacters)
|
||||||
|
|
||||||
|
class Filter(_base.Filter):
|
||||||
|
|
||||||
|
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
preserve = 0
|
||||||
|
for token in _base.Filter.__iter__(self):
|
||||||
|
type = token["type"]
|
||||||
|
if type == "StartTag" \
|
||||||
|
and (preserve or token["name"] in self.spacePreserveElements):
|
||||||
|
preserve += 1
|
||||||
|
|
||||||
|
elif type == "EndTag" and preserve:
|
||||||
|
preserve -= 1
|
||||||
|
|
||||||
|
elif not preserve and type == "SpaceCharacters":
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif not preserve and type == "Characters":
|
||||||
|
token["data"] = collapse_spaces(token["data"])
|
||||||
|
|
||||||
|
yield token
|
||||||
|
|
||||||
|
def collapse_spaces(text):
|
||||||
|
return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text)
|
||||||
|
|
@ -3,14 +3,14 @@
|
|||||||
# * Phases and insertion modes are one concept in parser.py.
|
# * Phases and insertion modes are one concept in parser.py.
|
||||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||||
# always exist.
|
# always exist.
|
||||||
# * We also deal with content when there's no DOCTYPE.
|
# * </br> creates a <br> element.
|
||||||
# It is expected that the specification will catch up with us in due course ;-)
|
#
|
||||||
|
# We haven't updated DOCTYPE handling yet
|
||||||
#
|
#
|
||||||
# It should be trivial to add the following cases. However, we should probably
|
# It should be trivial to add the following cases. However, we should probably
|
||||||
# also look into comment handling and such then...
|
# also look into comment handling and such then...
|
||||||
# * A <p> element end tag creates an empty <p> element when there's no <p>
|
# * A <p> element end tag creates an empty <p> element when there's no <p>
|
||||||
# element in scope.
|
# element in scope.
|
||||||
# * A <br> element end tag creates an empty <br> element.
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
frozenset
|
frozenset
|
||||||
@ -20,6 +20,7 @@ except NameError:
|
|||||||
from sets import ImmutableSet as frozenset
|
from sets import ImmutableSet as frozenset
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
import sys
|
||||||
|
|
||||||
import tokenizer
|
import tokenizer
|
||||||
|
|
||||||
@ -30,27 +31,32 @@ from treebuilders import simpletree
|
|||||||
import utils
|
import utils
|
||||||
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
|
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
|
||||||
from constants import scopingElements, formattingElements, specialElements
|
from constants import scopingElements, formattingElements, specialElements
|
||||||
from constants import headingElements, tableInsertModeElements, voidElements
|
from constants import headingElements, tableInsertModeElements
|
||||||
|
from constants import cdataElements, rcdataElements, voidElements
|
||||||
|
|
||||||
class HTMLParser(object):
|
class HTMLParser(object):
|
||||||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
"""HTML parser. Generates a tree structure from a stream of (possibly
|
||||||
malformed) HTML"""
|
malformed) HTML"""
|
||||||
|
|
||||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
|
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
|
||||||
"""
|
"""
|
||||||
strict - raise an exception when a parse error is encountered
|
strict - raise an exception when a parse error is encountered
|
||||||
|
|
||||||
tree - a treebuilder class controlling the type of tree that will be
|
tree - a treebuilder class controlling the type of tree that will be
|
||||||
returned. This class is almost always a subclass of
|
returned. Built in treebuilders can be accessed through
|
||||||
html5lib.treebuilders._base.TreeBuilder
|
html5lib.treebuilders.getTreeBuilder(treeType)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Raise an exception on the first error encountered
|
# Raise an exception on the first error encountered
|
||||||
self.strict = strict
|
self.strict = strict
|
||||||
|
|
||||||
self.tree = tree()
|
self.tree = tree()
|
||||||
|
self.tokenizer_class = tokenizer
|
||||||
self.errors = []
|
self.errors = []
|
||||||
|
|
||||||
|
# "quirks" / "almost-standards" / "standards"
|
||||||
|
self.quirksMode = "standards"
|
||||||
|
|
||||||
self.phases = {
|
self.phases = {
|
||||||
"initial": InitialPhase(self, self.tree),
|
"initial": InitialPhase(self, self.tree),
|
||||||
"rootElement": RootElementPhase(self, self.tree),
|
"rootElement": RootElementPhase(self, self.tree),
|
||||||
@ -78,15 +84,15 @@ class HTMLParser(object):
|
|||||||
self.firstStartTag = False
|
self.firstStartTag = False
|
||||||
self.errors = []
|
self.errors = []
|
||||||
|
|
||||||
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
|
self.tokenizer = self.tokenizer_class(stream, encoding,
|
||||||
parseMeta=innerHTML)
|
parseMeta=not innerHTML)
|
||||||
|
|
||||||
if innerHTML:
|
if innerHTML:
|
||||||
self.innerHTML = container.lower()
|
self.innerHTML = container.lower()
|
||||||
|
|
||||||
if self.innerHTML in ('title', 'textarea'):
|
if self.innerHTML in cdataElements:
|
||||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
|
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
|
||||||
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
|
elif self.innerHTML in rcdataElements:
|
||||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
|
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
|
||||||
elif self.innerHTML == 'plaintext':
|
elif self.innerHTML == 'plaintext':
|
||||||
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
|
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
|
||||||
@ -113,10 +119,12 @@ class HTMLParser(object):
|
|||||||
method = getattr(self.phase, "process%s" % type, None)
|
method = getattr(self.phase, "process%s" % type, None)
|
||||||
if type in ("Characters", "SpaceCharacters", "Comment"):
|
if type in ("Characters", "SpaceCharacters", "Comment"):
|
||||||
method(token["data"])
|
method(token["data"])
|
||||||
elif type in ("StartTag", "Doctype"):
|
elif type == "StartTag":
|
||||||
method(token["name"], token["data"])
|
method(token["name"], token["data"])
|
||||||
elif type == "EndTag":
|
elif type == "EndTag":
|
||||||
method(token["name"])
|
method(token["name"])
|
||||||
|
elif type == "Doctype":
|
||||||
|
method(token["name"], token["publicId"], token["systemId"], token["correct"])
|
||||||
else:
|
else:
|
||||||
self.parseError(token["data"])
|
self.parseError(token["data"])
|
||||||
|
|
||||||
@ -158,10 +166,6 @@ class HTMLParser(object):
|
|||||||
if self.strict:
|
if self.strict:
|
||||||
raise ParseError
|
raise ParseError
|
||||||
|
|
||||||
def atheistParseError(self):
|
|
||||||
"""This error is not an error"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
def normalizeToken(self, token):
|
def normalizeToken(self, token):
|
||||||
""" HTML5 specific normalizations to the token stream """
|
""" HTML5 specific normalizations to the token stream """
|
||||||
|
|
||||||
@ -171,9 +175,7 @@ class HTMLParser(object):
|
|||||||
# element. If it matches a void element atheists did the wrong
|
# element. If it matches a void element atheists did the wrong
|
||||||
# thing and if it doesn't it's wrong for everyone.
|
# thing and if it doesn't it's wrong for everyone.
|
||||||
|
|
||||||
if token["name"] in voidElements:
|
if token["name"] not in voidElements:
|
||||||
self.atheistParseError()
|
|
||||||
else:
|
|
||||||
self.parseError(_("Solidus (/) incorrectly placed in tag."))
|
self.parseError(_("Solidus (/) incorrectly placed in tag."))
|
||||||
|
|
||||||
token["type"] = "StartTag"
|
token["type"] = "StartTag"
|
||||||
@ -283,7 +285,7 @@ class Phase(object):
|
|||||||
# overridden.
|
# overridden.
|
||||||
self.tree.insertComment(data, self.tree.openElements[-1])
|
self.tree.insertComment(data, self.tree.openElements[-1])
|
||||||
|
|
||||||
def processDoctype(self, name, error):
|
def processDoctype(self, name, publicId, systemId, correct):
|
||||||
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
|
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
|
||||||
|
|
||||||
def processSpaceCharacters(self, data):
|
def processSpaceCharacters(self, data):
|
||||||
@ -319,10 +321,101 @@ class InitialPhase(Phase):
|
|||||||
def processComment(self, data):
|
def processComment(self, data):
|
||||||
self.tree.insertComment(data, self.tree.document)
|
self.tree.insertComment(data, self.tree.document)
|
||||||
|
|
||||||
def processDoctype(self, name, error):
|
def processDoctype(self, name, publicId, systemId, correct):
|
||||||
if error:
|
nameLower = name.translate(asciiUpper2Lower)
|
||||||
|
if nameLower != "html" or publicId != None or\
|
||||||
|
systemId != None:
|
||||||
self.parser.parseError(_("Erroneous DOCTYPE."))
|
self.parser.parseError(_("Erroneous DOCTYPE."))
|
||||||
|
# XXX need to update DOCTYPE tokens
|
||||||
self.tree.insertDoctype(name)
|
self.tree.insertDoctype(name)
|
||||||
|
|
||||||
|
if publicId == None:
|
||||||
|
publicId = ""
|
||||||
|
if publicId != "":
|
||||||
|
publicId = publicId.translate(asciiUpper2Lower)
|
||||||
|
|
||||||
|
if nameLower != "html":
|
||||||
|
# XXX quirks mode
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if publicId in\
|
||||||
|
("+//silmaril//dtd html pro v0r11 19970101//en",
|
||||||
|
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
|
||||||
|
"-//as//dtd html 3.0 aswedit + extensions//en",
|
||||||
|
"-//ietf//dtd html 2.0 level 1//en",
|
||||||
|
"-//ietf//dtd html 2.0 level 2//en",
|
||||||
|
"-//ietf//dtd html 2.0 strict level 1//en",
|
||||||
|
"-//ietf//dtd html 2.0 strict level 2//en",
|
||||||
|
"-//ietf//dtd html 2.0 strict//en",
|
||||||
|
"-//ietf//dtd html 2.0//en",
|
||||||
|
"-//ietf//dtd html 2.1e//en",
|
||||||
|
"-//ietf//dtd html 3.0//en",
|
||||||
|
"-//ietf//dtd html 3.0//en//",
|
||||||
|
"-//ietf//dtd html 3.2 final//en",
|
||||||
|
"-//ietf//dtd html 3.2//en",
|
||||||
|
"-//ietf//dtd html 3//en",
|
||||||
|
"-//ietf//dtd html level 0//en",
|
||||||
|
"-//ietf//dtd html level 0//en//2.0",
|
||||||
|
"-//ietf//dtd html level 1//en",
|
||||||
|
"-//ietf//dtd html level 1//en//2.0",
|
||||||
|
"-//ietf//dtd html level 2//en",
|
||||||
|
"-//ietf//dtd html level 2//en//2.0",
|
||||||
|
"-//ietf//dtd html level 3//en",
|
||||||
|
"-//ietf//dtd html level 3//en//3.0",
|
||||||
|
"-//ietf//dtd html strict level 0//en",
|
||||||
|
"-//ietf//dtd html strict level 0//en//2.0",
|
||||||
|
"-//ietf//dtd html strict level 1//en",
|
||||||
|
"-//ietf//dtd html strict level 1//en//2.0",
|
||||||
|
"-//ietf//dtd html strict level 2//en",
|
||||||
|
"-//ietf//dtd html strict level 2//en//2.0",
|
||||||
|
"-//ietf//dtd html strict level 3//en",
|
||||||
|
"-//ietf//dtd html strict level 3//en//3.0",
|
||||||
|
"-//ietf//dtd html strict//en",
|
||||||
|
"-//ietf//dtd html strict//en//2.0",
|
||||||
|
"-//ietf//dtd html strict//en//3.0",
|
||||||
|
"-//ietf//dtd html//en",
|
||||||
|
"-//ietf//dtd html//en//2.0",
|
||||||
|
"-//ietf//dtd html//en//3.0",
|
||||||
|
"-//metrius//dtd metrius presentational//en",
|
||||||
|
"-//microsoft//dtd internet explorer 2.0 html strict//en",
|
||||||
|
"-//microsoft//dtd internet explorer 2.0 html//en",
|
||||||
|
"-//microsoft//dtd internet explorer 2.0 tables//en",
|
||||||
|
"-//microsoft//dtd internet explorer 3.0 html strict//en",
|
||||||
|
"-//microsoft//dtd internet explorer 3.0 html//en",
|
||||||
|
"-//microsoft//dtd internet explorer 3.0 tables//en",
|
||||||
|
"-//netscape comm. corp.//dtd html//en",
|
||||||
|
"-//netscape comm. corp.//dtd strict html//en",
|
||||||
|
"-//o'reilly and associates//dtd html 2.0//en",
|
||||||
|
"-//o'reilly and associates//dtd html extended 1.0//en",
|
||||||
|
"-//spyglass//dtd html 2.0 extended//en",
|
||||||
|
"-//sq//dtd html 2.0 hotmetal + extensions//en",
|
||||||
|
"-//sun microsystems corp.//dtd hotjava html//en",
|
||||||
|
"-//sun microsystems corp.//dtd hotjava strict html//en",
|
||||||
|
"-//w3c//dtd html 3 1995-03-24//en",
|
||||||
|
"-//w3c//dtd html 3.2 draft//en",
|
||||||
|
"-//w3c//dtd html 3.2 final//en",
|
||||||
|
"-//w3c//dtd html 3.2//en",
|
||||||
|
"-//w3c//dtd html 3.2s draft//en",
|
||||||
|
"-//w3c//dtd html 4.0 frameset//en",
|
||||||
|
"-//w3c//dtd html 4.0 transitional//en",
|
||||||
|
"-//w3c//dtd html experimental 19960712//en",
|
||||||
|
"-//w3c//dtd html experimental 970421//en",
|
||||||
|
"-//w3c//dtd w3 html//en",
|
||||||
|
"-//w3o//dtd w3 html 3.0//en",
|
||||||
|
"-//w3o//dtd w3 html 3.0//en//",
|
||||||
|
"-//w3o//dtd w3 html strict 3.0//en//",
|
||||||
|
"-//webtechs//dtd mozilla html 2.0//en",
|
||||||
|
"-//webtechs//dtd mozilla html//en",
|
||||||
|
"-/w3c/dtd html 4.0 transitional/en",
|
||||||
|
"html")\
|
||||||
|
or (publicId in\
|
||||||
|
("-//w3c//dtd html 4.01 frameset//EN",
|
||||||
|
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
|
||||||
|
or (systemId != None and\
|
||||||
|
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
|
||||||
|
#XXX quirks mode
|
||||||
|
pass
|
||||||
|
|
||||||
self.parser.phase = self.parser.phases["rootElement"]
|
self.parser.phase = self.parser.phases["rootElement"]
|
||||||
|
|
||||||
def processSpaceCharacters(self, data):
|
def processSpaceCharacters(self, data):
|
||||||
@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase):
|
|||||||
self.startTagHandler.default = self.startTagOther
|
self.startTagHandler.default = self.startTagOther
|
||||||
|
|
||||||
self.endTagHandler = utils.MethodDispatcher([
|
self.endTagHandler = utils.MethodDispatcher([
|
||||||
("html", self.endTagHtml)
|
(("html", "head", "body", "br"), self.endTagImplyHead)
|
||||||
])
|
])
|
||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
||||||
@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase):
|
|||||||
self.startTagHead("head", {})
|
self.startTagHead("head", {})
|
||||||
self.parser.phase.processStartTag(name, attributes)
|
self.parser.phase.processStartTag(name, attributes)
|
||||||
|
|
||||||
def endTagHtml(self, name):
|
def endTagImplyHead(self, name):
|
||||||
self.startTagHead("head", {})
|
self.startTagHead("head", {})
|
||||||
self.parser.phase.processEndTag(name)
|
self.parser.phase.processEndTag(name)
|
||||||
|
|
||||||
@ -437,7 +530,7 @@ class InHeadPhase(Phase):
|
|||||||
|
|
||||||
self. endTagHandler = utils.MethodDispatcher([
|
self. endTagHandler = utils.MethodDispatcher([
|
||||||
("head", self.endTagHead),
|
("head", self.endTagHead),
|
||||||
("html", self.endTagHtml),
|
(("html", "body", "br"), self.endTagImplyAfterHead),
|
||||||
(("title", "style", "script"), self.endTagTitleStyleScript)
|
(("title", "style", "script"), self.endTagTitleStyleScript)
|
||||||
])
|
])
|
||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
@ -499,7 +592,11 @@ class InHeadPhase(Phase):
|
|||||||
|
|
||||||
def startTagBaseLinkMeta(self, name, attributes):
|
def startTagBaseLinkMeta(self, name, attributes):
|
||||||
element = self.tree.createElement(name, attributes)
|
element = self.tree.createElement(name, attributes)
|
||||||
self.appendToHead(element)
|
if (self.tree.headPointer is not None and
|
||||||
|
self.parser.phase == self.parser.phases["inHead"]):
|
||||||
|
self.appendToHead(element)
|
||||||
|
else:
|
||||||
|
self.tree.openElements[-1].appendChild(element)
|
||||||
|
|
||||||
def startTagOther(self, name, attributes):
|
def startTagOther(self, name, attributes):
|
||||||
self.anythingElse()
|
self.anythingElse()
|
||||||
@ -512,7 +609,7 @@ class InHeadPhase(Phase):
|
|||||||
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
|
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
|
||||||
self.parser.phase = self.parser.phases["afterHead"]
|
self.parser.phase = self.parser.phases["afterHead"]
|
||||||
|
|
||||||
def endTagHtml(self, name):
|
def endTagImplyAfterHead(self, name):
|
||||||
self.anythingElse()
|
self.anythingElse()
|
||||||
self.parser.phase.processEndTag(name)
|
self.parser.phase.processEndTag(name)
|
||||||
|
|
||||||
@ -592,9 +689,9 @@ class InBodyPhase(Phase):
|
|||||||
|
|
||||||
self.startTagHandler = utils.MethodDispatcher([
|
self.startTagHandler = utils.MethodDispatcher([
|
||||||
("html", self.startTagHtml),
|
("html", self.startTagHtml),
|
||||||
(("script", "style"), self.startTagScriptStyle),
|
(("base", "link", "meta", "script", "style"),
|
||||||
(("base", "link", "meta", "title"),
|
self.startTagProcessInHead),
|
||||||
self.startTagFromHead),
|
("title", self.startTagTitle),
|
||||||
("body", self.startTagBody),
|
("body", self.startTagBody),
|
||||||
(("address", "blockquote", "center", "dir", "div", "dl",
|
(("address", "blockquote", "center", "dir", "div", "dl",
|
||||||
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
|
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
|
||||||
@ -604,8 +701,9 @@ class InBodyPhase(Phase):
|
|||||||
("plaintext",self.startTagPlaintext),
|
("plaintext",self.startTagPlaintext),
|
||||||
(headingElements, self.startTagHeading),
|
(headingElements, self.startTagHeading),
|
||||||
("a", self.startTagA),
|
("a", self.startTagA),
|
||||||
(("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
|
(("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
|
||||||
"strong", "tt", "u"),self.startTagFormatting),
|
"tt", "u"),self.startTagFormatting),
|
||||||
|
("nobr", self.startTagNobr),
|
||||||
("button", self.startTagButton),
|
("button", self.startTagButton),
|
||||||
(("marquee", "object"), self.startTagMarqueeObject),
|
(("marquee", "object"), self.startTagMarqueeObject),
|
||||||
("xmp", self.startTagXmp),
|
("xmp", self.startTagXmp),
|
||||||
@ -642,7 +740,8 @@ class InBodyPhase(Phase):
|
|||||||
(("head", "frameset", "select", "optgroup", "option", "table",
|
(("head", "frameset", "select", "optgroup", "option", "table",
|
||||||
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
|
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
|
||||||
"td", "th"), self.endTagMisplaced),
|
"td", "th"), self.endTagMisplaced),
|
||||||
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
|
("br", self.endTagBr),
|
||||||
|
(("area", "basefont", "bgsound", "embed", "hr", "image",
|
||||||
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
|
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
|
||||||
self.endTagNone),
|
self.endTagNone),
|
||||||
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
|
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
|
||||||
@ -659,11 +758,13 @@ class InBodyPhase(Phase):
|
|||||||
self.tree.openElements[-1])
|
self.tree.openElements[-1])
|
||||||
|
|
||||||
# the real deal
|
# the real deal
|
||||||
def processSpaceCharactersPre(self, data):
|
def processSpaceCharactersDropNewline(self, data):
|
||||||
#Sometimes (start of <pre> blocks) we want to drop leading newlines
|
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
|
||||||
|
# leading newlines
|
||||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||||
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
|
if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
|
||||||
and not self.tree.openElements[-1].hasContent()):
|
or self.tree.openElements[-1].name == "textarea")
|
||||||
|
and not self.tree.openElements[-1].hasContent()):
|
||||||
data = data[1:]
|
data = data[1:]
|
||||||
if data:
|
if data:
|
||||||
self.tree.insertText(data)
|
self.tree.insertText(data)
|
||||||
@ -675,10 +776,10 @@ class InBodyPhase(Phase):
|
|||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.tree.insertText(data)
|
self.tree.insertText(data)
|
||||||
|
|
||||||
def startTagScriptStyle(self, name, attributes):
|
def startTagProcessInHead(self, name, attributes):
|
||||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||||
|
|
||||||
def startTagFromHead(self, name, attributes):
|
def startTagTitle(self, name, attributes):
|
||||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||||
") that belongs in the head. Moved."))
|
") that belongs in the head. Moved."))
|
||||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||||
@ -698,7 +799,7 @@ class InBodyPhase(Phase):
|
|||||||
self.endTagP("p")
|
self.endTagP("p")
|
||||||
self.tree.insertElement(name, attributes)
|
self.tree.insertElement(name, attributes)
|
||||||
if name == "pre":
|
if name == "pre":
|
||||||
self.processSpaceCharacters = self.processSpaceCharactersPre
|
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
||||||
|
|
||||||
def startTagForm(self, name, attributes):
|
def startTagForm(self, name, attributes):
|
||||||
if self.tree.formPointer:
|
if self.tree.formPointer:
|
||||||
@ -717,10 +818,17 @@ class InBodyPhase(Phase):
|
|||||||
# AT Use reversed in Python 2.4...
|
# AT Use reversed in Python 2.4...
|
||||||
for i, node in enumerate(self.tree.openElements[::-1]):
|
for i, node in enumerate(self.tree.openElements[::-1]):
|
||||||
if node.name in stopName:
|
if node.name in stopName:
|
||||||
|
poppedNodes = []
|
||||||
for j in range(i+1):
|
for j in range(i+1):
|
||||||
self.tree.openElements.pop()
|
poppedNodes.append(self.tree.openElements.pop())
|
||||||
|
if i >= 1:
|
||||||
|
self.parser.parseError("Missing end tag%s (%s)"%
|
||||||
|
(i > 1 and "s" or "",
|
||||||
|
", ".join([item.name for item in
|
||||||
|
poppedNodes[:-1]])))
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
# Phrasing elements are all non special, non scoping, non
|
# Phrasing elements are all non special, non scoping, non
|
||||||
# formatting elements
|
# formatting elements
|
||||||
if (node.name in (specialElements | scopingElements)
|
if (node.name in (specialElements | scopingElements)
|
||||||
@ -738,14 +846,16 @@ class InBodyPhase(Phase):
|
|||||||
def startTagHeading(self, name, attributes):
|
def startTagHeading(self, name, attributes):
|
||||||
if self.tree.elementInScope("p"):
|
if self.tree.elementInScope("p"):
|
||||||
self.endTagP("p")
|
self.endTagP("p")
|
||||||
for item in headingElements:
|
# Uncomment the following for IE7 behavior:
|
||||||
if self.tree.elementInScope(item):
|
#
|
||||||
self.parser.parseError(_("Unexpected start tag (" + name +\
|
#for item in headingElements:
|
||||||
")."))
|
# if self.tree.elementInScope(item):
|
||||||
item = self.tree.openElements.pop()
|
# self.parser.parseError(_("Unexpected start tag (" + name +\
|
||||||
while item.name not in headingElements:
|
# ")."))
|
||||||
item = self.tree.openElements.pop()
|
# item = self.tree.openElements.pop()
|
||||||
break
|
# while item.name not in headingElements:
|
||||||
|
# item = self.tree.openElements.pop()
|
||||||
|
# break
|
||||||
self.tree.insertElement(name, attributes)
|
self.tree.insertElement(name, attributes)
|
||||||
|
|
||||||
def startTagA(self, name, attributes):
|
def startTagA(self, name, attributes):
|
||||||
@ -765,6 +875,12 @@ class InBodyPhase(Phase):
|
|||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
self.addFormattingElement(name, attributes)
|
self.addFormattingElement(name, attributes)
|
||||||
|
|
||||||
|
def startTagNobr(self, name, attributes):
|
||||||
|
self.tree.reconstructActiveFormattingElements()
|
||||||
|
if self.tree.elementInScope("nobr"):
|
||||||
|
self.processEndTag("nobr")
|
||||||
|
self.addFormattingElement(name, attributes)
|
||||||
|
|
||||||
def startTagButton(self, name, attributes):
|
def startTagButton(self, name, attributes):
|
||||||
if self.tree.elementInScope("button"):
|
if self.tree.elementInScope("button"):
|
||||||
self.parser.parseError(_("Unexpected start tag (button) implied "
|
self.parser.parseError(_("Unexpected start tag (button) implied "
|
||||||
@ -840,6 +956,7 @@ class InBodyPhase(Phase):
|
|||||||
# XXX Form element pointer checking here as well...
|
# XXX Form element pointer checking here as well...
|
||||||
self.tree.insertElement(name, attributes)
|
self.tree.insertElement(name, attributes)
|
||||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
|
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
|
||||||
|
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
|
||||||
|
|
||||||
def startTagCdata(self, name, attributes):
|
def startTagCdata(self, name, attributes):
|
||||||
"""iframe, noembed noframes, noscript(if scripting enabled)"""
|
"""iframe, noembed noframes, noscript(if scripting enabled)"""
|
||||||
@ -861,11 +978,13 @@ class InBodyPhase(Phase):
|
|||||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||||
u"). Ignored."))
|
u"). Ignored."))
|
||||||
|
|
||||||
def startTagNew(self, name, other):
|
def startTagNew(self, name, attributes):
|
||||||
"""New HTML5 elements, "event-source", "section", "nav",
|
"""New HTML5 elements, "event-source", "section", "nav",
|
||||||
"article", "aside", "header", "footer", "datagrid", "command"
|
"article", "aside", "header", "footer", "datagrid", "command"
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
|
||||||
|
self.startTagOther(name, attributes)
|
||||||
|
#raise NotImplementedError
|
||||||
|
|
||||||
def startTagOther(self, name, attributes):
|
def startTagOther(self, name, attributes):
|
||||||
self.tree.reconstructActiveFormattingElements()
|
self.tree.reconstructActiveFormattingElements()
|
||||||
@ -1082,6 +1201,12 @@ class InBodyPhase(Phase):
|
|||||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||||
u"). Ignored."))
|
u"). Ignored."))
|
||||||
|
|
||||||
|
def endTagBr(self, name):
|
||||||
|
self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
|
||||||
|
self.tree.reconstructActiveFormattingElements()
|
||||||
|
self.tree.insertElement(name, {})
|
||||||
|
self.tree.openElements.pop()
|
||||||
|
|
||||||
def endTagNone(self, name):
|
def endTagNone(self, name):
|
||||||
# This handles elements with no end tag.
|
# This handles elements with no end tag.
|
||||||
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
|
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
|
||||||
@ -1097,7 +1222,9 @@ class InBodyPhase(Phase):
|
|||||||
"""New HTML5 elements, "event-source", "section", "nav",
|
"""New HTML5 elements, "event-source", "section", "nav",
|
||||||
"article", "aside", "header", "footer", "datagrid", "command"
|
"article", "aside", "header", "footer", "datagrid", "command"
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name)
|
||||||
|
self.endTagOther(name)
|
||||||
|
#raise NotImplementedError
|
||||||
|
|
||||||
def endTagOther(self, name):
|
def endTagOther(self, name):
|
||||||
# XXX This logic should be moved into the treebuilder
|
# XXX This logic should be moved into the treebuilder
|
||||||
@ -1222,10 +1349,10 @@ class InTablePhase(Phase):
|
|||||||
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
|
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
|
||||||
u"table context caused voodoo mode."))
|
u"table context caused voodoo mode."))
|
||||||
# Make all the special element rearranging voodoo kick in
|
# Make all the special element rearranging voodoo kick in
|
||||||
self.parser.insertFromTable = True
|
self.tree.insertFromTable = True
|
||||||
# Process the end tag in the "in body" mode
|
# Process the end tag in the "in body" mode
|
||||||
self.parser.phases["inBody"].processEndTag(name)
|
self.parser.phases["inBody"].processEndTag(name)
|
||||||
self.parser.insertFromTable = False
|
self.tree.insertFromTable = False
|
||||||
|
|
||||||
|
|
||||||
class InCaptionPhase(Phase):
|
class InCaptionPhase(Phase):
|
||||||
@ -1699,7 +1826,7 @@ class AfterBodyPhase(Phase):
|
|||||||
def __init__(self, parser, tree):
|
def __init__(self, parser, tree):
|
||||||
Phase.__init__(self, parser, tree)
|
Phase.__init__(self, parser, tree)
|
||||||
|
|
||||||
# XXX We should prolly add a handler for "html" here as well...
|
# XXX We should prolly add a handler for here as well...
|
||||||
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
|
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
|
||||||
self.endTagHandler.default = self.endTagOther
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
@ -31,15 +31,17 @@ class HTMLInputStream(object):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
# List of where new lines occur
|
# List of where new lines occur
|
||||||
self.newLines = []
|
self.newLines = [0]
|
||||||
|
|
||||||
# Raw Stream
|
# Raw Stream
|
||||||
self.rawStream = self.openStream(source)
|
self.rawStream = self.openStream(source)
|
||||||
|
|
||||||
# Encoding Information
|
# Encoding Information
|
||||||
#Number of bytes to use when looking for a meta element with
|
#Number of bytes to use when looking for a meta element with
|
||||||
#encoding information
|
#encoding information
|
||||||
self.numBytesMeta = 512
|
self.numBytesMeta = 512
|
||||||
|
#Number of bytes to use when using detecting encoding using chardet
|
||||||
|
self.numBytesChardet = 100
|
||||||
#Encoding to use if no other information can be found
|
#Encoding to use if no other information can be found
|
||||||
self.defaultEncoding = "windows-1252"
|
self.defaultEncoding = "windows-1252"
|
||||||
|
|
||||||
@ -48,20 +50,12 @@ class HTMLInputStream(object):
|
|||||||
encoding = self.detectEncoding(parseMeta, chardet)
|
encoding = self.detectEncoding(parseMeta, chardet)
|
||||||
self.charEncoding = encoding
|
self.charEncoding = encoding
|
||||||
|
|
||||||
# Read bytes from stream decoding them into Unicode
|
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
||||||
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
|
||||||
|
|
||||||
# Normalize new ipythonlines and null characters
|
|
||||||
uString = re.sub('\r\n?', '\n', uString)
|
|
||||||
uString = re.sub('\x00', u'\uFFFD', uString)
|
|
||||||
|
|
||||||
# Convert the unicode string into a list to be used as the data stream
|
|
||||||
self.dataStream = uString
|
|
||||||
|
|
||||||
self.queue = []
|
self.queue = []
|
||||||
|
|
||||||
# Reset position in the list to read from
|
self.line = self.col = 0
|
||||||
self.reset()
|
self.lineLengths = []
|
||||||
|
|
||||||
def openStream(self, source):
|
def openStream(self, source):
|
||||||
"""Produces a file object from source.
|
"""Produces a file object from source.
|
||||||
@ -74,6 +68,8 @@ class HTMLInputStream(object):
|
|||||||
stream = source
|
stream = source
|
||||||
else:
|
else:
|
||||||
# Otherwise treat source as a string and convert to a file object
|
# Otherwise treat source as a string and convert to a file object
|
||||||
|
if isinstance(source, unicode):
|
||||||
|
source = source.encode('utf-8')
|
||||||
import cStringIO
|
import cStringIO
|
||||||
stream = cStringIO.StringIO(str(source))
|
stream = cStringIO.StringIO(str(source))
|
||||||
return stream
|
return stream
|
||||||
@ -90,10 +86,18 @@ class HTMLInputStream(object):
|
|||||||
#Guess with chardet, if avaliable
|
#Guess with chardet, if avaliable
|
||||||
if encoding is None and chardet:
|
if encoding is None and chardet:
|
||||||
try:
|
try:
|
||||||
import chardet
|
from chardet.universaldetector import UniversalDetector
|
||||||
buffer = self.rawStream.read()
|
buffers = []
|
||||||
encoding = chardet.detect(buffer)['encoding']
|
detector = UniversalDetector()
|
||||||
self.rawStream = self.openStream(buffer)
|
while not detector.done:
|
||||||
|
buffer = self.rawStream.read(self.numBytesChardet)
|
||||||
|
if not buffer:
|
||||||
|
break
|
||||||
|
buffers.append(buffer)
|
||||||
|
detector.feed(buffer)
|
||||||
|
detector.close()
|
||||||
|
encoding = detector.result['encoding']
|
||||||
|
self.seek("".join(buffers), 0)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
# If all else fails use the default encoding
|
# If all else fails use the default encoding
|
||||||
@ -119,60 +123,83 @@ class HTMLInputStream(object):
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Go to beginning of file and read in 4 bytes
|
# Go to beginning of file and read in 4 bytes
|
||||||
self.rawStream.seek(0)
|
|
||||||
string = self.rawStream.read(4)
|
string = self.rawStream.read(4)
|
||||||
|
|
||||||
# Try detecting the BOM using bytes from the string
|
# Try detecting the BOM using bytes from the string
|
||||||
encoding = bomDict.get(string[:3]) # UTF-8
|
encoding = bomDict.get(string[:3]) # UTF-8
|
||||||
seek = 3
|
seek = 3
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = bomDict.get(string[:2]) # UTF-16
|
# Need to detect UTF-32 before UTF-16
|
||||||
seek = 2
|
encoding = bomDict.get(string) # UTF-32
|
||||||
|
seek = 4
|
||||||
if not encoding:
|
if not encoding:
|
||||||
encoding = bomDict.get(string) # UTF-32
|
encoding = bomDict.get(string[:2]) # UTF-16
|
||||||
seek = 4
|
seek = 2
|
||||||
|
|
||||||
#AT - move this to the caller?
|
|
||||||
# Set the read position past the BOM if one was found, otherwise
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
# set it to the start of the stream
|
# set it to the start of the stream
|
||||||
self.rawStream.seek(encoding and seek or 0)
|
self.seek(string, encoding and seek or 0)
|
||||||
|
|
||||||
return encoding
|
return encoding
|
||||||
|
|
||||||
|
def seek(self, buffer, n):
|
||||||
|
"""Unget buffer[n:]"""
|
||||||
|
if hasattr(self.rawStream, 'unget'):
|
||||||
|
self.rawStream.unget(buffer[n:])
|
||||||
|
return
|
||||||
|
|
||||||
|
if hasattr(self.rawStream, 'seek'):
|
||||||
|
try:
|
||||||
|
self.rawStream.seek(n)
|
||||||
|
return
|
||||||
|
except IOError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class BufferedStream:
|
||||||
|
def __init__(self, data, stream):
|
||||||
|
self.data = data
|
||||||
|
self.stream = stream
|
||||||
|
def read(self, chars=-1):
|
||||||
|
if chars == -1 or chars > len(self.data):
|
||||||
|
result = self.data
|
||||||
|
self.data = ''
|
||||||
|
if chars == -1:
|
||||||
|
return result + self.stream.read()
|
||||||
|
else:
|
||||||
|
return result + self.stream.read(chars-len(result))
|
||||||
|
elif not self.data:
|
||||||
|
return self.stream.read(chars)
|
||||||
|
else:
|
||||||
|
result = self.data[:chars]
|
||||||
|
self.data = self.data[chars:]
|
||||||
|
return result
|
||||||
|
def unget(self, data):
|
||||||
|
if self.data:
|
||||||
|
self.data += data
|
||||||
|
else:
|
||||||
|
self.data = data
|
||||||
|
|
||||||
|
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
|
||||||
|
|
||||||
def detectEncodingMeta(self):
|
def detectEncodingMeta(self):
|
||||||
"""Report the encoding declared by the meta element
|
"""Report the encoding declared by the meta element
|
||||||
"""
|
"""
|
||||||
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
|
buffer = self.rawStream.read(self.numBytesMeta)
|
||||||
self.rawStream.seek(0)
|
parser = EncodingParser(buffer)
|
||||||
|
self.seek(buffer, 0)
|
||||||
return parser.getEncoding()
|
return parser.getEncoding()
|
||||||
|
|
||||||
def determineNewLines(self):
|
|
||||||
# Looks through the stream to find where new lines occur so
|
|
||||||
# the position method can tell where it is.
|
|
||||||
self.newLines.append(0)
|
|
||||||
for i in xrange(len(self.dataStream)):
|
|
||||||
if self.dataStream[i] == u"\n":
|
|
||||||
self.newLines.append(i)
|
|
||||||
|
|
||||||
def position(self):
|
def position(self):
|
||||||
"""Returns (line, col) of the current position in the stream."""
|
"""Returns (line, col) of the current position in the stream."""
|
||||||
# Generate list of new lines first time around
|
line, col = self.line, self.col
|
||||||
if not self.newLines:
|
for c in self.queue[::-1]:
|
||||||
self.determineNewLines()
|
if c == '\n':
|
||||||
|
line -= 1
|
||||||
line = 0
|
assert col == 0
|
||||||
tell = self.tell
|
col = self.lineLengths[line]
|
||||||
for pos in self.newLines:
|
|
||||||
if pos < tell:
|
|
||||||
line += 1
|
|
||||||
else:
|
else:
|
||||||
break
|
col -= 1
|
||||||
col = tell - self.newLines[line-1] - 1
|
return (line + 1, col)
|
||||||
return (line, col)
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
"""Resets the position in the stream back to the start."""
|
|
||||||
self.tell = 0
|
|
||||||
|
|
||||||
def char(self):
|
def char(self):
|
||||||
""" Read one character from the stream or queue if available. Return
|
""" Read one character from the stream or queue if available. Return
|
||||||
@ -181,12 +208,28 @@ class HTMLInputStream(object):
|
|||||||
if self.queue:
|
if self.queue:
|
||||||
return self.queue.pop(0)
|
return self.queue.pop(0)
|
||||||
else:
|
else:
|
||||||
try:
|
c = self.dataStream.read(1, 1)
|
||||||
self.tell += 1
|
if not c:
|
||||||
return self.dataStream[self.tell - 1]
|
self.col += 1
|
||||||
except:
|
|
||||||
return EOF
|
return EOF
|
||||||
|
|
||||||
|
# Normalize newlines and null characters
|
||||||
|
if c == '\x00': c = u'\uFFFD'
|
||||||
|
if c == '\r':
|
||||||
|
c = self.dataStream.read(1, 1)
|
||||||
|
if c != '\n':
|
||||||
|
self.queue.insert(0, unicode(c))
|
||||||
|
c = '\n'
|
||||||
|
|
||||||
|
# update position in stream
|
||||||
|
if c == '\n':
|
||||||
|
self.lineLengths.append(self.col)
|
||||||
|
self.line += 1
|
||||||
|
self.col = 0
|
||||||
|
else:
|
||||||
|
self.col += 1
|
||||||
|
return unicode(c)
|
||||||
|
|
||||||
def charsUntil(self, characters, opposite = False):
|
def charsUntil(self, characters, opposite = False):
|
||||||
""" Returns a string of characters from the stream up to but not
|
""" Returns a string of characters from the stream up to but not
|
||||||
including any character in characters or EOF. characters can be
|
including any character in characters or EOF. characters can be
|
||||||
@ -194,23 +237,20 @@ class HTMLInputStream(object):
|
|||||||
"""
|
"""
|
||||||
charStack = [self.char()]
|
charStack = [self.char()]
|
||||||
|
|
||||||
# First from the queue
|
|
||||||
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
|
||||||
and self.queue:
|
|
||||||
charStack.append(self.queue.pop(0))
|
|
||||||
|
|
||||||
# Then the rest
|
|
||||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||||
try:
|
charStack.append(self.char())
|
||||||
self.tell += 1
|
|
||||||
charStack.append(self.dataStream[self.tell - 1])
|
|
||||||
except:
|
|
||||||
charStack.append(EOF)
|
|
||||||
|
|
||||||
# Put the character stopped on back to the front of the queue
|
# Put the character stopped on back to the front of the queue
|
||||||
# from where it came.
|
# from where it came.
|
||||||
self.queue.insert(0, charStack.pop())
|
c = charStack.pop()
|
||||||
return "".join(charStack)
|
if c != EOF:
|
||||||
|
self.queue.insert(0, c)
|
||||||
|
|
||||||
|
return u"".join(charStack)
|
||||||
|
|
||||||
|
def unget(self, chars):
|
||||||
|
if chars:
|
||||||
|
self.queue = list(chars) + self.queue
|
||||||
|
|
||||||
class EncodingBytes(str):
|
class EncodingBytes(str):
|
||||||
"""String-like object with an assosiated position and various extra methods
|
"""String-like object with an assosiated position and various extra methods
|
@ -15,10 +15,13 @@ References:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import html5parser
|
import html5parser
|
||||||
from constants import voidElements
|
from constants import voidElements, contentModelFlags
|
||||||
import gettext
|
import gettext
|
||||||
_ = gettext.gettext
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from xml.dom import XHTML_NAMESPACE
|
||||||
|
from xml.sax.saxutils import unescape
|
||||||
|
|
||||||
class XMLParser(html5parser.HTMLParser):
|
class XMLParser(html5parser.HTMLParser):
|
||||||
""" liberal XML parser """
|
""" liberal XML parser """
|
||||||
|
|
||||||
@ -45,6 +48,11 @@ class XMLParser(html5parser.HTMLParser):
|
|||||||
if token["data"]:
|
if token["data"]:
|
||||||
self.parseError(_("End tag contains unexpected attributes."))
|
self.parseError(_("End tag contains unexpected attributes."))
|
||||||
|
|
||||||
|
elif token["type"] == "Characters":
|
||||||
|
# un-escape rcdataElements (e.g. style, script)
|
||||||
|
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
|
||||||
|
token["data"] = unescape(token["data"])
|
||||||
|
|
||||||
elif token["type"] == "Comment":
|
elif token["type"] == "Comment":
|
||||||
# Rescue CDATA from the comments
|
# Rescue CDATA from the comments
|
||||||
if (token["data"].startswith("[CDATA[") and
|
if (token["data"].startswith("[CDATA[") and
|
||||||
@ -66,16 +74,21 @@ class XHTMLParser(XMLParser):
|
|||||||
|
|
||||||
# ensure that non-void XHTML elements have content so that separate
|
# ensure that non-void XHTML elements have content so that separate
|
||||||
# open and close tags are emitted
|
# open and close tags are emitted
|
||||||
if token["type"] == "EndTag" and \
|
if token["type"] == "EndTag":
|
||||||
token["name"] not in voidElements and \
|
if token["name"] in voidElements:
|
||||||
token["name"] == self.tree.openElements[-1].name and \
|
if not self.tree.openElements or \
|
||||||
not self.tree.openElements[-1].hasContent():
|
self.tree.openElements[-1].name != token["name"]:
|
||||||
for e in self.tree.openElements:
|
token["type"] = "EmptyTag"
|
||||||
if 'xmlns' in e.attributes.keys():
|
if not token.has_key("data"): token["data"] = {}
|
||||||
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
|
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
self.tree.insertText('')
|
if token["name"] == self.tree.openElements[-1].name and \
|
||||||
|
not self.tree.openElements[-1].hasContent():
|
||||||
|
for e in self.tree.openElements:
|
||||||
|
if 'xmlns' in e.attributes.keys():
|
||||||
|
if e.attributes['xmlns'] != XHTML_NAMESPACE:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
self.tree.insertText('')
|
||||||
|
|
||||||
return token
|
return token
|
||||||
|
|
189
planet/vendor/html5lib/sanitizer.py
vendored
Normal file
189
planet/vendor/html5lib/sanitizer.py
vendored
Normal file
@ -0,0 +1,189 @@
|
|||||||
|
import re
|
||||||
|
from xml.sax.saxutils import escape, unescape
|
||||||
|
from tokenizer import HTMLTokenizer
|
||||||
|
|
||||||
|
class HTMLSanitizer(HTMLTokenizer):
|
||||||
|
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||||
|
|
||||||
|
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||||
|
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
|
||||||
|
'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
|
||||||
|
'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
|
||||||
|
'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
|
||||||
|
'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
|
||||||
|
'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
|
||||||
|
'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
|
||||||
|
'ul', 'var']
|
||||||
|
|
||||||
|
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||||
|
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||||
|
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||||
|
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||||
|
'munderover', 'none']
|
||||||
|
|
||||||
|
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||||
|
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
|
||||||
|
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
|
||||||
|
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||||
|
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||||
|
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||||
|
|
||||||
|
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||||
|
'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
|
||||||
|
'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
|
||||||
|
'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
|
||||||
|
'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
|
||||||
|
'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
|
||||||
|
'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
|
||||||
|
'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
|
||||||
|
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
|
||||||
|
'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
|
||||||
|
'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
|
||||||
|
'xml:lang']
|
||||||
|
|
||||||
|
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||||
|
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||||
|
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||||
|
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||||
|
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||||
|
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||||
|
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||||
|
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||||
|
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||||
|
|
||||||
|
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||||
|
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||||
|
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||||
|
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
|
||||||
|
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
|
||||||
|
'font-family', 'font-size', 'font-stretch', 'font-style',
|
||||||
|
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
|
||||||
|
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
|
||||||
|
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
|
||||||
|
'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
|
||||||
|
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
|
||||||
|
'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
|
||||||
|
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
||||||
|
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
|
||||||
|
'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
|
||||||
|
'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
|
||||||
|
'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||||
|
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||||
|
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||||
|
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||||
|
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||||
|
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||||
|
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||||
|
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||||
|
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||||
|
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
|
||||||
|
'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
|
||||||
|
'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']
|
||||||
|
|
||||||
|
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
|
||||||
|
'xlink:href', 'xml:base']
|
||||||
|
|
||||||
|
acceptable_css_properties = ['azimuth', 'background-color',
|
||||||
|
'border-bottom-color', 'border-collapse', 'border-color',
|
||||||
|
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||||
|
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||||
|
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||||
|
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||||
|
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||||
|
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||||
|
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||||
|
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||||
|
'white-space', 'width']
|
||||||
|
|
||||||
|
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||||
|
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||||
|
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||||
|
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||||
|
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||||
|
'transparent', 'underline', 'white', 'yellow']
|
||||||
|
|
||||||
|
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
|
||||||
|
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||||
|
'stroke-opacity']
|
||||||
|
|
||||||
|
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
|
||||||
|
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||||
|
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||||
|
'ssh', 'sftp', 'rtsp', 'afs' ]
|
||||||
|
|
||||||
|
# subclasses may define their own versions of these constants
|
||||||
|
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||||
|
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
|
||||||
|
allowed_css_properties = acceptable_css_properties
|
||||||
|
allowed_css_keywords = acceptable_css_keywords
|
||||||
|
allowed_svg_properties = acceptable_svg_properties
|
||||||
|
allowed_protocols = acceptable_protocols
|
||||||
|
|
||||||
|
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||||
|
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||||
|
# attributes are parsed, and a restricted set, # specified by
|
||||||
|
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||||
|
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||||
|
# in ALLOWED_PROTOCOLS are allowed.
|
||||||
|
#
|
||||||
|
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||||
|
# => <script> do_nasty_stuff() </script>
|
||||||
|
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||||
|
# => <a>Click here for $100</a>
|
||||||
|
def __iter__(self):
|
||||||
|
for token in HTMLTokenizer.__iter__(self):
|
||||||
|
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||||
|
if token["name"] in self.allowed_elements:
|
||||||
|
if token.has_key("data"):
|
||||||
|
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||||
|
for attr in self.attr_val_is_uri:
|
||||||
|
if not attrs.has_key(attr): continue
|
||||||
|
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||||
|
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||||
|
del attrs[attr]
|
||||||
|
if attrs.has_key('style'):
|
||||||
|
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||||
|
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||||
|
yield token
|
||||||
|
else:
|
||||||
|
if token["type"] == "EndTag":
|
||||||
|
token["data"] = "</%s>" % token["name"]
|
||||||
|
elif token["data"]:
|
||||||
|
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||||
|
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||||
|
else:
|
||||||
|
token["data"] = "<%s>" % token["name"]
|
||||||
|
if token["type"] == "EmptyTag":
|
||||||
|
token["data"]=token["data"][:-1] + "/>"
|
||||||
|
token["type"] = "Characters"
|
||||||
|
del token["name"]
|
||||||
|
yield token
|
||||||
|
elif token["type"] == "Comment":
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
yield token
|
||||||
|
|
||||||
|
def sanitize_css(self, style):
|
||||||
|
# disallow urls
|
||||||
|
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
|
||||||
|
|
||||||
|
# gauntlet
|
||||||
|
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
|
||||||
|
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
|
||||||
|
|
||||||
|
clean = []
|
||||||
|
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
|
||||||
|
if not value: continue
|
||||||
|
if prop.lower() in self.allowed_css_properties:
|
||||||
|
clean.append(prop + ': ' + value + ';')
|
||||||
|
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
|
||||||
|
for keyword in value.split():
|
||||||
|
if not keyword in self.acceptable_css_keywords and \
|
||||||
|
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
clean.append(prop + ': ' + value + ';')
|
||||||
|
elif prop.lower() in self.allowed_svg_properties:
|
||||||
|
clean.append(prop + ': ' + value + ';')
|
||||||
|
|
||||||
|
return ' '.join(clean)
|
3
planet/vendor/html5lib/serializer/__init__.py
vendored
Normal file
3
planet/vendor/html5lib/serializer/__init__.py
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
from htmlserializer import HTMLSerializer
|
||||||
|
from xhtmlserializer import XHTMLSerializer
|
216
planet/vendor/html5lib/serializer/htmlserializer.py
vendored
Normal file
216
planet/vendor/html5lib/serializer/htmlserializer.py
vendored
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from html5lib.filters.whitespace import Filter as WhitespaceFilter
|
||||||
|
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
|
||||||
|
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||||
|
from html5lib.constants import rcdataElements
|
||||||
|
|
||||||
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
|
spaceCharacters = u"".join(spaceCharacters)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from codecs import register_error, xmlcharrefreplace_errors
|
||||||
|
except ImportError:
|
||||||
|
unicode_encode_errors = "strict"
|
||||||
|
else:
|
||||||
|
unicode_encode_errors = "htmlentityreplace"
|
||||||
|
|
||||||
|
from html5lib.constants import entities
|
||||||
|
|
||||||
|
encode_entity_map = {}
|
||||||
|
for k, v in entities.items():
|
||||||
|
if v != "&" and encode_entity_map.get(v) != k.lower():
|
||||||
|
# prefer < over < and similarly for &, >, etc.
|
||||||
|
encode_entity_map[v] = k
|
||||||
|
|
||||||
|
def htmlentityreplace_errors(exc):
|
||||||
|
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||||
|
res = []
|
||||||
|
for c in ex.object[exc.start:exc.end]:
|
||||||
|
c = encode_entity_map.get(c)
|
||||||
|
if c:
|
||||||
|
res.append("&")
|
||||||
|
res.append(c)
|
||||||
|
res.append(";")
|
||||||
|
else:
|
||||||
|
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
||||||
|
return (u"".join(res), exc.end)
|
||||||
|
else:
|
||||||
|
return xmlcharrefreplace_errors(exc)
|
||||||
|
|
||||||
|
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||||
|
|
||||||
|
del register_error
|
||||||
|
|
||||||
|
def encode(text, encoding):
|
||||||
|
return text.encode(encoding, unicode_encode_errors)
|
||||||
|
|
||||||
|
class HTMLSerializer(object):
|
||||||
|
|
||||||
|
quote_attr_values = False
|
||||||
|
quote_char = '"'
|
||||||
|
use_best_quote_char = True
|
||||||
|
minimize_boolean_attributes = True
|
||||||
|
|
||||||
|
use_trailing_solidus = False
|
||||||
|
space_before_trailing_solidus = True
|
||||||
|
escape_lt_in_attrs = False
|
||||||
|
escape_rcdata = False
|
||||||
|
|
||||||
|
omit_optional_tags = True
|
||||||
|
|
||||||
|
strip_whitespace = False
|
||||||
|
|
||||||
|
inject_meta_charset = True
|
||||||
|
|
||||||
|
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||||
|
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||||
|
"space_before_trailing_solidus", "omit_optional_tags",
|
||||||
|
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||||
|
"escape_rcdata")
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
if kwargs.has_key('quote_char'):
|
||||||
|
self.use_best_quote_char = False
|
||||||
|
for attr in self.options:
|
||||||
|
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||||
|
self.errors = []
|
||||||
|
self.strict = False
|
||||||
|
|
||||||
|
def serialize(self, treewalker, encoding=None):
|
||||||
|
in_cdata = False
|
||||||
|
self.errors = []
|
||||||
|
if encoding and self.inject_meta_charset:
|
||||||
|
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
|
||||||
|
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||||
|
# for maximum efficiently of this latter filter
|
||||||
|
if self.strip_whitespace:
|
||||||
|
treewalker = WhitespaceFilter(treewalker)
|
||||||
|
if self.omit_optional_tags:
|
||||||
|
treewalker = OptionalTagFilter(treewalker)
|
||||||
|
for token in treewalker:
|
||||||
|
type = token["type"]
|
||||||
|
if type == "Doctype":
|
||||||
|
doctype = u"<!DOCTYPE %s>" % token["name"]
|
||||||
|
if encoding:
|
||||||
|
yield doctype.encode(encoding)
|
||||||
|
else:
|
||||||
|
yield doctype
|
||||||
|
|
||||||
|
elif type in ("Characters", "SpaceCharacters"):
|
||||||
|
if type == "SpaceCharacters" or in_cdata:
|
||||||
|
if in_cdata and token["data"].find("</") >= 0:
|
||||||
|
self.serializeError(_("Unexpected </ in CDATA"))
|
||||||
|
if encoding:
|
||||||
|
yield token["data"].encode(encoding, "strict")
|
||||||
|
else:
|
||||||
|
yield token["data"]
|
||||||
|
elif encoding:
|
||||||
|
yield encode(escape(token["data"]), encoding)
|
||||||
|
else:
|
||||||
|
yield escape(token["data"])
|
||||||
|
|
||||||
|
elif type in ("StartTag", "EmptyTag"):
|
||||||
|
name = token["name"]
|
||||||
|
if name in rcdataElements and not self.escape_rcdata:
|
||||||
|
in_cdata = True
|
||||||
|
elif in_cdata:
|
||||||
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||||
|
attrs = token["data"]
|
||||||
|
if hasattr(attrs, "items"):
|
||||||
|
attrs = attrs.items()
|
||||||
|
attrs.sort()
|
||||||
|
attributes = []
|
||||||
|
for k,v in attrs:
|
||||||
|
if encoding:
|
||||||
|
k = k.encode(encoding, "strict")
|
||||||
|
attributes.append(' ')
|
||||||
|
|
||||||
|
attributes.append(k)
|
||||||
|
if not self.minimize_boolean_attributes or \
|
||||||
|
(k not in booleanAttributes.get(name, tuple()) \
|
||||||
|
and k not in booleanAttributes.get("", tuple())):
|
||||||
|
attributes.append("=")
|
||||||
|
if self.quote_attr_values or not v:
|
||||||
|
quote_attr = True
|
||||||
|
else:
|
||||||
|
quote_attr = reduce(lambda x,y: x or (y in v),
|
||||||
|
spaceCharacters + "<>\"'", False)
|
||||||
|
v = v.replace("&", "&")
|
||||||
|
if self.escape_lt_in_attrs: v = v.replace("<", "<")
|
||||||
|
if encoding:
|
||||||
|
v = encode(v, encoding)
|
||||||
|
if quote_attr:
|
||||||
|
quote_char = self.quote_char
|
||||||
|
if self.use_best_quote_char:
|
||||||
|
if "'" in v and '"' not in v:
|
||||||
|
quote_char = '"'
|
||||||
|
elif '"' in v and "'" not in v:
|
||||||
|
quote_char = "'"
|
||||||
|
if quote_char == "'":
|
||||||
|
v = v.replace("'", "'")
|
||||||
|
else:
|
||||||
|
v = v.replace('"', """)
|
||||||
|
attributes.append(quote_char)
|
||||||
|
attributes.append(v)
|
||||||
|
attributes.append(quote_char)
|
||||||
|
else:
|
||||||
|
attributes.append(v)
|
||||||
|
if name in voidElements and self.use_trailing_solidus:
|
||||||
|
if self.space_before_trailing_solidus:
|
||||||
|
attributes.append(" /")
|
||||||
|
else:
|
||||||
|
attributes.append("/")
|
||||||
|
if encoding:
|
||||||
|
yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
|
||||||
|
else:
|
||||||
|
yield u"<%s%s>" % (name, u"".join(attributes))
|
||||||
|
|
||||||
|
elif type == "EndTag":
|
||||||
|
name = token["name"]
|
||||||
|
if name in rcdataElements:
|
||||||
|
in_cdata = False
|
||||||
|
elif in_cdata:
|
||||||
|
self.serializeError(_("Unexpected child element of a CDATA element"))
|
||||||
|
end_tag = u"</%s>" % name
|
||||||
|
if encoding:
|
||||||
|
end_tag = end_tag.encode(encoding, "strict")
|
||||||
|
yield end_tag
|
||||||
|
|
||||||
|
elif type == "Comment":
|
||||||
|
data = token["data"]
|
||||||
|
if data.find("--") >= 0:
|
||||||
|
self.serializeError(_("Comment contains --"))
|
||||||
|
comment = u"<!--%s-->" % token["data"]
|
||||||
|
if encoding:
|
||||||
|
comment = comment.encode(encoding, unicode_encode_errors)
|
||||||
|
yield comment
|
||||||
|
|
||||||
|
else:
|
||||||
|
self.serializeError(token["data"])
|
||||||
|
|
||||||
|
def render(self, treewalker, encoding=None):
|
||||||
|
if encoding:
|
||||||
|
return "".join(list(self.serialize(treewalker, encoding)))
|
||||||
|
else:
|
||||||
|
return u"".join(list(self.serialize(treewalker)))
|
||||||
|
|
||||||
|
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||||
|
# XXX The idea is to make data mandatory.
|
||||||
|
self.errors.append(data)
|
||||||
|
if self.strict:
|
||||||
|
raise SerializeError
|
||||||
|
|
||||||
|
def SerializeError(Exception):
|
||||||
|
"""Error in serialized tree"""
|
||||||
|
pass
|
9
planet/vendor/html5lib/serializer/xhtmlserializer.py
vendored
Normal file
9
planet/vendor/html5lib/serializer/xhtmlserializer.py
vendored
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
from htmlserializer import HTMLSerializer
|
||||||
|
|
||||||
|
class XHTMLSerializer(HTMLSerializer):
|
||||||
|
quote_attr_values = True
|
||||||
|
minimize_boolean_attributes = False
|
||||||
|
use_trailing_solidus = True
|
||||||
|
escape_lt_in_attrs = True
|
||||||
|
omit_optional_tags = False
|
||||||
|
escape_rcdata = True
|
@ -9,7 +9,7 @@ _ = gettext.gettext
|
|||||||
|
|
||||||
from constants import contentModelFlags, spaceCharacters
|
from constants import contentModelFlags, spaceCharacters
|
||||||
from constants import entitiesWindows1252, entities
|
from constants import entitiesWindows1252, entities
|
||||||
from constants import asciiLowercase, asciiLetters
|
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
|
||||||
from constants import digits, hexDigits, EOF
|
from constants import digits, hexDigits, EOF
|
||||||
|
|
||||||
from inputstream import HTMLInputStream
|
from inputstream import HTMLInputStream
|
||||||
@ -50,18 +50,30 @@ class HTMLTokenizer(object):
|
|||||||
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
|
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
|
||||||
"bogusComment":self.bogusCommentState,
|
"bogusComment":self.bogusCommentState,
|
||||||
"markupDeclarationOpen":self.markupDeclarationOpenState,
|
"markupDeclarationOpen":self.markupDeclarationOpenState,
|
||||||
|
"commentStart":self.commentStartState,
|
||||||
|
"commentStartDash":self.commentStartDashState,
|
||||||
"comment":self.commentState,
|
"comment":self.commentState,
|
||||||
"commentDash":self.commentDashState,
|
"commentEndDash":self.commentEndDashState,
|
||||||
"commentEnd":self.commentEndState,
|
"commentEnd":self.commentEndState,
|
||||||
"doctype":self.doctypeState,
|
"doctype":self.doctypeState,
|
||||||
"beforeDoctypeName":self.beforeDoctypeNameState,
|
"beforeDoctypeName":self.beforeDoctypeNameState,
|
||||||
"doctypeName":self.doctypeNameState,
|
"doctypeName":self.doctypeNameState,
|
||||||
"afterDoctypeName":self.afterDoctypeNameState,
|
"afterDoctypeName":self.afterDoctypeNameState,
|
||||||
|
"beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
|
||||||
|
"doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
|
||||||
|
"doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
|
||||||
|
"afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
|
||||||
|
"beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
|
||||||
|
"doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
|
||||||
|
"doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
|
||||||
|
"afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
|
||||||
"bogusDoctype":self.bogusDoctypeState
|
"bogusDoctype":self.bogusDoctypeState
|
||||||
}
|
}
|
||||||
|
|
||||||
# Setup the initial tokenizer state
|
# Setup the initial tokenizer state
|
||||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||||
|
self.escapeFlag = False
|
||||||
|
self.lastFourChars = []
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
|
|
||||||
# The current token being created
|
# The current token being created
|
||||||
@ -77,7 +89,6 @@ class HTMLTokenizer(object):
|
|||||||
to return we yield the token which pauses processing until the next token
|
to return we yield the token which pauses processing until the next token
|
||||||
is requested.
|
is requested.
|
||||||
"""
|
"""
|
||||||
self.stream.reset()
|
|
||||||
self.tokenQueue = []
|
self.tokenQueue = []
|
||||||
# Start processing. When EOF is reached self.state will return False
|
# Start processing. When EOF is reached self.state will return False
|
||||||
# instead of True and the loop will terminate.
|
# instead of True and the loop will terminate.
|
||||||
@ -102,7 +113,7 @@ class HTMLTokenizer(object):
|
|||||||
|
|
||||||
# The character we just consumed need to be put back on the stack so it
|
# The character we just consumed need to be put back on the stack so it
|
||||||
# doesn't get lost...
|
# doesn't get lost...
|
||||||
self.stream.queue.append(data)
|
self.stream.unget(data)
|
||||||
|
|
||||||
def consumeNumberEntity(self, isHex):
|
def consumeNumberEntity(self, isHex):
|
||||||
"""This function returns either U+FFFD or the character based on the
|
"""This function returns either U+FFFD or the character based on the
|
||||||
@ -132,70 +143,71 @@ class HTMLTokenizer(object):
|
|||||||
# Convert the set of characters consumed to an int.
|
# Convert the set of characters consumed to an int.
|
||||||
charAsInt = int("".join(charStack), radix)
|
charAsInt = int("".join(charStack), radix)
|
||||||
|
|
||||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
if charAsInt == 13:
|
||||||
# smaller) we need to do the "windows trick".
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
if 127 < charAsInt < 160:
|
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||||
#XXX - removed parse error from windows 1252 entity for now
|
charAsInt = 10
|
||||||
#we may want to reenable this later
|
elif 127 < charAsInt < 160:
|
||||||
#self.tokenQueue.append({"type": "ParseError", "data":
|
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||||
# _("Entity used with illegal number (windows-1252 reference).")})
|
# and smaller) we need to do the "windows trick".
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Entity used with illegal number (windows-1252 reference).")})
|
||||||
|
|
||||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||||
|
|
||||||
# 0 is not a good number.
|
# 0 is not a good number, neither are illegal Unicode code points.
|
||||||
if charAsInt == 0:
|
if charAsInt > 0 and charAsInt <= 1114111:
|
||||||
charAsInt = 65533
|
try:
|
||||||
|
# XXX We should have a separate function that does "int" to
|
||||||
try:
|
# "unicodestring" conversion since this doesn't always work
|
||||||
# XXX We should have a separate function that does "int" to
|
# according to hsivonen. Also, unichr has a limitation of 65535
|
||||||
# "unicodestring" conversion since this doesn't always work
|
char = unichr(charAsInt)
|
||||||
# according to hsivonen. Also, unichr has a limitation of 65535
|
except:
|
||||||
char = unichr(charAsInt)
|
try:
|
||||||
except:
|
char = eval("u'\\U%08x'" % charAsInt)
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
except:
|
||||||
_("Numeric entity couldn't be converted to character.")})
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Numeric entity couldn't be converted to character.")})
|
||||||
|
|
||||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||||
# invoke parseError on parser.
|
# invoke parseError on parser.
|
||||||
if c != u";":
|
if c != u";":
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Numeric entity didn't end with ';'.")})
|
_("Numeric entity didn't end with ';'.")})
|
||||||
self.stream.queue.append(c)
|
self.stream.unget(c)
|
||||||
|
|
||||||
return char
|
return char
|
||||||
|
|
||||||
def consumeEntity(self):
|
def consumeEntity(self, fromAttribute=False):
|
||||||
char = None
|
char = None
|
||||||
charStack = [self.stream.char()]
|
charStack = [self.stream.char()]
|
||||||
if charStack[0] == u"#":
|
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
|
||||||
|
self.stream.unget(charStack)
|
||||||
|
elif charStack[0] == u"#":
|
||||||
# We might have a number entity here.
|
# We might have a number entity here.
|
||||||
charStack.extend([self.stream.char(), self.stream.char()])
|
charStack.extend([self.stream.char(), self.stream.char()])
|
||||||
if EOF in charStack:
|
if EOF in charStack:
|
||||||
# If we reach the end of the file put everything up to EOF
|
# If we reach the end of the file put everything up to EOF
|
||||||
# back in the queue
|
# back in the queue
|
||||||
charStack = charStack[:charStack.index(EOF)]
|
charStack = charStack[:charStack.index(EOF)]
|
||||||
self.stream.queue.extend(charStack)
|
self.stream.unget(charStack)
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Numeric entity expected. Got end of file instead.")})
|
_("Numeric entity expected. Got end of file instead.")})
|
||||||
else:
|
else:
|
||||||
if charStack[1].lower() == u"x" \
|
if charStack[1].lower() == u"x" \
|
||||||
and charStack[2] in hexDigits:
|
and charStack[2] in hexDigits:
|
||||||
# Hexadecimal entity detected.
|
# Hexadecimal entity detected.
|
||||||
self.stream.queue.append(charStack[2])
|
self.stream.unget(charStack[2])
|
||||||
char = self.consumeNumberEntity(True)
|
char = self.consumeNumberEntity(True)
|
||||||
elif charStack[1] in digits:
|
elif charStack[1] in digits:
|
||||||
# Decimal entity detected.
|
# Decimal entity detected.
|
||||||
self.stream.queue.extend(charStack[1:])
|
self.stream.unget(charStack[1:])
|
||||||
char = self.consumeNumberEntity(False)
|
char = self.consumeNumberEntity(False)
|
||||||
else:
|
else:
|
||||||
# No number entity detected.
|
# No number entity detected.
|
||||||
self.stream.queue.extend(charStack)
|
self.stream.unget(charStack)
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Numeric entity expected but none found.")})
|
_("Numeric entity expected but none found.")})
|
||||||
# Break out if we reach the end of the file
|
|
||||||
elif charStack[0] == EOF:
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Entity expected. Got end of file instead.")})
|
|
||||||
else:
|
else:
|
||||||
# At this point in the process might have named entity. Entities
|
# At this point in the process might have named entity. Entities
|
||||||
# are stored in the global variable "entities".
|
# are stored in the global variable "entities".
|
||||||
@ -216,7 +228,8 @@ class HTMLTokenizer(object):
|
|||||||
# that may match an entity
|
# that may match an entity
|
||||||
entityName = None
|
entityName = None
|
||||||
|
|
||||||
# Try to find the longest entity the string will match
|
# Try to find the longest entity the string will match to take care
|
||||||
|
# of ¬i for instance.
|
||||||
for entityLength in xrange(len(charStack)-1,1,-1):
|
for entityLength in xrange(len(charStack)-1,1,-1):
|
||||||
possibleEntityName = "".join(charStack[:entityLength])
|
possibleEntityName = "".join(charStack[:entityLength])
|
||||||
if possibleEntityName in entities:
|
if possibleEntityName in entities:
|
||||||
@ -224,24 +237,26 @@ class HTMLTokenizer(object):
|
|||||||
break
|
break
|
||||||
|
|
||||||
if entityName is not None:
|
if entityName is not None:
|
||||||
char = entities[entityName]
|
if entityName[-1] != ";":
|
||||||
|
|
||||||
# Check whether or not the last character returned can be
|
|
||||||
# discarded or needs to be put back.
|
|
||||||
if not charStack[-1] == ";":
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Named entity didn't end with ';'.")})
|
_("Named entity didn't end with ';'.")})
|
||||||
self.stream.queue.extend(charStack[entityLength:])
|
if entityName[-1] != ";" and fromAttribute and \
|
||||||
|
(charStack[entityLength] in asciiLetters
|
||||||
|
or charStack[entityLength] in digits):
|
||||||
|
self.stream.unget(charStack)
|
||||||
|
else:
|
||||||
|
char = entities[entityName]
|
||||||
|
self.stream.unget(charStack[entityLength:])
|
||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Named entity expected. Got none.")})
|
_("Named entity expected. Got none.")})
|
||||||
self.stream.queue.extend(charStack)
|
self.stream.unget(charStack)
|
||||||
return char
|
return char
|
||||||
|
|
||||||
def processEntityInAttribute(self):
|
def processEntityInAttribute(self):
|
||||||
"""This method replaces the need for "entityInAttributeValueState".
|
"""This method replaces the need for "entityInAttributeValueState".
|
||||||
"""
|
"""
|
||||||
entity = self.consumeEntity()
|
entity = self.consumeEntity(True)
|
||||||
if entity:
|
if entity:
|
||||||
self.currentToken["data"][-1][1] += entity
|
self.currentToken["data"][-1][1] += entity
|
||||||
else:
|
else:
|
||||||
@ -266,12 +281,30 @@ class HTMLTokenizer(object):
|
|||||||
|
|
||||||
def dataState(self):
|
def dataState(self):
|
||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
if data == u"&" and self.contentModelFlag in\
|
if self.contentModelFlag in\
|
||||||
|
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
|
||||||
|
if len(self.lastFourChars) == 4:
|
||||||
|
self.lastFourChars.pop(0)
|
||||||
|
self.lastFourChars.append(data)
|
||||||
|
if data == "&" and self.contentModelFlag in\
|
||||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
|
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
|
||||||
self.state = self.states["entityData"]
|
self.state = self.states["entityData"]
|
||||||
elif data == u"<" and self.contentModelFlag !=\
|
elif data == "-" and self.contentModelFlag in\
|
||||||
contentModelFlags["PLAINTEXT"]:
|
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||||
|
self.escapeFlag == False and\
|
||||||
|
"".join(self.lastFourChars) == "<!--":
|
||||||
|
self.escapeFlag = True
|
||||||
|
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||||
|
elif data == "<" and (self.contentModelFlag ==\
|
||||||
|
contentModelFlags["PCDATA"] or (self.contentModelFlag in
|
||||||
|
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||||
|
self.escapeFlag == False)):
|
||||||
self.state = self.states["tagOpen"]
|
self.state = self.states["tagOpen"]
|
||||||
|
elif data == ">" and self.contentModelFlag in\
|
||||||
|
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||||
|
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
|
||||||
|
self.escapeFlag = False
|
||||||
|
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
# Tokenization ends.
|
# Tokenization ends.
|
||||||
return False
|
return False
|
||||||
@ -285,7 +318,7 @@ class HTMLTokenizer(object):
|
|||||||
data + self.stream.charsUntil(spaceCharacters, True)})
|
data + self.stream.charsUntil(spaceCharacters, True)})
|
||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "Characters", "data":
|
self.tokenQueue.append({"type": "Characters", "data":
|
||||||
data + self.stream.charsUntil((u"&", u"<"))})
|
data + self.stream.charsUntil(("&", "<", ">", "-"))})
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def entityDataState(self):
|
def entityDataState(self):
|
||||||
@ -321,14 +354,14 @@ class HTMLTokenizer(object):
|
|||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Expected tag name. Got '?' instead (HTML doesn't "
|
_("Expected tag name. Got '?' instead (HTML doesn't "
|
||||||
"support processing instructions).")})
|
"support processing instructions).")})
|
||||||
self.stream.queue.append(data)
|
self.stream.unget(data)
|
||||||
self.state = self.states["bogusComment"]
|
self.state = self.states["bogusComment"]
|
||||||
else:
|
else:
|
||||||
# XXX
|
# XXX
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Expected tag name. Got something else instead")})
|
_("Expected tag name. Got something else instead")})
|
||||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||||
self.stream.queue.append(data)
|
self.stream.unget(data)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
else:
|
else:
|
||||||
# We know the content model flag is set to either RCDATA or CDATA
|
# We know the content model flag is set to either RCDATA or CDATA
|
||||||
@ -338,7 +371,7 @@ class HTMLTokenizer(object):
|
|||||||
self.state = self.states["closeTagOpen"]
|
self.state = self.states["closeTagOpen"]
|
||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||||
self.stream.queue.insert(0, data)
|
self.stream.unget(data)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -361,7 +394,7 @@ class HTMLTokenizer(object):
|
|||||||
|
|
||||||
# Since this is just for checking. We put the characters back on
|
# Since this is just for checking. We put the characters back on
|
||||||
# the stack.
|
# the stack.
|
||||||
self.stream.queue.extend(charStack)
|
self.stream.unget(charStack)
|
||||||
|
|
||||||
if self.currentToken \
|
if self.currentToken \
|
||||||
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
||||||
@ -372,8 +405,6 @@ class HTMLTokenizer(object):
|
|||||||
# emitting the end tag token.
|
# emitting the end tag token.
|
||||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Expected closing tag after seeing '</'. None found.")})
|
|
||||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
|
|
||||||
@ -381,27 +412,25 @@ class HTMLTokenizer(object):
|
|||||||
# method to be walked through.
|
# method to be walked through.
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if self.contentModelFlag == contentModelFlags["PCDATA"]:
|
data = self.stream.char()
|
||||||
data = self.stream.char()
|
if data in asciiLetters:
|
||||||
if data in asciiLetters:
|
self.currentToken = {"type":"EndTag", "name":data, "data":[]}
|
||||||
self.currentToken =\
|
self.state = self.states["tagName"]
|
||||||
{"type": "EndTag", "name": data, "data": []}
|
elif data == u">":
|
||||||
self.state = self.states["tagName"]
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
elif data == u">":
|
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.state = self.states["data"]
|
||||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
elif data == EOF:
|
||||||
self.state = self.states["data"]
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
elif data == EOF:
|
_("Expected closing tag. Unexpected end of file.")})
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||||
_("Expected closing tag. Unexpected end of file.")})
|
self.state = self.states["data"]
|
||||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
else:
|
||||||
self.state = self.states["data"]
|
# XXX data can be _'_...
|
||||||
else:
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
# XXX data can be _'_...
|
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.stream.unget(data)
|
||||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
self.state = self.states["bogusComment"]
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.state = self.states["bogusComment"]
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def tagNameState(self):
|
def tagNameState(self):
|
||||||
@ -413,11 +442,6 @@ class HTMLTokenizer(object):
|
|||||||
self.stream.charsUntil(asciiLetters, True)
|
self.stream.charsUntil(asciiLetters, True)
|
||||||
elif data == u">":
|
elif data == u">":
|
||||||
self.emitCurrentToken()
|
self.emitCurrentToken()
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character when getting the tag name.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in the tag name.")})
|
_("Unexpected end of file in the tag name.")})
|
||||||
@ -440,11 +464,6 @@ class HTMLTokenizer(object):
|
|||||||
self.emitCurrentToken()
|
self.emitCurrentToken()
|
||||||
elif data == u"/":
|
elif data == u"/":
|
||||||
self.processSolidusInTag()
|
self.processSolidusInTag()
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character. Expected attribute name instead.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file. Expected attribute name instead.")})
|
_("Unexpected end of file. Expected attribute name instead.")})
|
||||||
@ -473,12 +492,6 @@ class HTMLTokenizer(object):
|
|||||||
elif data == u"/":
|
elif data == u"/":
|
||||||
self.processSolidusInTag()
|
self.processSolidusInTag()
|
||||||
self.state = self.states["beforeAttributeName"]
|
self.state = self.states["beforeAttributeName"]
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character in attribute name.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
leavingThisState = False
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in attribute name.")})
|
_("Unexpected end of file in attribute name.")})
|
||||||
@ -515,11 +528,6 @@ class HTMLTokenizer(object):
|
|||||||
elif data == u"/":
|
elif data == u"/":
|
||||||
self.processSolidusInTag()
|
self.processSolidusInTag()
|
||||||
self.state = self.states["beforeAttributeName"]
|
self.state = self.states["beforeAttributeName"]
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character. Expected = or end of tag.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file. Expected = or end of tag.")})
|
_("Unexpected end of file. Expected = or end of tag.")})
|
||||||
@ -537,16 +545,11 @@ class HTMLTokenizer(object):
|
|||||||
self.state = self.states["attributeValueDoubleQuoted"]
|
self.state = self.states["attributeValueDoubleQuoted"]
|
||||||
elif data == u"&":
|
elif data == u"&":
|
||||||
self.state = self.states["attributeValueUnQuoted"]
|
self.state = self.states["attributeValueUnQuoted"]
|
||||||
self.stream.queue.append(data);
|
self.stream.unget(data);
|
||||||
elif data == u"'":
|
elif data == u"'":
|
||||||
self.state = self.states["attributeValueSingleQuoted"]
|
self.state = self.states["attributeValueSingleQuoted"]
|
||||||
elif data == u">":
|
elif data == u">":
|
||||||
self.emitCurrentToken()
|
self.emitCurrentToken()
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character. Expected attribute value.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file. Expected attribute value.")})
|
_("Unexpected end of file. Expected attribute value.")})
|
||||||
@ -594,11 +597,6 @@ class HTMLTokenizer(object):
|
|||||||
self.processEntityInAttribute()
|
self.processEntityInAttribute()
|
||||||
elif data == u">":
|
elif data == u">":
|
||||||
self.emitCurrentToken()
|
self.emitCurrentToken()
|
||||||
elif data == u"<":
|
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
|
||||||
_("Unexpected < character in attribute value.")})
|
|
||||||
self.emitCurrentToken()
|
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in attribute value.")})
|
_("Unexpected end of file in attribute value.")})
|
||||||
@ -625,27 +623,66 @@ class HTMLTokenizer(object):
|
|||||||
charStack = [self.stream.char(), self.stream.char()]
|
charStack = [self.stream.char(), self.stream.char()]
|
||||||
if charStack == [u"-", u"-"]:
|
if charStack == [u"-", u"-"]:
|
||||||
self.currentToken = {"type": "Comment", "data": ""}
|
self.currentToken = {"type": "Comment", "data": ""}
|
||||||
self.state = self.states["comment"]
|
self.state = self.states["commentStart"]
|
||||||
else:
|
else:
|
||||||
for x in xrange(5):
|
for x in xrange(5):
|
||||||
charStack.append(self.stream.char())
|
charStack.append(self.stream.char())
|
||||||
# Put in explicit EOF check
|
# Put in explicit EOF check
|
||||||
if (not EOF in charStack and
|
if (not EOF in charStack and
|
||||||
"".join(charStack).upper() == u"DOCTYPE"):
|
"".join(charStack).upper() == u"DOCTYPE"):
|
||||||
self.currentToken =\
|
self.currentToken = {"type":"Doctype", "name":"",
|
||||||
{"type": "Doctype", "name": "", "data": True}
|
"publicId":None, "systemId":None, "correct":True}
|
||||||
self.state = self.states["doctype"]
|
self.state = self.states["doctype"]
|
||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||||
self.stream.queue.extend(charStack)
|
self.stream.unget(charStack)
|
||||||
self.state = self.states["bogusComment"]
|
self.state = self.states["bogusComment"]
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def commentStartState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "-":
|
||||||
|
self.state = self.states["commentStartDash"]
|
||||||
|
elif data == ">":
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Incorrect comment.")})
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in comment.")})
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||||
|
self.state = self.states["comment"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
def commentStartDashState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "-":
|
||||||
|
self.state = self.states["commentEnd"]
|
||||||
|
elif data == ">":
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Incorrect comment.")})
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in comment.")})
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||||
|
self.state = self.states["comment"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def commentState(self):
|
def commentState(self):
|
||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
if data == u"-":
|
if data == u"-":
|
||||||
self.state = self.states["commentDash"]
|
self.state = self.states["commentEndDash"]
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in comment.")})
|
_("Unexpected end of file in comment.")})
|
||||||
@ -655,7 +692,7 @@ class HTMLTokenizer(object):
|
|||||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def commentDashState(self):
|
def commentEndDashState(self):
|
||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
if data == u"-":
|
if data == u"-":
|
||||||
self.state = self.states["commentEnd"]
|
self.state = self.states["commentEnd"]
|
||||||
@ -702,7 +739,7 @@ class HTMLTokenizer(object):
|
|||||||
else:
|
else:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("No space after literal string 'DOCTYPE'.")})
|
_("No space after literal string 'DOCTYPE'.")})
|
||||||
self.stream.queue.append(data)
|
self.stream.unget(data)
|
||||||
self.state = self.states["beforeDoctypeName"]
|
self.state = self.states["beforeDoctypeName"]
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -710,19 +747,16 @@ class HTMLTokenizer(object):
|
|||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
if data in spaceCharacters:
|
if data in spaceCharacters:
|
||||||
pass
|
pass
|
||||||
elif data in asciiLowercase:
|
|
||||||
self.currentToken["name"] = data.upper()
|
|
||||||
self.state = self.states["doctypeName"]
|
|
||||||
elif data == u">":
|
elif data == u">":
|
||||||
# Character needs to be consumed per the specification so don't
|
|
||||||
# invoke emitCurrentTokenWithParseError with "data" as argument.
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
else:
|
else:
|
||||||
@ -732,30 +766,19 @@ class HTMLTokenizer(object):
|
|||||||
|
|
||||||
def doctypeNameState(self):
|
def doctypeNameState(self):
|
||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
needsDoctypeCheck = False
|
|
||||||
if data in spaceCharacters:
|
if data in spaceCharacters:
|
||||||
self.state = self.states["afterDoctypeName"]
|
self.state = self.states["afterDoctypeName"]
|
||||||
needsDoctypeCheck = True
|
|
||||||
elif data == u">":
|
elif data == u">":
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in DOCTYPE name.")})
|
_("Unexpected end of file in DOCTYPE name.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
else:
|
else:
|
||||||
# We can't just uppercase everything that arrives here. For
|
|
||||||
# instance, non-ASCII characters.
|
|
||||||
if data in asciiLowercase:
|
|
||||||
data = data.upper()
|
|
||||||
self.currentToken["name"] += data
|
self.currentToken["name"] += data
|
||||||
needsDoctypeCheck = True
|
|
||||||
|
|
||||||
# After some iterations through this state it should eventually say
|
|
||||||
# "HTML". Otherwise there's an error.
|
|
||||||
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
|
|
||||||
self.currentToken["data"] = False
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def afterDoctypeNameState(self):
|
def afterDoctypeNameState(self):
|
||||||
@ -766,28 +789,194 @@ class HTMLTokenizer(object):
|
|||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
self.currentToken["data"] = True
|
self.currentToken["correct"] = False
|
||||||
# XXX EMIT
|
self.stream.unget(data)
|
||||||
self.stream.queue.append(data)
|
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in DOCTYPE.")})
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
else:
|
else:
|
||||||
|
charStack = [data]
|
||||||
|
for x in xrange(5):
|
||||||
|
charStack.append(self.stream.char())
|
||||||
|
if EOF not in charStack and\
|
||||||
|
"".join(charStack).translate(asciiUpper2Lower) == "public":
|
||||||
|
self.state = self.states["beforeDoctypePublicIdentifier"]
|
||||||
|
elif EOF not in charStack and\
|
||||||
|
"".join(charStack).translate(asciiUpper2Lower) == "system":
|
||||||
|
self.state = self.states["beforeDoctypeSystemIdentifier"]
|
||||||
|
else:
|
||||||
|
self.stream.unget(charStack)
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Expected space or '>'. Got '" + data + "'")})
|
||||||
|
self.state = self.states["bogusDoctype"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
def beforeDoctypePublicIdentifierState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data in spaceCharacters:
|
||||||
|
pass
|
||||||
|
elif data == "\"":
|
||||||
|
self.currentToken["publicId"] = ""
|
||||||
|
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
|
||||||
|
elif data == "'":
|
||||||
|
self.currentToken["publicId"] = ""
|
||||||
|
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
|
||||||
|
elif data == ">":
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Expected space or '>'. Got '" + data + "'")})
|
_("Unexpected end of DOCTYPE.")})
|
||||||
self.currentToken["data"] = True
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected character in DOCTYPE.")})
|
||||||
|
self.state = self.states["bogusDoctype"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
def doctypePublicIdentifierDoubleQuotedState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "\"":
|
||||||
|
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["publicId"] += data
|
||||||
|
return True
|
||||||
|
|
||||||
|
def doctypePublicIdentifierSingleQuotedState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "'":
|
||||||
|
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["publicId"] += data
|
||||||
|
return True
|
||||||
|
|
||||||
|
def afterDoctypePublicIdentifierState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data in spaceCharacters:
|
||||||
|
pass
|
||||||
|
elif data == "\"":
|
||||||
|
self.currentToken["systemId"] = ""
|
||||||
|
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||||
|
elif data == "'":
|
||||||
|
self.currentToken["systemId"] = ""
|
||||||
|
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||||
|
elif data == ">":
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected character in DOCTYPE.")})
|
||||||
|
self.state = self.states["bogusDoctype"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
def beforeDoctypeSystemIdentifierState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data in spaceCharacters:
|
||||||
|
pass
|
||||||
|
elif data == "\"":
|
||||||
|
self.currentToken["systemId"] = ""
|
||||||
|
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||||
|
elif data == "'":
|
||||||
|
self.currentToken["systemId"] = ""
|
||||||
|
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||||
|
elif data == ">":
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected character in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected character in DOCTYPE.")})
|
||||||
|
self.state = self.states["bogusDoctype"]
|
||||||
|
return True
|
||||||
|
|
||||||
|
def doctypeSystemIdentifierDoubleQuotedState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "\"":
|
||||||
|
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["systemId"] += data
|
||||||
|
return True
|
||||||
|
|
||||||
|
def doctypeSystemIdentifierSingleQuotedState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data == "'":
|
||||||
|
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.currentToken["systemId"] += data
|
||||||
|
return True
|
||||||
|
|
||||||
|
def afterDoctypeSystemIdentifierState(self):
|
||||||
|
data = self.stream.char()
|
||||||
|
if data in spaceCharacters:
|
||||||
|
pass
|
||||||
|
elif data == ">":
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
elif data == EOF:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected end of file in DOCTYPE.")})
|
||||||
|
self.currentToken["correct"] = False
|
||||||
|
self.tokenQueue.append(self.currentToken)
|
||||||
|
self.state = self.states["data"]
|
||||||
|
else:
|
||||||
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
|
_("Unexpected character in DOCTYPE.")})
|
||||||
self.state = self.states["bogusDoctype"]
|
self.state = self.states["bogusDoctype"]
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def bogusDoctypeState(self):
|
def bogusDoctypeState(self):
|
||||||
data = self.stream.char()
|
data = self.stream.char()
|
||||||
|
self.currentToken["correct"] = False
|
||||||
if data == u">":
|
if data == u">":
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
||||||
self.state = self.states["data"]
|
self.state = self.states["data"]
|
||||||
elif data == EOF:
|
elif data == EOF:
|
||||||
# XXX EMIT
|
# XXX EMIT
|
||||||
self.stream.queue.append(data)
|
self.stream.unget(data)
|
||||||
self.tokenQueue.append({"type": "ParseError", "data":
|
self.tokenQueue.append({"type": "ParseError", "data":
|
||||||
_("Unexpected end of file in bogus doctype.")})
|
_("Unexpected end of file in bogus doctype.")})
|
||||||
self.tokenQueue.append(self.currentToken)
|
self.tokenQueue.append(self.currentToken)
|
64
planet/vendor/html5lib/treebuilders/__init__.py
vendored
Executable file
64
planet/vendor/html5lib/treebuilders/__init__.py
vendored
Executable file
@ -0,0 +1,64 @@
|
|||||||
|
"""A collection of modules for building different kinds of tree from
|
||||||
|
HTML documents.
|
||||||
|
|
||||||
|
To create a treebuilder for a new type of tree, you need to do
|
||||||
|
implement several things:
|
||||||
|
|
||||||
|
1) A set of classes for various types of elements: Document, Doctype,
|
||||||
|
Comment, Element. These must implement the interface of
|
||||||
|
_base.treebuilders.Node (although comment nodes have a different
|
||||||
|
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||||
|
Textual content may also be implemented as another node type, or not, as
|
||||||
|
your tree implementation requires.
|
||||||
|
|
||||||
|
2) A treebuilder object (called TreeBuilder by convention) that
|
||||||
|
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||||
|
documentClass - the class to use for the bottommost node of a document
|
||||||
|
elementClass - the class to use for HTML Elements
|
||||||
|
commentClass - the class to use for comments
|
||||||
|
doctypeClass - the class to use for doctypes
|
||||||
|
It also has one required method:
|
||||||
|
getDocument - Returns the root node of the complete document tree
|
||||||
|
|
||||||
|
3) If you wish to run the unit tests, you must also create a
|
||||||
|
testSerializer method on your treebuilder which accepts a node and
|
||||||
|
returns a string containing Node and its children serialized according
|
||||||
|
to the format used in the unittests
|
||||||
|
|
||||||
|
The supplied simpletree module provides a python-only implementation
|
||||||
|
of a full treebuilder and is a useful reference for the semantics of
|
||||||
|
the various methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
treeBuilderCache = {}
|
||||||
|
|
||||||
|
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||||
|
"""Get a TreeBuilder class for various types of tree with built-in support
|
||||||
|
|
||||||
|
treeType - the name of the tree type required (case-insensitive). Supported
|
||||||
|
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||||
|
|
||||||
|
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||||
|
more pythonic idioms.
|
||||||
|
"dom" - The xml.dom.minidom DOM implementation
|
||||||
|
"etree" - A generic builder for tree implementations exposing an
|
||||||
|
elementtree-like interface (known to work with
|
||||||
|
ElementTree, cElementTree and lxml.etree).
|
||||||
|
"beautifulsoup" - Beautiful soup (if installed)
|
||||||
|
|
||||||
|
implementation - (Currently applies to the "etree" tree type only). A module
|
||||||
|
implementing the tree type e.g. xml.etree.ElementTree or
|
||||||
|
lxml.etree."""
|
||||||
|
|
||||||
|
treeType = treeType.lower()
|
||||||
|
if treeType not in treeBuilderCache:
|
||||||
|
if treeType in ("dom", "simpletree"):
|
||||||
|
mod = __import__(treeType, globals())
|
||||||
|
treeBuilderCache[treeType] = mod.TreeBuilder
|
||||||
|
elif treeType == "beautifulsoup":
|
||||||
|
import soup
|
||||||
|
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||||
|
elif treeType == "etree":
|
||||||
|
import etree
|
||||||
|
treeBuilderCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||||
|
return treeBuilderCache.get(treeType)
|
@ -1,4 +1,4 @@
|
|||||||
from constants import scopingElements, tableInsertModeElements
|
from html5lib.constants import scopingElements, tableInsertModeElements
|
||||||
try:
|
try:
|
||||||
frozenset
|
frozenset
|
||||||
except NameError:
|
except NameError:
|
8
planet/html5lib/treebuilders/dom.py → planet/vendor/html5lib/treebuilders/dom.py
vendored
Executable file → Normal file
8
planet/html5lib/treebuilders/dom.py → planet/vendor/html5lib/treebuilders/dom.py
vendored
Executable file → Normal file
@ -2,7 +2,7 @@ import _base
|
|||||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||||
import new
|
import new
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from constants import voidElements
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
import re
|
import re
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
@ -80,9 +80,11 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
setattr(self.dom, 'hilite', method)
|
setattr(self.dom, 'hilite', method)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def doctypeClass(self,name):
|
def insertDoctype(self, name):
|
||||||
domimpl = minidom.getDOMImplementation()
|
domimpl = minidom.getDOMImplementation()
|
||||||
return NodeBuilder(domimpl.createDocumentType(name,None,None))
|
doctype = domimpl.createDocumentType(name,None,None)
|
||||||
|
self.document.appendChild(NodeBuilder(doctype))
|
||||||
|
doctype.ownerDocument = self.dom
|
||||||
|
|
||||||
def elementClass(self, name):
|
def elementClass(self, name):
|
||||||
return NodeBuilder(self.dom.createElement(name))
|
return NodeBuilder(self.dom.createElement(name))
|
249
planet/vendor/html5lib/treebuilders/etree.py
vendored
Executable file
249
planet/vendor/html5lib/treebuilders/etree.py
vendored
Executable file
@ -0,0 +1,249 @@
|
|||||||
|
import _base
|
||||||
|
import new
|
||||||
|
import copy
|
||||||
|
|
||||||
|
moduleCache = {}
|
||||||
|
|
||||||
|
def getETreeModule(ElementTreeImplementation, fullTree=False):
|
||||||
|
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||||
|
if name in moduleCache:
|
||||||
|
return moduleCache[name]
|
||||||
|
else:
|
||||||
|
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
||||||
|
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
moduleCache[name] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
|
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||||
|
ElementTree = ElementTreeImplementation
|
||||||
|
class Element(_base.Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
self._element = ElementTree.Element(name)
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self._childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def _setName(self, name):
|
||||||
|
self._element.tag = name
|
||||||
|
|
||||||
|
def _getName(self):
|
||||||
|
return self._element.tag
|
||||||
|
|
||||||
|
name = property(_getName, _setName)
|
||||||
|
|
||||||
|
def _getAttributes(self):
|
||||||
|
return self._element.attrib
|
||||||
|
|
||||||
|
def _setAttributes(self, attributes):
|
||||||
|
#Delete existing attributes first
|
||||||
|
#XXX - there may be a better way to do this...
|
||||||
|
for key in self._element.attrib.keys():
|
||||||
|
del self._element.attrib[key]
|
||||||
|
for key, value in attributes.iteritems():
|
||||||
|
self._element.set(key, value)
|
||||||
|
|
||||||
|
attributes = property(_getAttributes, _setAttributes)
|
||||||
|
|
||||||
|
def _getChildNodes(self):
|
||||||
|
return self._childNodes
|
||||||
|
|
||||||
|
def _setChildNodes(self, value):
|
||||||
|
del self._element[:]
|
||||||
|
self._childNodes = []
|
||||||
|
for element in value:
|
||||||
|
self.insertChild(element)
|
||||||
|
|
||||||
|
childNodes = property(_getChildNodes, _setChildNodes)
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text"""
|
||||||
|
return bool(self._element.text or self._element.getchildren())
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self._childNodes.append(node)
|
||||||
|
self._element.append(node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self._element.getchildren().index(refNode._element)
|
||||||
|
self._element.insert(index, node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
self._element.remove(node._element)
|
||||||
|
node.parent=None
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
if not(len(self._element)):
|
||||||
|
if not self._element.text:
|
||||||
|
self._element.text = ""
|
||||||
|
self._element.text += data
|
||||||
|
elif insertBefore is None:
|
||||||
|
#Insert the text as the tail of the last child element
|
||||||
|
if not self._element[-1].tail:
|
||||||
|
self._element[-1].tail = ""
|
||||||
|
self._element[-1].tail += data
|
||||||
|
else:
|
||||||
|
#Insert the text before the specified node
|
||||||
|
children = self._element.getchildren()
|
||||||
|
index = children.index(insertBefore._element)
|
||||||
|
if index > 0:
|
||||||
|
if not self._element[index-1].tail:
|
||||||
|
self._element[index-1].tail = ""
|
||||||
|
self._element[index-1].tail += data
|
||||||
|
else:
|
||||||
|
if not self._element.text:
|
||||||
|
self._element.text = ""
|
||||||
|
self._element.text += data
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
element = Element(self.name)
|
||||||
|
for name, value in self.attributes.iteritems():
|
||||||
|
element.attributes[name] = value
|
||||||
|
return element
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
if newParent.childNodes:
|
||||||
|
newParent.childNodes[-1]._element.tail += self._element.text
|
||||||
|
else:
|
||||||
|
if not newParent._element.text:
|
||||||
|
newParent._element.text = ""
|
||||||
|
if self._element.text is not None:
|
||||||
|
newParent._element.text += self._element.text
|
||||||
|
self._element.text = ""
|
||||||
|
_base.Node.reparentChildren(self, newParent)
|
||||||
|
|
||||||
|
class Comment(Element):
|
||||||
|
def __init__(self, data):
|
||||||
|
#Use the superclass constructor to set all properties on the
|
||||||
|
#wrapper element
|
||||||
|
self._element = ElementTree.Comment(data)
|
||||||
|
self.parent = None
|
||||||
|
self._childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def _getData(self):
|
||||||
|
return self._element.text
|
||||||
|
|
||||||
|
def _setData(self, value):
|
||||||
|
self._element.text = value
|
||||||
|
|
||||||
|
data = property(_getData, _setData)
|
||||||
|
|
||||||
|
class DocumentType(Element):
|
||||||
|
def __init__(self, name):
|
||||||
|
Element.__init__(self, "<!DOCTYPE>")
|
||||||
|
self._element.text = name
|
||||||
|
|
||||||
|
class Document(Element):
|
||||||
|
def __init__(self):
|
||||||
|
Element.__init__(self, "<DOCUMENT_ROOT>")
|
||||||
|
|
||||||
|
class DocumentFragment(Element):
|
||||||
|
def __init__(self):
|
||||||
|
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if not(hasattr(element, "tag")):
|
||||||
|
element = element.getroot()
|
||||||
|
if element.tag == "<!DOCTYPE>":
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||||
|
elif element.tag == "<DOCUMENT_ROOT>":
|
||||||
|
rv.append("#document")
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
elif type(element.tag) == type(ElementTree.Comment):
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||||
|
if hasattr(element, "attrib"):
|
||||||
|
for name, value in element.attrib.iteritems():
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||||
|
indent += 2
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child, indent)
|
||||||
|
if element.tail:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
def tostring(element):
|
||||||
|
"""Serialize an element and its child nodes to a string"""
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element):
|
||||||
|
if type(element) == type(ElementTree.ElementTree):
|
||||||
|
element = element.getroot()
|
||||||
|
|
||||||
|
if element.tag == "<!DOCTYPE>":
|
||||||
|
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||||
|
elif element.tag == "<DOCUMENT_ROOT>":
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
elif type(element.tag) == type(ElementTree.Comment):
|
||||||
|
rv.append("<!--%s-->"%(element.text,))
|
||||||
|
else:
|
||||||
|
#This is assumed to be an ordinary element
|
||||||
|
if not element.attrib:
|
||||||
|
rv.append("<%s>"%(element.tag,))
|
||||||
|
else:
|
||||||
|
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||||
|
for name, value in element.attrib.iteritems()])
|
||||||
|
rv.append("<%s %s>"%(element.tag, attr))
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
rv.append("</%s>"%(element.tag,))
|
||||||
|
|
||||||
|
if element.tail:
|
||||||
|
rv.append(element.tail)
|
||||||
|
|
||||||
|
serializeElement(element)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "".join(rv)
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
documentClass = Document
|
||||||
|
doctypeClass = DocumentType
|
||||||
|
elementClass = Element
|
||||||
|
commentClass = Comment
|
||||||
|
fragmentClass = DocumentFragment
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
if fullTree:
|
||||||
|
return self.document._element
|
||||||
|
else:
|
||||||
|
return self.document._element.find("html")
|
||||||
|
|
||||||
|
def getFragment(self):
|
||||||
|
return _base.TreeBuilder.getFragment(self)._element
|
||||||
|
|
||||||
|
return locals()
|
@ -1,5 +1,5 @@
|
|||||||
import _base
|
import _base
|
||||||
from constants import voidElements
|
from html5lib.constants import voidElements
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
# Really crappy basic implementation of a DOM-core like thing
|
# Really crappy basic implementation of a DOM-core like thing
|
162
planet/vendor/html5lib/treebuilders/soup.py
vendored
Normal file
162
planet/vendor/html5lib/treebuilders/soup.py
vendored
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
|
||||||
|
import sys
|
||||||
|
import copy
|
||||||
|
|
||||||
|
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
class AttrList(object):
|
||||||
|
def __init__(self, element):
|
||||||
|
self.element = element
|
||||||
|
self.attrs = dict(self.element.attrs)
|
||||||
|
def __iter__(self):
|
||||||
|
return self.attrs.items().__iter__()
|
||||||
|
def __setitem__(self, name, value):
|
||||||
|
"set attr", name, value
|
||||||
|
self.element[name] = value
|
||||||
|
def items(self):
|
||||||
|
return self.attrs.items()
|
||||||
|
def keys(self):
|
||||||
|
return self.attrs.keys()
|
||||||
|
def __getitem__(self, name):
|
||||||
|
return self.attrs[name]
|
||||||
|
def __contains__(self, name):
|
||||||
|
return name in self.attrs.keys()
|
||||||
|
|
||||||
|
|
||||||
|
class Element(_base.Node):
|
||||||
|
def __init__(self, element, soup):
|
||||||
|
_base.Node.__init__(self, element.name)
|
||||||
|
self.element = element
|
||||||
|
self.soup=soup
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
if (node.element.__class__ == NavigableString and self.element.contents
|
||||||
|
and self.element.contents[-1].__class__ == NavigableString):
|
||||||
|
newNode = TextNode(NavigableString(
|
||||||
|
self.element.contents[-1]+node.element), self.soup)
|
||||||
|
self.element.contents[-1].extract()
|
||||||
|
self.appendChild(newNode)
|
||||||
|
else:
|
||||||
|
self.element.insert(len(self.element.contents), node.element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def getAttributes(self):
|
||||||
|
return AttrList(self.element)
|
||||||
|
|
||||||
|
def setAttributes(self, attributes):
|
||||||
|
if attributes:
|
||||||
|
for name, value in attributes.items():
|
||||||
|
self.element[name] = value
|
||||||
|
|
||||||
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
text = TextNode(NavigableString(data), self.soup)
|
||||||
|
if insertBefore:
|
||||||
|
self.insertBefore(text, insertBefore)
|
||||||
|
else:
|
||||||
|
self.appendChild(text)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self.element.contents.index(refNode.element)
|
||||||
|
if (node.element.__class__ == NavigableString and self.element.contents
|
||||||
|
and self.element.contents[index-1].__class__ == NavigableString):
|
||||||
|
newNode = TextNode(NavigableString(
|
||||||
|
self.element.contents[index-1]+node.element), self.soup)
|
||||||
|
self.element.contents[index-1].extract()
|
||||||
|
self.insertBefore(newNode, refNode)
|
||||||
|
else:
|
||||||
|
self.element.insert(index, node.element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
node.element.extract()
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
while self.element.contents:
|
||||||
|
child = self.element.contents[0]
|
||||||
|
child.extract()
|
||||||
|
if isinstance(child, Tag):
|
||||||
|
newParent.appendChild(Element(child, self.soup))
|
||||||
|
else:
|
||||||
|
newParent.appendChild(TextNode(child, self.soup))
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
node = Element(Tag(self.soup, self.element.name), self.soup)
|
||||||
|
for key,value in self.attributes:
|
||||||
|
node.attributes[key] = value
|
||||||
|
return node
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
return self.element.contents
|
||||||
|
|
||||||
|
class TextNode(Element):
|
||||||
|
def __init__(self, element, soup):
|
||||||
|
_base.Node.__init__(self, None)
|
||||||
|
self.element = element
|
||||||
|
self.soup=soup
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
def documentClass(self):
|
||||||
|
self.soup = BeautifulSoup("")
|
||||||
|
return Element(self.soup, self.soup)
|
||||||
|
|
||||||
|
def insertDoctype(self, name):
|
||||||
|
self.soup.insert(0, Declaration(name))
|
||||||
|
|
||||||
|
def elementClass(self, name):
|
||||||
|
return Element(Tag(self.soup, name), self.soup)
|
||||||
|
|
||||||
|
def commentClass(self, data):
|
||||||
|
return TextNode(Comment(data), self.soup)
|
||||||
|
|
||||||
|
def fragmentClass(self):
|
||||||
|
self.soup = BeautifulSoup("")
|
||||||
|
self.soup.name = "[document_fragment]"
|
||||||
|
return Element(self.soup, self.soup)
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self.soup.insert(len(self.soup.contents), node.element)
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.soup
|
||||||
|
|
||||||
|
def getFragment(self):
|
||||||
|
return _base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
rv = []
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if isinstance(element, Declaration):
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
|
||||||
|
elif isinstance(element, BeautifulSoup):
|
||||||
|
if element.name == "[document_fragment]":
|
||||||
|
rv.append("#document-fragment")
|
||||||
|
else:
|
||||||
|
rv.append("#document")
|
||||||
|
|
||||||
|
elif isinstance(element, Comment):
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
|
||||||
|
elif isinstance(element, unicode):
|
||||||
|
rv.append("|%s\"%s\"" %(' '*indent, element))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.name))
|
||||||
|
if element.attrs:
|
||||||
|
for name, value in element.attrs:
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
indent += 2
|
||||||
|
if hasattr(element, "contents"):
|
||||||
|
for child in element.contents:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
47
planet/vendor/html5lib/treewalkers/__init__.py
vendored
Normal file
47
planet/vendor/html5lib/treewalkers/__init__.py
vendored
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
"""A collection of modules for iterating through different kinds of
|
||||||
|
tree, generating tokens identical to those produced by the tokenizer
|
||||||
|
module.
|
||||||
|
|
||||||
|
To create a tree walker for a new type of tree, you need to do
|
||||||
|
implement a tree walker object (called TreeWalker by convention) that
|
||||||
|
implements a 'serialize' method taking a tree as sole argument and
|
||||||
|
returning an iterator generating tokens.
|
||||||
|
"""
|
||||||
|
|
||||||
|
treeWalkerCache = {}
|
||||||
|
|
||||||
|
def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||||
|
"""Get a TreeWalker class for various types of tree with built-in support
|
||||||
|
|
||||||
|
treeType - the name of the tree type required (case-insensitive). Supported
|
||||||
|
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||||
|
|
||||||
|
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||||
|
more pythonic idioms.
|
||||||
|
"dom" - The xml.dom.minidom DOM implementation
|
||||||
|
"pulldom" - The xml.dom.pulldom event stream
|
||||||
|
"etree" - A generic builder for tree implementations exposing an
|
||||||
|
elementtree-like interface (known to work with
|
||||||
|
ElementTree, cElementTree and lxml.etree).
|
||||||
|
"beautifulsoup" - Beautiful soup (if installed)
|
||||||
|
"genshi" - a Genshi stream
|
||||||
|
|
||||||
|
implementation - (Currently applies to the "etree" tree type only). A module
|
||||||
|
implementing the tree type e.g. xml.etree.ElementTree or
|
||||||
|
lxml.etree."""
|
||||||
|
|
||||||
|
treeType = treeType.lower()
|
||||||
|
if treeType not in treeWalkerCache:
|
||||||
|
if treeType in ("dom", "pulldom", "simpletree"):
|
||||||
|
mod = __import__(treeType, globals())
|
||||||
|
treeWalkerCache[treeType] = mod.TreeWalker
|
||||||
|
elif treeType == "genshi":
|
||||||
|
import genshistream
|
||||||
|
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||||
|
elif treeType == "beautifulsoup":
|
||||||
|
import soup
|
||||||
|
treeWalkerCache[treeType] = soup.TreeWalker
|
||||||
|
elif treeType == "etree":
|
||||||
|
import etree
|
||||||
|
treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||||
|
return treeWalkerCache.get(treeType)
|
151
planet/vendor/html5lib/treewalkers/_base.py
vendored
Normal file
151
planet/vendor/html5lib/treewalkers/_base.py
vendored
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements, spaceCharacters
|
||||||
|
spaceCharacters = u"".join(spaceCharacters)
|
||||||
|
|
||||||
|
class TreeWalker(object):
|
||||||
|
def __init__(self, tree):
|
||||||
|
self.tree = tree
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def error(self, msg):
|
||||||
|
return {"type": "SerializeError", "data": msg}
|
||||||
|
|
||||||
|
def normalizeAttrs(self, attrs):
|
||||||
|
if not attrs:
|
||||||
|
attrs = []
|
||||||
|
elif hasattr(attrs, 'items'):
|
||||||
|
attrs = attrs.items()
|
||||||
|
return [(unicode(name),unicode(value)) for name,value in attrs]
|
||||||
|
|
||||||
|
def emptyTag(self, name, attrs, hasChildren=False):
|
||||||
|
yield {"type": "EmptyTag", "name": unicode(name), \
|
||||||
|
"data": self.normalizeAttrs(attrs)}
|
||||||
|
if hasChildren:
|
||||||
|
yield self.error(_("Void element has children"))
|
||||||
|
|
||||||
|
def startTag(self, name, attrs):
|
||||||
|
return {"type": "StartTag", "name": unicode(name), \
|
||||||
|
"data": self.normalizeAttrs(attrs)}
|
||||||
|
|
||||||
|
def endTag(self, name):
|
||||||
|
return {"type": "EndTag", "name": unicode(name), "data": []}
|
||||||
|
|
||||||
|
def text(self, data):
|
||||||
|
data = unicode(data)
|
||||||
|
middle = data.lstrip(spaceCharacters)
|
||||||
|
left = data[:len(data)-len(middle)]
|
||||||
|
if left:
|
||||||
|
yield {"type": "SpaceCharacters", "data": left}
|
||||||
|
data = middle
|
||||||
|
middle = data.rstrip(spaceCharacters)
|
||||||
|
right = data[len(middle):]
|
||||||
|
if middle:
|
||||||
|
yield {"type": "Characters", "data": middle}
|
||||||
|
if right:
|
||||||
|
yield {"type": "SpaceCharacters", "data": right}
|
||||||
|
|
||||||
|
def comment(self, data):
|
||||||
|
return {"type": "Comment", "data": unicode(data)}
|
||||||
|
|
||||||
|
def doctype(self, name):
|
||||||
|
return {"type": "Doctype", "name": unicode(name), "data": name.upper() == "HTML"}
|
||||||
|
|
||||||
|
def unknown(self, nodeType):
|
||||||
|
return self.error(_("Unknown node type: ") + nodeType)
|
||||||
|
|
||||||
|
class RecursiveTreeWalker(TreeWalker):
|
||||||
|
def walkChildren(self, node):
|
||||||
|
raise NodeImplementedError
|
||||||
|
|
||||||
|
def element(self, node, name, attrs, hasChildren):
|
||||||
|
if name in voidElements:
|
||||||
|
for token in self.emptyTag(name, attrs, hasChildren):
|
||||||
|
yield token
|
||||||
|
else:
|
||||||
|
yield self.startTag(name, attrs)
|
||||||
|
if hasChildren:
|
||||||
|
for token in self.walkChildren(node):
|
||||||
|
yield token
|
||||||
|
yield self.endTag(name)
|
||||||
|
|
||||||
|
from xml.dom import Node
|
||||||
|
|
||||||
|
DOCUMENT = Node.DOCUMENT_NODE
|
||||||
|
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||||
|
TEXT = Node.TEXT_NODE
|
||||||
|
ELEMENT = Node.ELEMENT_NODE
|
||||||
|
COMMENT = Node.COMMENT_NODE
|
||||||
|
UNKNOWN = "<#UNKNOWN#>"
|
||||||
|
|
||||||
|
class NonRecursiveTreeWalker(TreeWalker):
|
||||||
|
def getNodeDetails(self, node):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def getFirstChild(self, node):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def getNextSibling(self, node):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def getParentNode(self, node):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
currentNode = self.tree
|
||||||
|
while currentNode is not None:
|
||||||
|
details = self.getNodeDetails(currentNode)
|
||||||
|
type, details = details[0], details[1:]
|
||||||
|
hasChildren = False
|
||||||
|
|
||||||
|
if type == DOCTYPE:
|
||||||
|
yield self.doctype(*details)
|
||||||
|
|
||||||
|
elif type == TEXT:
|
||||||
|
for token in self.text(*details):
|
||||||
|
yield token
|
||||||
|
|
||||||
|
elif type == ELEMENT:
|
||||||
|
name, attributes, hasChildren = details
|
||||||
|
if name in voidElements:
|
||||||
|
for token in self.emptyTag(name, attributes, hasChildren):
|
||||||
|
yield token
|
||||||
|
hasChildren = False
|
||||||
|
else:
|
||||||
|
yield self.startTag(name, attributes)
|
||||||
|
|
||||||
|
elif type == COMMENT:
|
||||||
|
yield self.comment(details[0])
|
||||||
|
|
||||||
|
elif type == DOCUMENT:
|
||||||
|
hasChildren = True
|
||||||
|
|
||||||
|
else:
|
||||||
|
yield self.unknown(details[0])
|
||||||
|
|
||||||
|
if hasChildren:
|
||||||
|
firstChild = self.getFirstChild(currentNode)
|
||||||
|
else:
|
||||||
|
firstChild = None
|
||||||
|
|
||||||
|
if firstChild is not None:
|
||||||
|
currentNode = firstChild
|
||||||
|
else:
|
||||||
|
while currentNode is not None:
|
||||||
|
details = self.getNodeDetails(currentNode)
|
||||||
|
type, details = details[0], details[1:]
|
||||||
|
if type == ELEMENT:
|
||||||
|
name, attributes, hasChildren = details
|
||||||
|
if name not in voidElements:
|
||||||
|
yield self.endTag(name)
|
||||||
|
nextSibling = self.getNextSibling(currentNode)
|
||||||
|
if nextSibling is not None:
|
||||||
|
currentNode = nextSibling
|
||||||
|
break
|
||||||
|
if self.tree is currentNode:
|
||||||
|
currentNode = None
|
||||||
|
else:
|
||||||
|
currentNode = self.getParentNode(currentNode)
|
37
planet/vendor/html5lib/treewalkers/dom.py
vendored
Normal file
37
planet/vendor/html5lib/treewalkers/dom.py
vendored
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from xml.dom import Node
|
||||||
|
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
|
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
|
def getNodeDetails(self, node):
|
||||||
|
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||||
|
return _base.DOCTYPE, node.nodeName
|
||||||
|
|
||||||
|
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||||
|
return _base.TEXT, node.nodeValue
|
||||||
|
|
||||||
|
elif node.nodeType == Node.ELEMENT_NODE:
|
||||||
|
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
|
||||||
|
|
||||||
|
elif node.nodeType == Node.COMMENT_NODE:
|
||||||
|
return _base.COMMENT, node.nodeValue
|
||||||
|
|
||||||
|
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
||||||
|
return (_base.DOCUMENT,)
|
||||||
|
|
||||||
|
else:
|
||||||
|
return _base.UNKNOWN, node.nodeType
|
||||||
|
|
||||||
|
def getFirstChild(self, node):
|
||||||
|
return node.firstChild
|
||||||
|
|
||||||
|
def getNextSibling(self, node):
|
||||||
|
return node.nextSibling
|
||||||
|
|
||||||
|
def getParentNode(self, node):
|
||||||
|
return node.parentNode
|
112
planet/vendor/html5lib/treewalkers/etree.py
vendored
Normal file
112
planet/vendor/html5lib/treewalkers/etree.py
vendored
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
import new
|
||||||
|
import copy
|
||||||
|
|
||||||
|
import _base
|
||||||
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
|
moduleCache = {}
|
||||||
|
|
||||||
|
def getETreeModule(ElementTreeImplementation):
|
||||||
|
name = "_" + ElementTreeImplementation.__name__+"builder"
|
||||||
|
if name in moduleCache:
|
||||||
|
return moduleCache[name]
|
||||||
|
else:
|
||||||
|
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
|
||||||
|
objs = getETreeBuilder(ElementTreeImplementation)
|
||||||
|
mod.__dict__.update(objs)
|
||||||
|
moduleCache[name] = mod
|
||||||
|
return mod
|
||||||
|
|
||||||
|
def getETreeBuilder(ElementTreeImplementation):
|
||||||
|
ElementTree = ElementTreeImplementation
|
||||||
|
|
||||||
|
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||||
|
"""Given the particular ElementTree representation, this implementation,
|
||||||
|
to avoid using recursion, returns "nodes" as tuples with the following
|
||||||
|
content:
|
||||||
|
|
||||||
|
1. An Element node serving as *context* (it cannot be called the parent
|
||||||
|
node due to the particular ``tail`` text nodes.
|
||||||
|
|
||||||
|
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
|
||||||
|
|
||||||
|
3. A list used as a stack of all ancestor *context nodes*. It is a
|
||||||
|
pair tuple whose first item is an Element and second item is a child
|
||||||
|
index.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def getNodeDetails(self, node):
|
||||||
|
if isinstance(node, tuple): # It might be the root Element
|
||||||
|
elt, key, parents = node
|
||||||
|
if key in ("text", "tail"):
|
||||||
|
return _base.TEXT, getattr(elt, key)
|
||||||
|
else:
|
||||||
|
node = elt[int(key)]
|
||||||
|
|
||||||
|
if not(hasattr(node, "tag")):
|
||||||
|
node = node.getroot()
|
||||||
|
|
||||||
|
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
|
||||||
|
return (_base.DOCUMENT,)
|
||||||
|
|
||||||
|
elif node.tag == "<!DOCTYPE>":
|
||||||
|
return _base.DOCTYPE, node.text
|
||||||
|
|
||||||
|
elif type(node.tag) == type(ElementTree.Comment):
|
||||||
|
return _base.COMMENT, node.text
|
||||||
|
|
||||||
|
else:
|
||||||
|
#This is assumed to be an ordinary element
|
||||||
|
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
|
||||||
|
|
||||||
|
def getFirstChild(self, node):
|
||||||
|
if isinstance(node, tuple): # It might be the root Element
|
||||||
|
elt, key, parents = node
|
||||||
|
assert key not in ("text", "tail"), "Text nodes have no children"
|
||||||
|
parents.append((elt, int(key)))
|
||||||
|
node = elt[int(key)]
|
||||||
|
else:
|
||||||
|
parents = []
|
||||||
|
|
||||||
|
assert len(node) or node.text, "Node has no children"
|
||||||
|
if node.text:
|
||||||
|
return (node, "text", parents)
|
||||||
|
else:
|
||||||
|
return (node, 0, parents)
|
||||||
|
|
||||||
|
def getNextSibling(self, node):
|
||||||
|
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
|
||||||
|
|
||||||
|
elt, key, parents = node
|
||||||
|
if key == "text":
|
||||||
|
key = -1
|
||||||
|
elif key == "tail":
|
||||||
|
elt, key = parents.pop()
|
||||||
|
else:
|
||||||
|
# Look for "tail" of the "revisited" node
|
||||||
|
child = elt[key]
|
||||||
|
if child.tail:
|
||||||
|
parents.append((elt, key))
|
||||||
|
return (child, "tail", parents)
|
||||||
|
|
||||||
|
# case where key were "text" or "tail" or elt[key] had a tail
|
||||||
|
key += 1
|
||||||
|
if len(elt) > key:
|
||||||
|
return (elt, key, parents)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getParentNode(self, node):
|
||||||
|
assert isinstance(node, tuple)
|
||||||
|
elt, key, parents = node
|
||||||
|
if parents:
|
||||||
|
elt, key = parents.pop()
|
||||||
|
return elt, key, parents
|
||||||
|
else:
|
||||||
|
# HACK: We could return ``elt`` but None will stop the algorithm the same way
|
||||||
|
return None
|
||||||
|
|
||||||
|
return locals()
|
67
planet/vendor/html5lib/treewalkers/genshistream.py
vendored
Normal file
67
planet/vendor/html5lib/treewalkers/genshistream.py
vendored
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
|
||||||
|
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||||
|
from genshi.output import NamespaceFlattener
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
|
class TreeWalker(_base.TreeWalker):
    """Walk a Genshi markup stream, emitting html5lib walker tokens.

    The stream is first run through ``NamespaceFlattener`` so XHTML
    elements arrive with bare (unprefixed) tag names.
    """

    def __iter__(self):
        # Count of currently-open START events.
        depth = 0
        # When set, suppress output until we climb back to this depth —
        # used to swallow the explicit END event of a void element that
        # was already emitted as an EmptyTag.
        ignore_until = None
        # Events are processed one step behind so each event can peek at
        # its successor (needed by tokens() for void-element handling).
        previous = None
        for event in NamespaceFlattener(prefixes={
              'http://www.w3.org/1999/xhtml': ''
            })(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                # NOTE(review): relies on Python 2's ``None <= int`` being
                # True (harmless re-assignment of None); would raise
                # TypeError on Python 3 — confirm target runtime.
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        # Flush the final buffered event; its successor is None.
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one Genshi *event* into html5lib tokens.

        *next* is the following event (or ``None``); it is consulted to
        decide whether a void element carries an explicit END event.
        """
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                # Third argument: True when no matching END immediately
                # follows, i.e. the void element improperly has children.
                for token in self.emptyTag(tag, list(attrib), \
                  not next or next[0] != END or next[1] != tag):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(data[0])

        # NOTE(review): DOCTYPE here is unreachable — already handled above.
        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
          START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
|
52
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
Normal file
52
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
|
||||||
|
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
from html5lib.constants import voidElements
|
||||||
|
|
||||||
|
class TreeWalker(_base.TreeWalker):
    """Walk an ``xml.dom.pulldom`` event stream, yielding html5lib tokens."""

    def __iter__(self):
        # When set, holds the DOM node of a void element whose
        # END_ELEMENT event must be swallowed (it was already emitted
        # as an EmptyTag token).
        ignore_until = None
        # Events are handled one step late so each can see its successor
        # (used to detect a void element serialized without END_ELEMENT).
        previous = None
        for event in self.tree:
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        # Flush the final buffered event (it has no successor).
        # NOTE(review): assumes a non-empty stream — ``previous`` would be
        # None here for an empty tree; confirm callers guarantee this.
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one pulldom *event* into html5lib tokens.

        *next* is the following event or ``None``, used to check whether
        a void element's END_ELEMENT immediately follows.
        """
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            if name in voidElements:
                # Third argument: True when something other than this
                # node's own END_ELEMENT comes next (malformed children).
                for token in self.emptyTag(name, \
                  node.attributes.items(), not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            if name not in voidElements:
                yield self.endTag(name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)
|
72
planet/vendor/html5lib/treewalkers/simpletree.py
vendored
Normal file
72
planet/vendor/html5lib/treewalkers/simpletree.py
vendored
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Classify *node*, returning the (type, data...) tuple the
        non-recursive walker base expects."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name

        elif node.type == 4: # TextNode
            return _base.TEXT, node.value

        elif node.type == 5: # Element
            return _base.ELEMENT, node.name, \
                node.attributes.items(), node.hasContent()

        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data

        else:
            # BUG FIX: was ``_node.UNKNOWN`` — ``_node`` is undefined and
            # raised NameError whenever an unrecognized node type appeared.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        """Descend to child index 0, pushing the current frame onto the
        ancestor stack."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        """Move to the next child of the same parent, or ``None`` when
        the parent's children are exhausted."""
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        """Pop back to the parent frame, or ``None`` at the root."""
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the
            # algorithm the same way
            return None
|
36
planet/vendor/html5lib/treewalkers/soup.py
vendored
Normal file
36
planet/vendor/html5lib/treewalkers/soup.py
vendored
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Tree walker over BeautifulSoup parse trees.

    Soup nodes already know their own siblings and parent, so the soup
    objects themselves serve as walker nodes — no index bookkeeping.
    """

    def getNodeDetails(self, node):
        # NOTE: Declaration and Comment subclass NavigableString (which
        # subclasses unicode), so the order of these checks is significant.
        if isinstance(node, BeautifulSoup):
            # The soup object plays Document / DocumentFragment.
            return (_base.DOCUMENT,)
        if isinstance(node, Declaration): # DocumentType
            #Slice needed to remove markup added during unicode conversion
            text = unicode(node.string)
            return _base.DOCTYPE, text[2:-1]
        if isinstance(node, Comment):
            text = unicode(node.string)
            return _base.COMMENT, text[4:-3]
        if isinstance(node, unicode): # TextNode
            return _base.TEXT, node
        if isinstance(node, Tag): # Element
            return (_base.ELEMENT, node.name,
                    dict(node.attrs).items(), node.contents)
        return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent
Loading…
x
Reference in New Issue
Block a user