Sync with trunk.
commit da56f78d70

README (2 lines changed)

@@ -9,7 +9,7 @@ also actively being maintained.
 It uses Mark Pilgrim's Universal Feed Parser to read from CDF, RDF, RSS and
 Atom feeds; Leonard Richardson's Beautiful Soup to correct markup issues;
-and either Tomas Styblo's templating engine Daniel Viellard's implementation
+and either Tomas Styblo's templating engine or Daniel Viellard's implementation
 of XSLT to output static files in any format you can dream up.
 
 To get started, check out the documentation in the docs directory. If you have
@@ -139,9 +139,10 @@ you are free to include as few or as many parameters as you like, most of
 the predefined themes presume that at least <code>name</code> is defined.</p>
 <p>The <code>content_type</code> parameter can be defined to indicate that
 this subscription is a <em>reading list</em>, i.e., is an external list
-of subscriptions. At the moment, two formats of reading lists are supported:
-<code>opml</code> and <code>foaf</code>. In the future, support for formats
-like <code>xoxo</code> could be added.</p>
+of subscriptions. At the moment, four formats of reading lists are supported:
+<code>opml</code>, <code>foaf</code>, <code>csv</code>, and
+<code>config</code>. In the future,
+support for formats like <code>xoxo</code> could be added.</p>
 <p><a href="normalization.html#overrides">Normalization overrides</a> can
 also be defined here.</p>
@@ -84,8 +84,8 @@ then the output stream is
 through the specified filter and the output is placed into the named file; the
 other unmodified branch continues onto the next filter, if any.
 One use case for this function is to use
-<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
-an HTML output stream from one source.</li>
+<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
+and an HTML output stream from one source.</li>
 
 <li>Templates written using htmltmpl or django currently only have access to a
 fixed set of fields, whereas XSLT and genshi templates have access to
@@ -80,8 +80,8 @@ that can be used to help.</p>
 <ul>
 <li><code>ignore_in_feed</code> allows you to list any number of elements
 or attributes which are to be ignored in feeds. This is often handy in the
-case of feeds where the <code>id</code>, <code>updated</code> or
-<code>xml:lang</code> values can't be trusted.</li>
+case of feeds where the <code>author</code>, <code>id</code>,
+<code>updated</code> or <code>xml:lang</code> values can't be trusted.</li>
 <li><code>title_type</code>, <code>summary_type</code>,
 <code>content_type</code> allow you to override the
 <a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>
@@ -31,13 +31,18 @@ activity_threshold = 90
 # filters to be run
 filters = excerpt.py
 
+bill_of_materials:
+  .htaccess
+  favicon.ico
+  robots.txt
+
 # filter parameters
 [excerpt.py]
 omit = img p br
 width = 500
 
 # add memes to output
-[index.html.tmpl]
+[index.html.xslt]
 filters = mememe.plugin
 
 [mememe.plugin]
@@ -4,6 +4,7 @@ Generate an excerpt from either the summary or a content of an entry.
 Parameters:
   width:  maximum number of characters in the excerpt.  Default: 500
   omit:   whitespace delimited list of html tags to remove.  Default: none
+  target: name of element created.  Default: planet:excerpt
 
 Notes:
  * if 'img' is in the list of tags to be omitted <img> tags are replaced with
@@ -23,6 +24,7 @@ args = dict(zip([name.lstrip('-') for name in sys.argv[1::2]], sys.argv[2::2]))
 
 wrapper = textwrap.TextWrapper(width=int(args.get('width','500')))
 omit = args.get('omit', '').split()
+target = args.get('target', 'planet:excerpt')
 
 class copy:
   """ recursively copy a source to a target, up to a given width """
@@ -94,10 +96,14 @@ if not source:
 
 # if present, recursively copy it to a planet:excerpt element
 if source:
-  dom.documentElement.setAttribute('xmlns:planet', planetNS)
-  target = dom.createElementNS(planetNS, 'planet:excerpt')
-  source[0].parentNode.appendChild(target)
-  copy(dom, source[0], target)
+  if target.startswith('planet:'):
+    dom.documentElement.setAttribute('xmlns:planet', planetNS)
+  if target.startswith('atom:'): target = target.split(':',1)[1]
+  excerpt = dom.createElementNS(planetNS, target)
+  source[0].parentNode.appendChild(excerpt)
+  copy(dom, source[0], excerpt)
+  if source[0].nodeName == excerpt.nodeName:
+    source[0].parentNode.removeChild(source[0])
 
 # print out results
 print dom.toxml('utf-8')
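The excerpt filter reads its parameters from alternating --name value pairs on the command line. A minimal standalone sketch of that idiom (not part of the commit; the sample invocation is hypothetical):

import sys

# pair up argv as (--name, value) couples and strip the leading dashes,
# mirroring the args line in excerpt.py above
args = dict(zip([name.lstrip('-') for name in sys.argv[1::2]], sys.argv[2::2]))

# e.g. invoked as: excerpt.py --width 200 --omit 'img p br' --target planet:excerpt
width  = int(args.get('width', '500'))
omit   = args.get('omit', '').split()
target = args.get('target', 'planet:excerpt')
print('%d %s %s' % (width, omit, target))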
@@ -1,5 +1,5 @@
 import sys
-from planet import html5lib
+import html5lib
 tree=html5lib.treebuilders.dom.TreeBuilder
 parser = html5lib.html5parser.HTMLParser(tree=tree)
 document = parser.parse(sys.stdin)
@@ -23,9 +23,10 @@ from xml.sax.saxutils import escape
 from htmlentitydefs import entitydefs
 
 import planet
-from planet import config, feedparser
+from planet import config
 from planet.spider import filename
-log = planet.getLogger(config.log_level(),config.log_format())
+import feedparser
+log = planet.logger
 options = config.filter_options(sys.argv[0])
 
 MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
@@ -64,6 +65,7 @@ def cache_meme(url, headers):
     file.close()
 
 urlmap = {}
+revmap = {}
 def canonicalize(url):
     url = urlmap.get(url,url)
     parts = list(urlparse.urlparse(url))
@@ -73,7 +75,10 @@ def canonicalize(url):
     if parts[1].startswith('www.'): parts[1]=parts[1][4:]
     if not parts[2]: parts[2] = '/'
     parts[-1] = ''
-    return urlparse.urlunparse(parts)
+
+    canonurl = urlparse.urlunparse(parts)
+    revmap[canonurl] = url
+    return canonurl
 
 log.debug("Loading cached data")
 for name in glob.glob(os.path.join(cache, '*')):
@@ -125,7 +130,7 @@ for name in glob.glob(os.path.join(cache, '*')):
 
     # identify the unique links
     entry_links = []
-    for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
+    for node in doc.xpathEval("//*[@href and not(@rel='source') and not(@rel='license')]"):
        parent = node.parent
        while parent:
            if parent.name == 'source': break
@@ -309,7 +314,7 @@ meme_feed.newTextChild(None, 'updated',
 
 # parse the input
 log.debug("Parse input")
-doc=libxml2.parseDoc(sys.stdin.read())
+doc=libxml2.readDoc(sys.stdin.read(), '', 'utf-8', libxml2.XML_PARSE_NONET)
 
 # find the sidebar/footer
 sidebar = options.get('sidebar','//*[@class="sidebar"]')
@@ -340,7 +345,7 @@ while child:
         if not title: continue
         li2 = ul2.newChild(None, 'li', None)
         a = li2.newTextChild(None, 'a', title)
-        a.setProp('href', entry)
+        a.setProp('href', revmap.get(entry,entry))
         link_count = link_count + 1
         if link_count >= 10: break
     if link_count > 0: state = None
@@ -388,7 +393,7 @@ for i in range(0,len(weighted_links)):
 
     # otherwise, parse the html
     if not title:
-        title = html(link).title
+        title = html(revmap.get(link,link)).title
 
     # dehtmlize
     title = re.sub('&(\w+);',
@@ -421,7 +426,7 @@ for i in range(0,len(weighted_links)):
 
     # main link
     a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
-    a.setProp('href',link)
+    a.setProp('href',revmap.get(link,link))
     if (((i==0) or (updated>=weighted_links[i-1][2])) and
         (i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
         rank = 0
@@ -437,7 +442,7 @@ for i in range(0,len(weighted_links)):
         if entry in voters: continue
         li2 = ul2.newChild(None, 'li', None)
         a = li2.newTextChild(None, 'a' , author)
-        a.setProp('href',entry)
+        a.setProp('href',revmap.get(entry,entry))
         if title: a.setProp('title',title)
         voters.append(entry)
filters/minhead.py (new file, 36 lines)

@@ -0,0 +1,36 @@
+#
+# Ensure that all headings are below a permissible maximum (like h3).
+# If not, all heading levels will be changed to conform.
+# Note: this may create "illegal" heading levels, like h7 and beyond.
+#
+
+import sys
+from xml.dom import minidom, XHTML_NAMESPACE
+
+# determine permissible minimum heading
+if '--min' in sys.argv:
+  minhead = int(sys.argv[sys.argv.index('--min')+1])
+else:
+  minhead=3
+
+# parse input stream
+doc = minidom.parse(sys.stdin)
+
+# search for headings below the permissible minimum
+first=minhead
+for i in range(1,minhead):
+  if doc.getElementsByTagName('h%d' % i):
+    first=i
+    break
+
+# if found, bump all headings so that the top is the permissible minimum
+if first < minhead:
+  for i in range(6,0,-1):
+    for oldhead in doc.getElementsByTagName('h%d' % i):
+      newhead = doc.createElementNS(XHTML_NAMESPACE, 'h%d' % (i+minhead-first))
+      for child in oldhead.childNodes:
+        newhead.appendChild(child)
+      oldhead.parentNode.replaceChild(newhead, oldhead)
+
+# return (possibly modified) document
+print doc.toxml('utf-8')
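A quick way to see the new filter in action is to pipe a fragment through it. A minimal sketch, assuming filters/minhead.py from this commit is on disk and a Python 2 interpreter is available (the sample markup is hypothetical):

import subprocess

# with a permissible minimum of 3, h1/h2 should come back as h3/h4
doc = '<div><h1>Top</h1><h2>Sub</h2></div>'
pipe = subprocess.Popen(['python', 'filters/minhead.py', '--min', '3'],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE)
print(pipe.communicate(doc)[0])   # roughly: <div><h3>Top</h3><h4>Sub</h4></div>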
filters/xhtml2html.plugin (new file, 24 lines)

@@ -0,0 +1,24 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+#   quote_attr_values=True
+#   quote_char="'"
+
+import sys
+opts = {}
+for name,value in zip(sys.argv[1::2],sys.argv[2::2]):
+    name = name.lstrip('-')
+    try: opts[name] = eval(value)
+    except: opts[name] = value
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+    sys.stdout.write(text)
@@ -1,5 +0,0 @@
-import sys
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
@@ -1,6 +1,7 @@
 xmlns = 'http://planet.intertwingly.net/'
 
 logger = None
+loggerParms = None
 
 import os, sys, re
 import config
@@ -11,8 +12,8 @@ from urlparse import urljoin
 
 def getLogger(level, format):
     """ get a logger with the specified log level """
-    global logger
-    if logger: return logger
+    global logger, loggerParms
+    if logger and loggerParms == (level,format): return logger
 
     try:
         import logging
@@ -21,16 +22,19 @@ def getLogger(level, format):
         import compat_logging as logging
     logging.basicConfig(format=format)
 
-    logging.getLogger().setLevel(logging.getLevelName(level))
     logger = logging.getLogger("planet.runner")
+    logger.setLevel(logging.getLevelName(level))
     try:
         logger.warning
     except:
         logger.warning = logger.warn
 
+    loggerParms = (level,format)
     return logger
 
+sys.path.insert(1, os.path.join(os.path.dirname(__file__),'vendor'))
+
 # Configure feed parser
-from planet import feedparser
+import feedparser
 feedparser.SANITIZE_HTML=0
 feedparser.RESOLVE_RELATIVE_URIS=0
@@ -138,8 +138,10 @@ def load(config_file):
     parser.read(config_file)
 
     import config, planet
-    from planet import opml, foaf
-    log = planet.getLogger(config.log_level(),config.log_format())
+    from planet import opml, foaf, csv_config
+    log = planet.logger
+    if not log:
+        log = planet.getLogger(config.log_level(),config.log_format())
 
     # Theme support
     theme = config.output_theme()
@@ -191,18 +193,22 @@ def load(config_file):
             os.makedirs(config.cache_lists_directory())
 
         def data2config(data, cached_config):
-            if content_type(list).find('opml')>=0:
-                opml.opml2config(data, cached_config)
-            elif content_type(list).find('foaf')>=0:
-                foaf.foaf2config(data, cached_config)
-            else:
-                from planet import shell
-                import StringIO
-                cached_config.readfp(StringIO.StringIO(shell.run(
-                    content_type(list), data.getvalue(), mode="filter")))
+            if content_type(list).find('opml')>=0:
+                opml.opml2config(data, cached_config)
+            elif content_type(list).find('foaf')>=0:
+                foaf.foaf2config(data, cached_config)
+            elif content_type(list).find('csv')>=0:
+                csv_config.csv2config(data, cached_config)
+            elif content_type(list).find('config')>=0:
+                cached_config.readfp(data)
+            else:
+                from planet import shell
+                import StringIO
+                cached_config.readfp(StringIO.StringIO(shell.run(
+                    content_type(list), data.getvalue(), mode="filter")))
 
-            if cached_config.sections() in [[], [list]]:
-                raise Exception
+            if cached_config.sections() in [[], [list]]:
+                raise Exception
 
         for list in reading_lists:
             downloadReadingList(list, parser, data2config)
@@ -344,7 +350,9 @@ def reading_lists():
     for section in parser.sections():
         if parser.has_option(section, 'content_type'):
             type = parser.get(section, 'content_type')
-            if type.find('opml')>=0 or type.find('foaf')>=0 or type.find('.')>=0:
+            if type.find('opml')>=0 or type.find('foaf')>=0 or \
+               type.find('csv')>=0 or type.find('config')>=0 or \
+               type.find('.')>=0:
                 result.append(section)
     return result
planet/csv_config.py (new executable file, 29 lines)

@@ -0,0 +1,29 @@
+from ConfigParser import ConfigParser
+import csv
+
+# input = csv, output = ConfigParser
+def csv2config(input, config=None):
+
+    if not hasattr(input, 'read'):
+        input = csv.StringIO(input)
+
+    if not config:
+        config = ConfigParser()
+
+    reader = csv.DictReader(input)
+    for row in reader:
+        section = row[reader.fieldnames[0]]
+        config.add_section(section)
+        for name, value in row.items():
+            if value and name != reader.fieldnames[0]:
+                config.set(section, name, value)
+
+    return config
+
+if __name__ == "__main__":
+    # small main program which converts CSV into config.ini format
+    import sys, urllib
+    config = ConfigParser()
+    for input in sys.argv[1:]:
+        csv2config(urllib.urlopen(input), config)
+    config.write(sys.stdout)
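The first CSV column supplies the section name and every other non-empty cell becomes an option, so each reading-list row maps directly onto one subscription section. A minimal sketch using the new module (Python 2, since csv.StringIO is the py2 csv module's bundled StringIO; the feed URL is hypothetical):

from planet.csv_config import csv2config

sample = "url,name,filters\nhttp://example.com/feed,Example Feed,excerpt.py\n"
config = csv2config(sample)
print(config.sections())                              # ['http://example.com/feed']
print(config.get('http://example.com/feed', 'name'))  # Example Feed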
@@ -5,8 +5,7 @@ from spider import filename
 
 def expungeCache():
     """ Expunge old entries from a cache of entries """
-    import planet
-    log = planet.getLogger(config.log_level(),config.log_format())
+    log = planet.logger
 
     log.info("Determining feed subscriptions")
     entry_count = {}
@@ -35,13 +35,13 @@ def load_model(rdf, base_uri):
     return model
 
 # input = foaf, output = ConfigParser
-def foaf2config(rdf, config, subject=None):
+def foaf2config(rdf, config, subject=None, section=None):
 
     if not config or not config.sections():
         return
 
     # there should only be 1 section
-    section = config.sections().pop()
+    if not section: section = config.sections().pop()
 
     try:
         from RDF import Model, NS, Parser, Statement
@@ -191,6 +191,7 @@ if __name__ == "__main__":
 
     for uri in sys.argv[1:]:
         config.add_section(uri)
-        foaf2config(urllib.urlopen(uri), config)
+        foaf2config(urllib.urlopen(uri), config, section=uri)
+        config.remove_section(uri)
 
     config.write(sys.stdout)
@@ -1,42 +0,0 @@
-"""A collection of modules for building different kinds of tree from
-HTML documents.
-
-To create a treebuilder for a new type of tree, you need to do
-implement several things:
-
-1) A set of classes for various types of elements: Document, Doctype,
-Comment, Element. These must implement the interface of
-_base.treebuilders.Node (although comment nodes have a different
-signature for their constructor, see treebuilders.simpletree.Comment)
-Textual content may also be implemented as another node type, or not, as
-your tree implementation requires.
-
-2) A treebuilder object (called TreeBuilder by convention) that
-inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
-documentClass - the class to use for the bottommost node of a document
-elementClass - the class to use for HTML Elements
-commentClass - the class to use for comments
-doctypeClass - the class to use for doctypes
-It also has one required method:
-getDocument - Returns the root node of the complete document tree
-
-3) If you wish to run the unit tests, you must also create a
-testSerializer method on your treebuilder which accepts a node and
-returns a string containing Node and its children serialized according
-to the format used in the unittests
-
-The supplied simpletree module provides a python-only implementation
-of a full treebuilder and is a useful reference for the semantics of
-the various methods.
-"""
-
-import os.path
-__path__.append(os.path.dirname(__path__[0]))
-
-import dom
-import simpletree
-
-try:
-    import etree
-except:
-    pass
@@ -1,5 +0,0 @@
-import etreefull
-
-class TreeBuilder(etreefull.TreeBuilder):
-    def getDocument(self):
-        return self.document._element.find("html")
@@ -1,227 +0,0 @@
-try:
-    from xml.etree import ElementTree
-except ImportError:
-    try:
-        from elementtree import ElementTree
-    except:
-        pass
-
-import _base
-
-class Element(_base.Node):
-    def __init__(self, name):
-        self._element = ElementTree.Element(name)
-        self.name = name
-        self.parent = None
-        self._childNodes = []
-        self._flags = []
-
-    def _setName(self, name):
-        self._element.tag = name
-
-    def _getName(self):
-        return self._element.tag
-
-    name = property(_getName, _setName)
-
-    def _getAttributes(self):
-        return self._element.attrib
-
-    def _setAttributes(self, attributes):
-        #Delete existing attributes first
-        #XXX - there may be a better way to do this...
-        for key in self._element.attrib.keys():
-            del self._element.attrib[key]
-        for key, value in attributes.iteritems():
-            self._element.set(key, value)
-
-    attributes = property(_getAttributes, _setAttributes)
-
-    def _getChildNodes(self):
-        return self._childNodes
-
-    def _setChildNodes(self, value):
-        del self._element[:]
-        self._childNodes = []
-        for element in value:
-            self.insertChild(element)
-
-    childNodes = property(_getChildNodes, _setChildNodes)
-
-    def hasContent(self):
-        """Return true if the node has children or text"""
-        return bool(self._element.text or self._element.getchildren())
-
-    def appendChild(self, node):
-        self._childNodes.append(node)
-        self._element.append(node._element)
-        node.parent = self
-
-    def insertBefore(self, node, refNode):
-        index = self._element.getchildren().index(refNode._element)
-        self._element.insert(index, node._element)
-        node.parent = self
-
-    def removeChild(self, node):
-        self._element.remove(node._element)
-        node.parent=None
-
-    def insertText(self, data, insertBefore=None):
-        if not(len(self._element)):
-            if not self._element.text:
-                self._element.text = ""
-            self._element.text += data
-        elif insertBefore is None:
-            #Insert the text as the tail of the last child element
-            if not self._element[-1].tail:
-                self._element[-1].tail = ""
-            self._element[-1].tail += data
-        else:
-            #Insert the text before the specified node
-            children = self._element.getchildren()
-            index = children.index(insertBefore._element)
-            if index > 0:
-                if not self._element[index-1].tail:
-                    self._element[index-1].tail = ""
-                self._element[index-1].tail += data
-            else:
-                if not self._element.text:
-                    self._element.text = ""
-                self._element.text += data
-
-    def cloneNode(self):
-        element = Element(self.name)
-        element.attributes = self.attributes
-        return element
-
-    def reparentChildren(self, newParent):
-        if newParent.childNodes:
-            newParent.childNodes[-1]._element.tail += self._element.text
-        else:
-            if not newParent._element.text:
-                newParent._element.text = ""
-            if self._element.text is not None:
-                newParent._element.text += self._element.text
-        self._element.text = ""
-        _base.Node.reparentChildren(self, newParent)
-
-class Comment(Element):
-    def __init__(self, data):
-        #Use the superclass constructor to set all properties on the
-        #wrapper element
-        Element.__init__(self, None)
-        self._element = ElementTree.Comment(data)
-
-    def _getData(self):
-        return self._element.text
-
-    def _setData(self, value):
-        self._element.text = value
-
-    data = property(_getData, _setData)
-
-class DocumentType(Element):
-    def __init__(self, name):
-        Element.__init__(self, DocumentType)
-        self._element.text = name
-
-class Document(Element):
-    def __init__(self):
-        Element.__init__(self, Document)
-
-class DocumentFragment(Element):
-    def __init__(self):
-        Element.__init__(self, DocumentFragment)
-
-def testSerializer(element):
-    rv = []
-    finalText = None
-    def serializeElement(element, indent=0):
-        if element.tag is DocumentType:
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
-        elif element.tag is Document:
-            rv.append("#document")
-            if element.text:
-                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
-            if element.tail:
-                finalText = element.tail
-        elif element.tag is ElementTree.Comment:
-            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
-        else:
-            rv.append("|%s<%s>"%(' '*indent, element.tag))
-            if hasattr(element, "attrib"):
-                for name, value in element.attrib.iteritems():
-                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
-            if element.text:
-                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
-        indent += 2
-        for child in element.getchildren():
-            serializeElement(child, indent)
-        if element.tail:
-            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
-    serializeElement(element, 0)
-
-    if finalText is not None:
-        rv.append("|%s\"%s\""%(' '*2, finalText))
-
-    return "\n".join(rv)
-
-def tostring(element):
-    """Serialize an element and its child nodes to a string"""
-    rv = []
-    finalText = None
-    def serializeElement(element):
-        if element.tag is DocumentType:
-            rv.append("<!DOCTYPE %s>"%(element.text,))
-        elif element.tag is Document:
-            if element.text:
-                rv.append(element.text)
-            if element.tail:
-                finalText = element.tail
-
-            for child in element.getchildren():
-                serializeElement(child)
-
-        elif element.tag is ElementTree.Comment:
-            rv.append("<!--%s-->"%(element.text,))
-        else:
-            #This is assumed to be an ordinary element
-            if not element.attrib:
-                rv.append("<%s>"%(element.tag,))
-            else:
-                attr = " ".join(["%s=\"%s\""%(name, value)
-                    for name, value in element.attrib.iteritems()])
-                rv.append("<%s %s>"%(element.tag, attr))
-            if element.text:
-                rv.append(element.text)
-
-            for child in element.getchildren():
-                serializeElement(child)
-
-            rv.append("</%s>"%(element.tag,))
-
-        if element.tail:
-            rv.append(element.tail)
-
-    serializeElement(element)
-
-    if finalText is not None:
-        rv.append("%s\""%(' '*2, finalText))
-
-    return "".join(rv)
-
-class TreeBuilder(_base.TreeBuilder):
-    documentClass = Document
-    doctypeClass = DocumentType
-    elementClass = Element
-    commentClass = Comment
-    fragmentClass = DocumentFragment
-
-    def testSerializer(self, element):
-        return testSerializer(element)
-
-    def getDocument(self):
-        return self.document._element
-
-    def getFragment(self):
-        return _base.TreeBuilder.getFragment(self)._element
@@ -1,5 +1,5 @@
 from glob import glob
-import os, sys, dbhash
+import os, sys
 
 if __name__ == '__main__':
     rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -13,6 +13,7 @@ def open():
         cache = config.cache_directory()
         index=os.path.join(cache,'index')
         if not os.path.exists(index): return None
+        import dbhash
         return dbhash.open(filename(index, 'id'),'w')
     except Exception, e:
         if e.__class__.__name__ == 'DBError': e = e.args[-1]
@@ -34,6 +35,7 @@ def create():
     cache = config.cache_directory()
     index=os.path.join(cache,'index')
     if not os.path.exists(index): os.makedirs(index)
+    import dbhash
     index = dbhash.open(filename(index, 'id'),'c')
 
     try:
@@ -16,7 +16,8 @@ Todo:
 import re, time, md5, sgmllib
 from xml.sax.saxutils import escape
 from xml.dom import minidom, Node
-from planet.html5lib import liberalxmlparser, treebuilders
+from html5lib import liberalxmlparser
+from html5lib.treebuilders import dom
 import planet, config
 
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -106,12 +107,12 @@ def date(xentry, name, parsed):
     formatted = time.strftime("%Y-%m-%dT%H:%M:%SZ", parsed)
     xdate = createTextElement(xentry, name, formatted)
     formatted = time.strftime(config.date_format(), parsed)
-    xdate.setAttribute('planet:format', formatted)
+    xdate.setAttribute('planet:format', formatted.decode('utf-8'))
 
 def category(xentry, tag):
     xtag = xentry.ownerDocument.createElement('category')
-    if tag.has_key('term') and tag.term:
-        xtag.setAttribute('term', tag.get('term'))
+    if not tag.has_key('term') or not tag.term: return
+    xtag.setAttribute('term', tag.get('term'))
     if tag.has_key('scheme') and tag.scheme:
         xtag.setAttribute('scheme', tag.get('scheme'))
     if tag.has_key('label') and tag.label:
@@ -124,7 +125,11 @@ def author(xentry, name, detail):
     xdoc = xentry.ownerDocument
     xauthor = xdoc.createElement(name)
 
-    createTextElement(xauthor, 'name', detail.get('name', None))
+    if detail.get('name', None):
+        createTextElement(xauthor, 'name', detail.get('name'))
+    else:
+        xauthor.appendChild(xdoc.createElement('name'))
 
     createTextElement(xauthor, 'email', detail.get('email', None))
     createTextElement(xauthor, 'uri', detail.get('href', None))
 
@@ -150,7 +155,7 @@ def content(xentry, name, detail, bozo):
         data = minidom.parseString(xdiv % detail.value).documentElement
         xcontent.setAttribute('type', 'xhtml')
     else:
-        parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
+        parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
         html = parser.parse(xdiv % detail.value, encoding="utf-8")
         for body in html.documentElement.childNodes:
             if body.nodeType != Node.ELEMENT_NODE: continue
@@ -232,7 +237,7 @@ def reconstitute(feed, entry):
     links(xentry, entry)
 
     bozo = feed.bozo
-    if not entry.has_key('title'):
+    if not entry.has_key('title') or not entry.title:
         xentry.appendChild(xdoc.createElement('title'))
 
     content(xentry, 'title', entry.get('title_detail',None), bozo)
@@ -6,7 +6,7 @@ logged_modes = []
 
 def run(template_file, doc, mode='template'):
     """ select a template module based on file extension and execute it """
-    log = planet.getLogger(planet.config.log_level(),planet.config.log_format())
+    log = planet.logger
 
     if mode == 'template':
         dirs = planet.config.template_directories()
@@ -40,7 +40,9 @@ def run(script, doc, output_file=None, options={}):
         reluri = os.path.splitext(os.path.basename(output_file))[0]
         context['url'] = urlparse.urljoin(config.link(),reluri)
         f = open(output_file, 'w')
-        f.write(t.render(context))
+        ss = t.render(context)
+        if isinstance(ss,unicode): ss=ss.encode('utf-8')
+        f.write(ss)
         f.close()
     else:
         # @@this is useful for testing purposes, but does it
@@ -1,6 +1,10 @@
 from xml.sax.saxutils import escape
-import sgmllib, time, os, sys, new, urlparse
-from planet import config, feedparser, htmltmpl
+import sgmllib, time, os, sys, new, urlparse, re
+from planet import config, feedparser
+import htmltmpl
+
+voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
+empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
 
 class stripHtml(sgmllib.SGMLParser):
     "remove all tags from the data"
@@ -130,9 +134,12 @@ def tmpl_mapper(source, rules):
         node = source
         for path in rule[2:]:
             if isinstance(path, str) and path in node:
-                if path == 'value' and node.get('type','')=='text/plain':
-                    node['value'] = escape(node['value'])
-                    node['type'] = 'text/html'
+                if path == 'value':
+                    if node.get('type','')=='text/plain':
+                        node['value'] = escape(node['value'])
+                        node['type'] = 'text/html'
+                    elif node.get('type','')=='application/xhtml+xml':
+                        node['value'] = empty.sub(r"<\1 />", node['value'])
                 node = node[path]
             elif isinstance(path, int):
                 node = node[path]
@@ -52,6 +52,7 @@ def run(script, doc, output_file=None, options={}):
 
     cmdopts = []
     for key,value in options.items():
+        if value.find("'")>=0 and value.find('"')>=0: continue
        cmdopts += ['--stringparam', key, quote(value, apos=r"\'")]
 
     os.system('xsltproc %s %s %s > %s' %
@@ -80,16 +80,23 @@ def writeCache(feed_uri, feed_info, data):
 
     # process based on the HTTP status code
     if data.status == 200 and data.has_key("url"):
-        data.feed['planet_http_location'] = data.url
-        if feed_uri == data.url:
+        feed_info.feed['planet_http_location'] = data.url
+        if data.has_key("entries") and len(data.entries) == 0:
+            log.warning("No data %s", feed_uri)
+            feed_info.feed['planet_message'] = 'no data'
+        elif feed_uri == data.url:
             log.info("Updating feed %s", feed_uri)
         else:
             log.info("Updating feed %s @ %s", feed_uri, data.url)
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
         log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
         data.feed['planet_http_location'] = data.url
-    elif data.status == 304:
-        log.info("Feed %s unchanged", feed_uri)
+    elif data.status == 304 and data.has_key("url"):
+        feed_info.feed['planet_http_location'] = data.url
+        if feed_uri == data.url:
+            log.info("Feed %s unchanged", feed_uri)
+        else:
+            log.info("Feed %s unchanged @ %s", feed_uri, data.url)
 
     if not feed_info.feed.has_key('planet_message'):
         if feed_info.feed.has_key('planet_updated'):
@@ -99,7 +106,9 @@ def writeCache(feed_uri, feed_info, data):
         else:
             if feed_info.feed.planet_message.startswith("no activity in"):
                 return
-            del feed_info.feed['planet_message']
+            if not feed_info.feed.planet_message.startswith("duplicate") and \
+               not feed_info.feed.planet_message.startswith("no data"):
+                del feed_info.feed['planet_message']
 
     elif data.status == 410:
         log.info("Feed %s gone", feed_uri)
@@ -154,15 +163,28 @@ def writeCache(feed_uri, feed_info, data):
         from planet import idindex
         global index
         if index != None: index = idindex.open()
 
-    # write each entry to the cache
-    cache = config.cache_directory()
+    # select latest entry for each unique id
+    ids = {}
     for entry in data.entries:
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
             if not entry['id']: continue
 
+        # determine updated date for purposes of selection
+        updated = ''
+        if entry.has_key('published'): updated=entry.published
+        if entry.has_key('updated'):   updated=entry.updated
+
+        # if not seen or newer than last seen, select it
+        if updated >= ids.get(entry.id,('',))[0]:
+            ids[entry.id] = (updated, entry)
+
+    # write each entry to the cache
+    cache = config.cache_directory()
+    for updated, entry in ids.values():
+
         # compute cache file name based on the id
         cache_file = filename(cache, entry.id)
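The selection pass above keeps only the newest entry per id before anything is written to the cache; since the dates are ISO 8601 strings they compare correctly as plain strings. A standalone sketch of the same bookkeeping (sample entries are hypothetical):

entries = [
    {'id': 'tag:example.com,2007:1', 'updated': '2007-10-01T00:00:00Z'},
    {'id': 'tag:example.com,2007:1', 'updated': '2007-10-05T00:00:00Z'},
    {'id': 'tag:example.com,2007:2', 'updated': '2007-10-03T00:00:00Z'},
]

# if an id has not been seen, or this entry is newer, select it
ids = {}
for entry in entries:
    if entry.get('updated', '') >= ids.get(entry['id'], ('',))[0]:
        ids[entry['id']] = (entry.get('updated', ''), entry)

for updated, entry in ids.values():
    print('%s %s' % (updated, entry['id']))   # one line per unique id; id :1 keeps the newer date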
@@ -329,7 +351,7 @@ def httpThread(thread_index, input_queue, output_queue, log):
 
 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
-    log = planet.getLogger(config.log_level(),config.log_format())
+    log = planet.logger
 
     global index
     index = True
@@ -340,7 +362,7 @@ def spiderPlanet(only_if_new = False):
         log.info("Socket timeout set to %d seconds", timeout)
     except:
         try:
-            from planet import timeoutsocket
+            import timeoutsocket
             timeoutsocket.setDefaultSocketTimeout(float(timeout))
             log.info("Socket timeout set to %d seconds", timeout)
         except:
@@ -392,6 +414,7 @@ def spiderPlanet(only_if_new = False):
         fetch_queue.put(item=(None, None))
 
     # Process the results as they arrive
+    feeds_seen = {}
     while fetch_queue.qsize() or parse_queue.qsize() or threads:
         while parse_queue.qsize() == 0 and threads:
             time.sleep(0.1)
@@ -415,8 +438,33 @@ def spiderPlanet(only_if_new = False):
                 else:
                     data = feedparser.FeedParserDict({'version': None,
                         'headers': feed.headers, 'entries': [], 'feed': {},
-                        'bozo': 0, 'status': int(feed.headers.status)})
+                        'href': feed.url, 'bozo': 0,
+                        'status': int(feed.headers.status)})
+
+                # duplicate feed?
+                id = data.feed.get('id', None)
+                if not id: id = feed_info.feed.get('id', None)
+
+                href=uri
+                if data.has_key('href'): href=data.href
+
+                duplicate = None
+                if id and id in feeds_seen:
+                    duplicate = id
+                elif href and href in feeds_seen:
+                    duplicate = href
+
+                if duplicate:
+                    feed_info.feed['planet_message'] = \
+                        'duplicate subscription: ' + feeds_seen[duplicate]
+                    log.warn('Duplicate subscription: %s and %s' %
+                        (uri, feeds_seen[duplicate]))
+                    if href: feed_info.feed['planet_http_location'] = href
+
+                if id: feeds_seen[id] = uri
+                if href: feeds_seen[href] = uri
 
                 # complete processing for the feed
                 writeCache(uri, feed_info, data)
 
             except Exception, e:
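Duplicate detection keys the feeds_seen map by both feed id and resolved href, and the lookup happens before the current subscription is recorded, so the first subscription wins. A compact sketch of that ordering (the URIs are hypothetical):

feeds_seen = {}

def first_claimant(uri, id=None, href=None):
    # look up first ...
    duplicate = None
    if id and id in feeds_seen:
        duplicate = id
    elif href and href in feeds_seen:
        duplicate = href
    original = duplicate and feeds_seen[duplicate]
    # ... record afterwards
    if id: feeds_seen[id] = uri
    if href: feeds_seen[href] = uri
    return original

print(first_claimant('http://a.example/atom', id='tag:x', href='http://x.example/'))  # None
print(first_claimant('http://b.example/atom', id='tag:x', href='http://x.example/'))  # http://a.example/atom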
@@ -9,7 +9,7 @@ from planet import idindex
 def splice():
     """ Splice together a planet from a cache of entries """
     import planet
-    log = planet.getLogger(config.log_level(),config.log_format())
+    log = planet.logger
 
     log.info("Loading cached data")
     cache = config.cache_directory()
@@ -109,7 +109,7 @@ def splice():
 def apply(doc):
     output_dir = config.output_dir()
     if not os.path.exists(output_dir): os.makedirs(output_dir)
-    log = planet.getLogger(config.log_level(),config.log_format())
+    log = planet.logger
 
     planet_filters = config.filters('Planet')
@@ -11,7 +11,7 @@
 Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """
 
-__version__ = "4.2-pre-" + "$Revision: 262 $"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 270 $"[11:14] + "-svn"
 __license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -466,6 +466,7 @@ class _FeedParserMixin:
         self.baseuri = baseuri or ''
         self.lang = baselang or None
         self.svgOK = 0
+        self.hasTitle = 0
         if baselang:
             self.feeddata['language'] = baselang.replace('_','-')
 
@@ -478,6 +479,11 @@ class _FeedParserMixin:
         # track xml:base and xml:lang
         attrsD = dict(attrs)
         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+        if type(baseuri) != type(u''):
+            try:
+                baseuri = unicode(baseuri, self.encoding)
+            except:
+                baseuri = unicode(baseuri, 'iso-8859-1')
         self.baseuri = _urljoin(self.baseuri, baseuri)
         lang = attrsD.get('xml:lang', attrsD.get('lang'))
         if lang == '':
@@ -502,6 +508,7 @@ class _FeedParserMixin:
 
         # track inline content
         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
             # element declared itself as escaped markup, but it isn't really
             self.contentparams['type'] = 'application/xhtml+xml'
         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
@@ -569,6 +576,7 @@ class _FeedParserMixin:
         # track inline content
         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
             # element declared itself as escaped markup, but it isn't really
+            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
             self.contentparams['type'] = 'application/xhtml+xml'
         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
             tag = tag.split(':')[-1]
@@ -794,6 +802,9 @@ class _FeedParserMixin:
         # categories/tags/keywords/whatever are handled in _end_category
         if element == 'category':
             return output
+
+        if element == 'title' and self.hasTitle:
+            return output
 
         # store output in appropriate place(s)
         if self.inentry and not self.insource:
@@ -960,6 +971,7 @@ class _FeedParserMixin:
         context = self._getContext()
         context.setdefault('image', FeedParserDict())
         self.inimage = 1
+        self.hasTitle = 0
         self.push('image', 0)
 
     def _end_image(self):
@@ -970,6 +982,7 @@ class _FeedParserMixin:
         context = self._getContext()
         context.setdefault('textinput', FeedParserDict())
         self.intextinput = 1
+        self.hasTitle = 0
         self.push('textinput', 0)
     _start_textInput = _start_textinput
 
@@ -1182,6 +1195,7 @@ class _FeedParserMixin:
         self.push('item', 0)
         self.inentry = 1
         self.guidislink = 0
+        self.hasTitle = 0
         id = self._getAttribute(attrsD, 'rdf:about')
         if id:
             context = self._getContext()
@@ -1376,8 +1390,13 @@ class _FeedParserMixin:
         value = self.popContent('title')
         if not value: return
         context = self._getContext()
+        self.hasTitle = 1
     _end_dc_title = _end_title
-    _end_media_title = _end_title
+
+    def _end_media_title(self):
+        hasTitle = self.hasTitle
+        self._end_title()
+        self.hasTitle = hasTitle
 
     def _start_description(self, attrsD):
         context = self._getContext()
@@ -1466,6 +1485,7 @@ class _FeedParserMixin:
 
     def _start_source(self, attrsD):
         self.insource = 1
+        self.hasTitle = 0
 
     def _end_source(self):
         self.insource = 0
@@ -2287,7 +2307,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
       'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
       'xml:lang']
 
-    unacceptable_elements_with_end_tag = ['script', 'applet']
+    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
 
     acceptable_css_properties = ['azimuth', 'background-color',
       'border-bottom-color', 'border-collapse', 'border-color',
@@ -2410,7 +2430,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
             acceptable_attributes = self.svg_attributes
             tag = self.svg_elem_map.get(tag,tag)
            keymap = self.svg_attr_map
-        else:
+        elif not tag in self.acceptable_elements:
             return
 
         # declare xlink namespace, if needed
@@ -3290,11 +3310,15 @@ def _stripDoctype(data):
     rss_version may be 'rss091n' or None
     stripped_data is the same XML document, minus the DOCTYPE
     '''
-    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
-    entity_results=entity_pattern.findall(data)
-    data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
-    doctype_results = doctype_pattern.findall(data)
+    start = re.search('<\w',data)
+    start = start and start.start() or -1
+    head,data = data[:start+1], data[start+1:]
+
+    entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
+    entity_results=entity_pattern.findall(head)
+    head = entity_pattern.sub('', head)
+    doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
+    doctype_results = doctype_pattern.findall(head)
     doctype = doctype_results and doctype_results[0] or ''
     if doctype.lower().count('netscape'):
         version = 'rss091n'
@@ -3308,7 +3332,7 @@ def _stripDoctype(data):
     safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
     if safe_entities:
         replacement='<!DOCTYPE feed [\n  <!ENTITY %s>\n]>' % '>\n  <!ENTITY '.join(safe_entities)
-    data = doctype_pattern.sub(replacement, data)
+    data = doctype_pattern.sub(replacement, head) + data
 
     return version, data, dict(replacement and safe_pattern.findall(replacement))
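The reworked _stripDoctype splits the document at the first element and only strips ENTITY/DOCTYPE declarations from that prolog, so escaped declarations inside content are left alone. A small sketch of the split (the sample XML is hypothetical):

import re

data = '<!ENTITY nickname "Sam">\n<feed><title>safe &lt;!ENTITY&gt; in text</title></feed>'

start = re.search(r'<\w', data)            # first real element
start = start and start.start() or -1
head, rest = data[:start+1], data[start+1:]

entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
print(entity_pattern.findall(head))        # [' nickname "Sam"']
print(entity_pattern.findall(rest))        # [] -- declarations past the prolog are ignored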
@@ -119,8 +119,8 @@ spaceCharacters = frozenset((
 tableInsertModeElements = frozenset((
     "table",
     "tbody",
-    "tfoot",
-    "thead",
+    "tfoot",
+    "thead",
     "tr"
 ))
 
@@ -133,7 +133,7 @@ hexDigits = frozenset(string.hexdigits)
 asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
                          for c in string.ascii_uppercase])
 
-# Heading elements need to be ordered
+# Heading elements need to be ordered
 headingElements = (
     "h1",
     "h2",
@@ -158,6 +158,38 @@ voidElements = frozenset((
     "input"
 ))
 
+cdataElements = frozenset(('title', 'textarea'))
+
+rcdataElements = frozenset((
+    'style',
+    'script',
+    'xmp',
+    'iframe',
+    'noembed',
+    'noframes',
+    'noscript'
+))
+
+booleanAttributes = {
+    "": frozenset(("irrelevant",)),
+    "style": frozenset(("scoped",)),
+    "img": frozenset(("ismap",)),
+    "audio": frozenset(("autoplay","controls")),
+    "video": frozenset(("autoplay","controls")),
+    "script": frozenset(("defer", "async")),
+    "details": frozenset(("open",)),
+    "datagrid": frozenset(("multiple", "disabled")),
+    "command": frozenset(("hidden", "disabled", "checked", "default")),
+    "menu": frozenset(("autosubmit",)),
+    "fieldset": frozenset(("disabled", "readonly")),
+    "option": frozenset(("disabled", "readonly", "selected")),
+    "optgroup": frozenset(("disabled", "readonly")),
+    "button": frozenset(("disabled", "autofocus")),
+    "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
+    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
+    "output": frozenset(("disabled", "readonly")),
+}
+
 # entitiesWindows1252 has to be _ordered_ and needs to have an index. It
 # therefore can't be a frozenset.
 entitiesWindows1252 = (
@@ -196,265 +228,372 @@ entitiesWindows1252
 )
 
 entities = {
+    "AElig;": u"\u00C6",
     "AElig": u"\u00C6",
-    "Aacute": u"\u00C1",
-    "Acirc": u"\u00C2",
-    "Agrave": u"\u00C0",
-    "Alpha": u"\u0391",
-    "Aring": u"\u00C5",
-    "Atilde": u"\u00C3",
-    "Auml": u"\u00C4",
-    "Beta": u"\u0392",
-    "Ccedil": u"\u00C7",
-    "Chi": u"\u03A7",
-    "Dagger": u"\u2021",
-    "Delta": u"\u0394",
-    "ETH": u"\u00D0",
-    "Eacute": u"\u00C9",
-    "Ecirc": u"\u00CA",
-    "Egrave": u"\u00C8",
-    "Epsilon": u"\u0395",
-    "Eta": u"\u0397",
-    "Euml": u"\u00CB",
-    "Gamma": u"\u0393",
-    "Iacute": u"\u00CD",
-    "Icirc": u"\u00CE",
-    "Igrave": u"\u00CC",
-    "Iota": u"\u0399",
-    "Iuml": u"\u00CF",
-    "Kappa": u"\u039A",
-    "Lambda": u"\u039B",
-    "Mu": u"\u039C",
-    "Ntilde": u"\u00D1",
-    "Nu": u"\u039D",
-    "OElig": u"\u0152",
-    "Oacute": u"\u00D3",
-    "Ocirc": u"\u00D4",
-    "Ograve": u"\u00D2",
-    "Omega": u"\u03A9",
-    "Omicron": u"\u039F",
-    "Oslash": u"\u00D8",
-    "Otilde": u"\u00D5",
-    "Ouml": u"\u00D6",
-    "Phi": u"\u03A6",
-    "Pi": u"\u03A0",
-    "Prime": u"\u2033",
-    "Psi": u"\u03A8",
-    "Rho": u"\u03A1",
-    "Scaron": u"\u0160",
-    "Sigma": u"\u03A3",
-    "THORN": u"\u00DE",
-    "Tau": u"\u03A4",
-    "Theta": u"\u0398",
-    "Uacute": u"\u00DA",
-    "Ucirc": u"\u00DB",
-    "Ugrave": u"\u00D9",
-    "Upsilon": u"\u03A5",
-    "Uuml": u"\u00DC",
-    "Xi": u"\u039E",
-    "Yacute": u"\u00DD",
-    "Yuml": u"\u0178",
-    "Zeta": u"\u0396",
-    "aacute": u"\u00E1",
-    "acirc": u"\u00E2",
-    "acute": u"\u00B4",
-    "aelig": u"\u00E6",
-    "agrave": u"\u00E0",
-    "alefsym": u"\u2135",
-    "alpha": u"\u03B1",
-    "amp": u"\u0026",
+    "AMP;": u"\u0026",
+    "AMP": u"\u0026",
-    "and": u"\u2227",
-    "ang": u"\u2220",
-    "apos": u"\u0027",
-    "aring": u"\u00E5",
-    "asymp": u"\u2248",
-    "atilde": u"\u00E3",
-    "auml": u"\u00E4",
-    "bdquo": u"\u201E",
-    "beta": u"\u03B2",
-    "brvbar": u"\u00A6",
-    "bull": u"\u2022",
-    "cap": u"\u2229",
-    "ccedil": u"\u00E7",
-    "cedil": u"\u00B8",
-    "cent": u"\u00A2",
-    "chi": u"\u03C7",
-    "circ": u"\u02C6",
-    "clubs": u"\u2663",
-    "cong": u"\u2245",
-    "copy": u"\u00A9",
+    "Aacute;": u"\u00C1",
+    "Aacute": u"\u00C1",
+    "Acirc;": u"\u00C2",
+    "Acirc": u"\u00C2",
+    "Agrave;": u"\u00C0",
+    "Agrave": u"\u00C0",
+    "Alpha;": u"\u0391",
+    "Aring;": u"\u00C5",
+    "Aring": u"\u00C5",
+    "Atilde;": u"\u00C3",
+    "Atilde": u"\u00C3",
+    "Auml;": u"\u00C4",
+    "Auml": u"\u00C4",
+    "Beta;": u"\u0392",
+    "COPY;": u"\u00A9",
+    "COPY": u"\u00A9",
-    "crarr": u"\u21B5",
-    "cup": u"\u222A",
-    "curren": u"\u00A4",
-    "dArr": u"\u21D3",
-    "dagger": u"\u2020",
-    "darr": u"\u2193",
-    "deg": u"\u00B0",
-    "delta": u"\u03B4",
-    "diams": u"\u2666",
-    "divide": u"\u00F7",
-    "eacute": u"\u00E9",
-    "ecirc": u"\u00EA",
-    "egrave": u"\u00E8",
-    "empty": u"\u2205",
-    "emsp": u"\u2003",
-    "ensp": u"\u2002",
-    "epsilon": u"\u03B5",
-    "equiv": u"\u2261",
-    "eta": u"\u03B7",
-    "eth": u"\u00F0",
-    "euml": u"\u00EB",
-    "euro": u"\u20AC",
-    "exist": u"\u2203",
-    "fnof": u"\u0192",
-    "forall": u"\u2200",
-    "frac12": u"\u00BD",
-    "frac14": u"\u00BC",
-    "frac34": u"\u00BE",
-    "frasl": u"\u2044",
-    "gamma": u"\u03B3",
-    "ge": u"\u2265",
-    "gt": u"\u003E",
+    "Ccedil;": u"\u00C7",
+    "Ccedil": u"\u00C7",
+    "Chi;": u"\u03A7",
+    "Dagger;": u"\u2021",
+    "Delta;": u"\u0394",
+    "ETH;": u"\u00D0",
+    "ETH": u"\u00D0",
+    "Eacute;": u"\u00C9",
+    "Eacute": u"\u00C9",
+    "Ecirc;": u"\u00CA",
+    "Ecirc": u"\u00CA",
+    "Egrave;": u"\u00C8",
+    "Egrave": u"\u00C8",
+    "Epsilon;": u"\u0395",
+    "Eta;": u"\u0397",
+    "Euml;": u"\u00CB",
+    "Euml": u"\u00CB",
+    "GT;": u"\u003E",
+    "GT": u"\u003E",
-    "hArr": u"\u21D4",
-    "harr": u"\u2194",
-    "hearts": u"\u2665",
-    "hellip": u"\u2026",
-    "iacute": u"\u00ED",
-    "icirc": u"\u00EE",
-    "iexcl": u"\u00A1",
-    "igrave": u"\u00EC",
-    "image": u"\u2111",
-    "infin": u"\u221E",
-    "int": u"\u222B",
-    "iota": u"\u03B9",
-    "iquest": u"\u00BF",
-    "isin": u"\u2208",
-    "iuml": u"\u00EF",
-    "kappa": u"\u03BA",
-    "lArr": u"\u21D0",
-    "lambda": u"\u03BB",
-    "lang": u"\u2329",
-    "laquo": u"\u00AB",
-    "larr": u"\u2190",
-    "lceil": u"\u2308",
-    "ldquo": u"\u201C",
-    "le": u"\u2264",
-    "lfloor": u"\u230A",
-    "lowast": u"\u2217",
-    "loz": u"\u25CA",
-    "lrm": u"\u200E",
-    "lsaquo": u"\u2039",
-    "lsquo": u"\u2018",
-    "lt": u"\u003C",
+    "Gamma;": u"\u0393",
+    "Iacute;": u"\u00CD",
+    "Iacute": u"\u00CD",
+    "Icirc;": u"\u00CE",
+    "Icirc": u"\u00CE",
+    "Igrave;": u"\u00CC",
+    "Igrave": u"\u00CC",
+    "Iota;": u"\u0399",
+    "Iuml;": u"\u00CF",
+    "Iuml": u"\u00CF",
+    "Kappa;": u"\u039A",
+    "LT;": u"\u003C",
+    "LT": u"\u003C",
-    "macr": u"\u00AF",
-    "mdash": u"\u2014",
-    "micro": u"\u00B5",
-    "middot": u"\u00B7",
-    "minus": u"\u2212",
-    "mu": u"\u03BC",
-    "nabla": u"\u2207",
-    "nbsp": u"\u00A0",
-    "ndash": u"\u2013",
-    "ne": u"\u2260",
-    "ni": u"\u220B",
-    "not": u"\u00AC",
-    "notin": u"\u2209",
-    "nsub": u"\u2284",
-    "ntilde": u"\u00F1",
-    "nu": u"\u03BD",
-    "oacute": u"\u00F3",
-    "ocirc": u"\u00F4",
-    "oelig": u"\u0153",
-    "ograve": u"\u00F2",
-    "oline": u"\u203E",
-    "omega": u"\u03C9",
-    "omicron": u"\u03BF",
-    "oplus": u"\u2295",
-    "or": u"\u2228",
-    "ordf": u"\u00AA",
-    "ordm": u"\u00BA",
-    "oslash": u"\u00F8",
-    "otilde": u"\u00F5",
-    "otimes": u"\u2297",
-    "ouml": u"\u00F6",
-    "para": u"\u00B6",
-    "part": u"\u2202",
-    "permil": u"\u2030",
-    "perp": u"\u22A5",
-    "phi": u"\u03C6",
-    "pi": u"\u03C0",
-    "piv": u"\u03D6",
-    "plusmn": u"\u00B1",
-    "pound": u"\u00A3",
-    "prime": u"\u2032",
-    "prod": u"\u220F",
-    "prop": u"\u221D",
-    "psi": u"\u03C8",
-    "quot": u"\u0022",
+    "Lambda;": u"\u039B",
+    "Mu;": u"\u039C",
+    "Ntilde;": u"\u00D1",
+    "Ntilde": u"\u00D1",
+    "Nu;": u"\u039D",
+    "OElig;": u"\u0152",
+    "Oacute;": u"\u00D3",
+    "Oacute": u"\u00D3",
+    "Ocirc;": u"\u00D4",
+    "Ocirc": u"\u00D4",
+    "Ograve;": u"\u00D2",
+    "Ograve": u"\u00D2",
+    "Omega;": u"\u03A9",
+    "Omicron;": u"\u039F",
+    "Oslash;": u"\u00D8",
+    "Oslash": u"\u00D8",
+    "Otilde;": u"\u00D5",
+    "Otilde": u"\u00D5",
+    "Ouml;": u"\u00D6",
+    "Ouml": u"\u00D6",
+    "Phi;": u"\u03A6",
+    "Pi;": u"\u03A0",
+    "Prime;": u"\u2033",
+    "Psi;": u"\u03A8",
+    "QUOT;": u"\u0022",
+    "QUOT": u"\u0022",
-    "rArr": u"\u21D2",
-    "radic": u"\u221A",
-    "rang": u"\u232A",
-    "raquo": u"\u00BB",
-    "rarr": u"\u2192",
-    "rceil": u"\u2309",
-    "rdquo": u"\u201D",
-    "real": u"\u211C",
-    "reg": u"\u00AE",
+    "REG;": u"\u00AE",
+    "REG": u"\u00AE",
-    "rfloor": u"\u230B",
-    "rho": u"\u03C1",
-    "rlm": u"\u200F",
-    "rsaquo": u"\u203A",
-    "rsquo": u"\u2019",
-    "sbquo": u"\u201A",
-    "scaron": u"\u0161",
-    "sdot": u"\u22C5",
+    "Rho;": u"\u03A1",
+    "Scaron;": u"\u0160",
+    "Sigma;": u"\u03A3",
+    "THORN;": u"\u00DE",
+    "THORN": u"\u00DE",
+    "TRADE;": u"\u2122",
+    "Tau;": u"\u03A4",
+    "Theta;": u"\u0398",
+    "Uacute;": u"\u00DA",
+    "Uacute": u"\u00DA",
+    "Ucirc;": u"\u00DB",
+    "Ucirc": u"\u00DB",
+    "Ugrave;": u"\u00D9",
+    "Ugrave": u"\u00D9",
+    "Upsilon;": u"\u03A5",
+    "Uuml;": u"\u00DC",
+    "Uuml": u"\u00DC",
+    "Xi;": u"\u039E",
+    "Yacute;": u"\u00DD",
+    "Yacute": u"\u00DD",
+    "Yuml;": u"\u0178",
+    "Zeta;": u"\u0396",
+    "aacute;": u"\u00E1",
+    "aacute": u"\u00E1",
+    "acirc;": u"\u00E2",
+    "acirc": u"\u00E2",
+    "acute;": u"\u00B4",
+    "acute": u"\u00B4",
+    "aelig;": u"\u00E6",
+    "aelig": u"\u00E6",
+    "agrave;": u"\u00E0",
+    "agrave": u"\u00E0",
+    "alefsym;": u"\u2135",
+    "alpha;": u"\u03B1",
+    "amp;": u"\u0026",
+    "amp": u"\u0026",
+    "and;": u"\u2227",
+    "ang;": u"\u2220",
+    "apos;": u"\u0027",
+    "aring;": u"\u00E5",
+    "aring": u"\u00E5",
+    "asymp;": u"\u2248",
+    "atilde;": u"\u00E3",
+    "atilde": u"\u00E3",
+    "auml;": u"\u00E4",
+    "auml": u"\u00E4",
+    "bdquo;": u"\u201E",
+    "beta;": u"\u03B2",
+    "brvbar;": u"\u00A6",
+    "brvbar": u"\u00A6",
+    "bull;": u"\u2022",
+    "cap;": u"\u2229",
+    "ccedil;": u"\u00E7",
+    "ccedil": u"\u00E7",
+    "cedil;": u"\u00B8",
+    "cedil": u"\u00B8",
+    "cent;": u"\u00A2",
+    "cent": u"\u00A2",
+    "chi;": u"\u03C7",
+    "circ;": u"\u02C6",
+    "clubs;": u"\u2663",
+    "cong;": u"\u2245",
+    "copy;": u"\u00A9",
+    "copy": u"\u00A9",
+    "crarr;": u"\u21B5",
+    "cup;": u"\u222A",
+    "curren;": u"\u00A4",
+    "curren": u"\u00A4",
+    "dArr;": u"\u21D3",
+    "dagger;": u"\u2020",
+    "darr;": u"\u2193",
+    "deg;": u"\u00B0",
+    "deg": u"\u00B0",
+    "delta;": u"\u03B4",
+    "diams;": u"\u2666",
+    "divide;": u"\u00F7",
+    "divide": u"\u00F7",
+    "eacute;": u"\u00E9",
+    "eacute": u"\u00E9",
+    "ecirc;": u"\u00EA",
+    "ecirc": u"\u00EA",
+    "egrave;": u"\u00E8",
+    "egrave": u"\u00E8",
+    "empty;": u"\u2205",
+    "emsp;": u"\u2003",
+    "ensp;": u"\u2002",
+    "epsilon;": u"\u03B5",
+    "equiv;": u"\u2261",
+    "eta;": u"\u03B7",
+    "eth;": u"\u00F0",
+    "eth": u"\u00F0",
+    "euml;": u"\u00EB",
+    "euml": u"\u00EB",
+    "euro;": u"\u20AC",
+    "exist;": u"\u2203",
+    "fnof;": u"\u0192",
+    "forall;": u"\u2200",
+    "frac12;": u"\u00BD",
+    "frac12": u"\u00BD",
+    "frac14;": u"\u00BC",
+    "frac14": u"\u00BC",
+    "frac34;": u"\u00BE",
+    "frac34": u"\u00BE",
+    "frasl;": u"\u2044",
+    "gamma;": u"\u03B3",
+    "ge;": u"\u2265",
+    "gt;": u"\u003E",
+    "gt": u"\u003E",
+    "hArr;": u"\u21D4",
+    "harr;": u"\u2194",
+    "hearts;": u"\u2665",
+    "hellip;": u"\u2026",
+    "iacute;": u"\u00ED",
+    "iacute": u"\u00ED",
+    "icirc;": u"\u00EE",
+    "icirc": u"\u00EE",
+    "iexcl;": u"\u00A1",
+    "iexcl": u"\u00A1",
+    "igrave;": u"\u00EC",
+    "igrave": u"\u00EC",
+    "image;": u"\u2111",
+    "infin;": u"\u221E",
+    "int;": u"\u222B",
+    "iota;": u"\u03B9",
+    "iquest;": u"\u00BF",
+    "iquest": u"\u00BF",
+    "isin;": u"\u2208",
+    "iuml;": u"\u00EF",
+    "iuml": u"\u00EF",
+    "kappa;": u"\u03BA",
+    "lArr;": u"\u21D0",
+    "lambda;": u"\u03BB",
+    "lang;": u"\u3008",
+    "laquo;": u"\u00AB",
+    "laquo": u"\u00AB",
+    "larr;": u"\u2190",
+    "lceil;": u"\u2308",
+    "ldquo;": u"\u201C",
+    "le;": u"\u2264",
+    "lfloor;": u"\u230A",
+    "lowast;": u"\u2217",
+    "loz;": u"\u25CA",
+    "lrm;": u"\u200E",
+    "lsaquo;": u"\u2039",
+    "lsquo;": u"\u2018",
+    "lt;": u"\u003C",
+    "lt": u"\u003C",
+    "macr;": u"\u00AF",
+    "macr": u"\u00AF",
+    "mdash;": u"\u2014",
+    "micro;": u"\u00B5",
+    "micro": u"\u00B5",
+    "middot;": u"\u00B7",
+    "middot": u"\u00B7",
+    "minus;": u"\u2212",
+    "mu;": u"\u03BC",
+    "nabla;": u"\u2207",
+    "nbsp;": u"\u00A0",
+    "nbsp": u"\u00A0",
+    "ndash;": u"\u2013",
+    "ne;": u"\u2260",
+    "ni;": u"\u220B",
+    "not;": u"\u00AC",
+    "not": u"\u00AC",
+    "notin;": u"\u2209",
+    "nsub;": u"\u2284",
+    "ntilde;": u"\u00F1",
+    "ntilde": u"\u00F1",
+    "nu;": u"\u03BD",
+    "oacute;": u"\u00F3",
+    "oacute": u"\u00F3",
+    "ocirc;": u"\u00F4",
+    "ocirc": u"\u00F4",
+    "oelig;": u"\u0153",
+    "ograve;": u"\u00F2",
+    "ograve": u"\u00F2",
+    "oline;": u"\u203E",
+    "omega;": u"\u03C9",
+    "omicron;": u"\u03BF",
+    "oplus;": u"\u2295",
+    "or;": u"\u2228",
+    "ordf;": u"\u00AA",
+    "ordf": u"\u00AA",
+    "ordm;": u"\u00BA",
+    "ordm": u"\u00BA",
+    "oslash;": u"\u00F8",
+    "oslash": u"\u00F8",
+    "otilde;": u"\u00F5",
+    "otilde": u"\u00F5",
+    "otimes;": u"\u2297",
+    "ouml;": u"\u00F6",
+    "ouml": u"\u00F6",
+    "para;": u"\u00B6",
+    "para": u"\u00B6",
+    "part;": u"\u2202",
+    "permil;": u"\u2030",
+    "perp;": u"\u22A5",
+    "phi;": u"\u03C6",
+    "pi;": u"\u03C0",
+    "piv;": u"\u03D6",
+    "plusmn;": u"\u00B1",
+    "plusmn": u"\u00B1",
+    "pound;": u"\u00A3",
+    "pound": u"\u00A3",
+    "prime;": u"\u2032",
+    "prod;": u"\u220F",
+    "prop;": u"\u221D",
+    "psi;": u"\u03C8",
+    "quot;": u"\u0022",
+    "quot": u"\u0022",
+    "rArr;": u"\u21D2",
+    "radic;": u"\u221A",
+    "rang;": u"\u3009",
+    "raquo;": u"\u00BB",
+    "raquo": u"\u00BB",
+    "rarr;": u"\u2192",
+    "rceil;": u"\u2309",
+    "rdquo;": u"\u201D",
+    "real;": u"\u211C",
+    "reg;": u"\u00AE",
+    "reg": u"\u00AE",
+    "rfloor;": u"\u230B",
+    "rho;": u"\u03C1",
+    "rlm;": u"\u200F",
+    "rsaquo;": u"\u203A",
+    "rsquo;": u"\u2019",
+    "sbquo;": u"\u201A",
+    "scaron;": u"\u0161",
+    "sdot;": u"\u22C5",
+    "sect;": u"\u00A7",
+    "sect": u"\u00A7",
+    "shy;": u"\u00AD",
+    "shy": u"\u00AD",
-    "sigma": u"\u03C3",
-    "sigmaf": u"\u03C2",
-    "sim": u"\u223C",
-    "spades": u"\u2660",
-    "sub": u"\u2282",
-    "sube": u"\u2286",
-    "sum": u"\u2211",
-    "sup": u"\u2283",
+    "sigma;": u"\u03C3",
|
||||
"sigmaf;": u"\u03C2",
|
||||
"sim;": u"\u223C",
|
||||
"spades;": u"\u2660",
|
||||
"sub;": u"\u2282",
|
||||
"sube;": u"\u2286",
|
||||
"sum;": u"\u2211",
|
||||
"sup1;": u"\u00B9",
|
||||
"sup1": u"\u00B9",
|
||||
"sup2;": u"\u00B2",
|
||||
"sup2": u"\u00B2",
|
||||
"sup3;": u"\u00B3",
|
||||
"sup3": u"\u00B3",
|
||||
"supe": u"\u2287",
|
||||
"sup;": u"\u2283",
|
||||
"supe;": u"\u2287",
|
||||
"szlig;": u"\u00DF",
|
||||
"szlig": u"\u00DF",
|
||||
"tau": u"\u03C4",
|
||||
"there4": u"\u2234",
|
||||
"theta": u"\u03B8",
|
||||
"thetasym": u"\u03D1",
|
||||
"thinsp": u"\u2009",
|
||||
"tau;": u"\u03C4",
|
||||
"there4;": u"\u2234",
|
||||
"theta;": u"\u03B8",
|
||||
"thetasym;": u"\u03D1",
|
||||
"thinsp;": u"\u2009",
|
||||
"thorn;": u"\u00FE",
|
||||
"thorn": u"\u00FE",
|
||||
"tilde": u"\u02DC",
|
||||
"tilde;": u"\u02DC",
|
||||
"times;": u"\u00D7",
|
||||
"times": u"\u00D7",
|
||||
"trade": u"\u2122",
|
||||
"uArr": u"\u21D1",
|
||||
"trade;": u"\u2122",
|
||||
"uArr;": u"\u21D1",
|
||||
"uacute;": u"\u00FA",
|
||||
"uacute": u"\u00FA",
|
||||
"uarr": u"\u2191",
|
||||
"uarr;": u"\u2191",
|
||||
"ucirc;": u"\u00FB",
|
||||
"ucirc": u"\u00FB",
|
||||
"ugrave;": u"\u00F9",
|
||||
"ugrave": u"\u00F9",
|
||||
"uml;": u"\u00A8",
|
||||
"uml": u"\u00A8",
|
||||
"upsih": u"\u03D2",
|
||||
"upsilon": u"\u03C5",
|
||||
"upsih;": u"\u03D2",
|
||||
"upsilon;": u"\u03C5",
|
||||
"uuml;": u"\u00FC",
|
||||
"uuml": u"\u00FC",
|
||||
"weierp": u"\u2118",
|
||||
"xi": u"\u03BE",
|
||||
"weierp;": u"\u2118",
|
||||
"xi;": u"\u03BE",
|
||||
"yacute;": u"\u00FD",
|
||||
"yacute": u"\u00FD",
|
||||
"yen;": u"\u00A5",
|
||||
"yen": u"\u00A5",
|
||||
"yuml;": u"\u00FF",
|
||||
"yuml": u"\u00FF",
|
||||
"zeta": u"\u03B6",
|
||||
"zwj": u"\u200D",
|
||||
"zwnj": u"\u200C"
|
||||
"zeta;": u"\u03B6",
|
||||
"zwj;": u"\u200D",
|
||||
"zwnj;": u"\u200C"
|
||||
}
|
||||
|
||||
encodings = frozenset((
|
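The table deliberately carries each legacy entity twice, once with and once without the trailing semicolon, so bare references such as &copy (still common in feed content) resolve too. A minimal sketch of a lookup against this table; the resolve_entity helper is hypothetical, not part of the module:

    from html5lib.constants import entities

    def resolve_entity(name):
        # Prefer the well-formed semicolon form, fall back to the
        # bare legacy form, and give up on unknown references.
        for key in (name + ";", name):
            if key in entities:
                return entities[key]
        return None

    assert resolve_entity("copy") == u"\u00A9"   # bare legacy form works
    assert resolve_entity("rArr") == u"\u21D2"   # semicolon form works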
0 planet/vendor/html5lib/filters/__init__.py vendored Normal file
10 planet/vendor/html5lib/filters/_base.py vendored Normal file
@ -0,0 +1,10 @@

class Filter(object):
    def __init__(self, source):
        self.source = source

    def __iter__(self):
        return iter(self.source)

    def __getattr__(self, name):
        return getattr(self.source, name)
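_base.Filter is the entire filter contract: wrap any iterable of token dicts, override __iter__, and delegate everything else to the source, so filters compose by simple nesting. A sketch with a hypothetical pass-through filter (token dicts abbreviated; assumes the vendored package layout):

    from html5lib.filters import _base

    class UppercaseNames(_base.Filter):
        # Illustrative only: upper-cases tag names as tokens stream through.
        def __iter__(self):
            for token in _base.Filter.__iter__(self):
                if token["type"] in ("StartTag", "EndTag", "EmptyTag"):
                    token["name"] = token["name"].upper()
                yield token

    tokens = [{"type": "StartTag", "name": u"p", "data": []},
              {"type": "Characters", "data": u"hi"},
              {"type": "EndTag", "name": u"p", "data": []}]
    for token in UppercaseNames(iter(tokens)):
        print token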
63 planet/vendor/html5lib/filters/inject_meta_charset.py vendored Normal file
@ -0,0 +1,63 @@
import _base

class Filter(_base.Filter):
    def __init__(self, source, encoding):
        _base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
        state = "pre_head"
        meta_found = (self.encoding is None)
        pending = []

        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
                    content_index = -1
                    for i,(name,value) in enumerate(token["data"]):
                        if name.lower() == 'charset':
                            token["data"][i] = (u'charset', self.encoding)
                            meta_found = True
                            break
                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                        elif name == 'content':
                            content_index = i
                    else:
                        if has_http_equiv_content_type and content_index >= 0:
                            token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
                            meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": [["charset", self.encoding]]}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": [["charset", self.encoding]]}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
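The serializer wraps a treewalker's token stream with this filter whenever it is given an output encoding, so the declared charset always matches the bytes actually emitted. A hand-built demonstration of the head case (assuming the vendored package layout):

    from html5lib.filters.inject_meta_charset import Filter

    tokens = [{"type": "StartTag", "name": u"head", "data": []},
              {"type": "EndTag", "name": u"head", "data": []}]
    for token in Filter(iter(tokens), "utf-8"):
        print token
    # an EmptyTag meta token carrying [["charset", "utf-8"]] is emitted
    # between the head start and end tags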
88 planet/vendor/html5lib/filters/lint.py vendored Normal file
@ -0,0 +1,88 @@
from gettext import gettext
_ = gettext

import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements

from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

class LintError(Exception): pass

class Filter(_base.Filter):
    def __iter__(self):
        open_elements = []
        contentModelFlag = "PCDATA"
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if type == "StartTag" and name in voidElements:
                    raise LintError(_(u"Void element reported as StartTag token: %s") % name)
                elif type == "EmptyTag" and name not in voidElements:
                    raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
                if type == "StartTag":
                    open_elements.append(name)
                for name, value in token["data"]:
                    if not isinstance(name, unicode):
                        raise LintError(_("Attribute name is not a string: %r") % name)
                    if not name:
                        raise LintError(_(u"Empty attribute name"))
                    if not isinstance(value, unicode):
                        raise LintError(_("Attribute value is not a string: %r") % value)
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
                    contentModelFlag = "RCDATA"
                elif name == "plaintext":
                    contentModelFlag = "PLAINTEXT"

            elif type == "EndTag":
                name = token["name"]
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if name in voidElements:
                    raise LintError(_(u"Void element reported as EndTag token: %s") % name)
                start_name = open_elements.pop()
                if start_name != name:
                    raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
                contentModelFlag = "PCDATA"

            elif type == "Comment":
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Comment not in PCDATA content model flag"))

            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if not isinstance(data, unicode):
                    raise LintError(_("Attribute name is not a string: %r") % data)
                if not data:
                    raise LintError(_(u"%s token with empty data") % type)
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
                        raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)

            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                # XXX: what to do with token["data"] ?

            elif type in ("ParseError", "SerializeError"):
                pass

            else:
                raise LintError(_(u"Unknown token type: %s") % type)

            yield token
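The lint filter is transparent for a well-formed stream and raises on the first violated invariant, so it can be spliced anywhere into a pipeline while debugging. Sketch:

    from html5lib.filters import lint

    good = [{"type": "StartTag", "name": u"div", "data": []},
            {"type": "EndTag", "name": u"div", "data": []}]
    list(lint.Filter(iter(good)))        # passes through unchanged

    bad = [{"type": "EndTag", "name": u"br", "data": []}]
    try:
        list(lint.Filter(iter(bad)))
    except lint.LintError, e:
        print e                          # void element reported as EndTag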
175 planet/vendor/html5lib/filters/optionaltags.py vendored Normal file
@ -0,0 +1,175 @@
import _base

class Filter(_base.Filter):
    def slider(self):
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        yield previous2, previous1, None

    def __iter__(self):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
                if token["data"] or not self.is_optional_start(token["name"], previous, next):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        type = next and next["type"] or None
        if tagname in 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceeded by another colgroup element whose
            # end tag has been omitted.
            if type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceeded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if type == "StartTag":
                # omit the thead and tfoot elements' end tag when they are
                # immediately followed by a tbody element. See is_optional_end.
                if previous and previous['type'] == 'EndTag' and \
                  previous['name'] in ('tbody','thead','tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        type = next and next["type"] or None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] == tagname
            else:
                return type == "EndTag" or type is None
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
                return next["name"] in ('address', 'blockquote', \
                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if type in ("Comment", "SpaceCharacters"):
                return False
            elif type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] in ['tbody', 'tfoot']
            elif tagname == 'tbody':
                return type == "EndTag" or type is None
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # XXX: we never omit the end tag when the following element is
            # a tbody. See is_optional_start.
            if type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return type == "EndTag" or type is None
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return type == "EndTag" or type is None
        return False
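Run under the serializer, this filter is what shrinks <ul><li>one</li><li>two</li></ul> down to <ul><li>one<li>two</ul>: an end tag is dropped whenever the spec says the following token makes it redundant. A sketch over a raw token stream:

    from html5lib.filters import optionaltags

    tokens = [{"type": "StartTag", "name": "ul", "data": []},
              {"type": "StartTag", "name": "li", "data": []},
              {"type": "Characters", "data": u"one"},
              {"type": "EndTag", "name": "li", "data": []},
              {"type": "StartTag", "name": "li", "data": []},
              {"type": "Characters", "data": u"two"},
              {"type": "EndTag", "name": "li", "data": []},
              {"type": "EndTag", "name": "ul", "data": []}]
    out = list(optionaltags.Filter(iter(tokens)))
    # both li EndTag tokens are gone: the first is followed by another
    # li StartTag, the second by the parent element's EndTag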
41 planet/vendor/html5lib/filters/whitespace.py vendored Normal file
@ -0,0 +1,41 @@
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset

import re

import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)

class Filter(_base.Filter):

    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
              and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = u" "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token

def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
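Outside of pre, textarea, and the rcdata elements this gives the usual HTML whitespace model: any run of whitespace collapses to a single space. A quick check:

    from html5lib.filters import whitespace

    tokens = [{"type": "Characters", "data": u"a \n\t  b"}]
    print list(whitespace.Filter(iter(tokens)))[0]["data"]   # u"a b"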
File diff suppressed because it is too large
@ -2,6 +2,9 @@ import codecs
import re
import types

from gettext import gettext
_ = gettext

from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from utils import MethodDispatcher
@ -31,37 +34,38 @@ class HTMLInputStream(object):

        """
        # List of where new lines occur
        self.newLines = []
        self.newLines = [0]

        # Raw Stream
        self.charEncoding = encoding

        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        # Encoding Information
        #Number of bytes to use when looking for a meta element with
        #encoding information
        self.numBytesMeta = 512
        #Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        #Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        #Detect encoding iff no explicit "transport level" encoding is supplied
        if encoding is None or not isValidEncoding(encoding):
            encoding = self.detectEncoding(parseMeta, chardet)
        self.charEncoding = encoding
        if self.charEncoding is None or not isValidEncoding(self.charEncoding):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Read bytes from stream decoding them into Unicode
        uString = self.rawStream.read().decode(self.charEncoding, 'replace')

        # Normalize newlines and null characters
        uString = re.sub('\r\n?', '\n', uString)
        uString = re.sub('\x00', u'\uFFFD', uString)

        # Convert the unicode string into a list to be used as the data stream
        self.dataStream = uString
        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
                                                              'replace')

        self.queue = []
        self.errors = []

        # Reset position in the list to read from
        self.reset()
        self.line = self.col = 0
        self.lineLengths = []

        #Flag to indicate we may have a CR LF broken across a data chunk
        self._lastChunkEndsWithCR = False

    def openStream(self, source):
        """Produces a file object from source.
@ -74,6 +78,9 @@ class HTMLInputStream(object):
            stream = source
        else:
            # Otherwise treat source as a string and convert to a file object
            if isinstance(source, unicode):
                source = source.encode('utf-8')
                self.charEncoding = "utf-8"
            import cStringIO
            stream = cStringIO.StringIO(str(source))
        return stream
@ -90,10 +97,18 @@ class HTMLInputStream(object):
        #Guess with chardet, if avaliable
        if encoding is None and chardet:
            try:
                import chardet
                buffer = self.rawStream.read()
                encoding = chardet.detect(buffer)['encoding']
                self.rawStream = self.openStream(buffer)
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.seek("".join(buffers), 0)
            except ImportError:
                pass
        # If all else fails use the default encoding
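The rewritten branch feeds chardet a bounded chunk at a time (numBytesChardet) and stops as soon as the detector reaches a verdict, instead of reading the whole stream into memory first. The same pattern in isolation (requires the chardet package):

    from chardet.universaldetector import UniversalDetector

    def sniff_encoding(fileobj, chunk_size=100):
        detector = UniversalDetector()
        while not detector.done:
            chunk = fileobj.read(chunk_size)
            if not chunk:
                break
            detector.feed(chunk)
        detector.close()
        return detector.result['encoding']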
@ -119,98 +134,197 @@ class HTMLInputStream(object):
        }

        # Go to beginning of file and read in 4 bytes
        self.rawStream.seek(0)
        string = self.rawStream.read(4)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])   # UTF-8
        encoding = bomDict.get(string[:3])   # UTF-8
        seek = 3
        if not encoding:
            encoding = bomDict.get(string[:2])   # UTF-16
            seek = 2
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)       # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string)       # UTF-32
                seek = 4
                encoding = bomDict.get(string[:2])   # UTF-16
                seek = 2

        #AT - move this to the caller?
        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)
        self.seek(string, encoding and seek or 0)

        return encoding

    def seek(self, buffer, n):
        """Unget buffer[n:]"""
        if hasattr(self.rawStream, 'unget'):
            self.rawStream.unget(buffer[n:])
            return

        if hasattr(self.rawStream, 'seek'):
            try:
                self.rawStream.seek(n)
                return
            except IOError:
                pass

        class BufferedStream:
            def __init__(self, data, stream):
                self.data = data
                self.stream = stream
            def read(self, chars=-1):
                if chars == -1 or chars > len(self.data):
                    result = self.data
                    self.data = ''
                    if chars == -1:
                        return result + self.stream.read()
                    else:
                        return result + self.stream.read(chars-len(result))
                elif not self.data:
                    return self.stream.read(chars)
                else:
                    result = self.data[:chars]
                    self.data = self.data[chars:]
                    return result
            def unget(self, data):
                if self.data:
                    self.data += data
                else:
                    self.data = data

        self.rawStream = BufferedStream(buffer[n:], self.rawStream)

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
        self.rawStream.seek(0)
        buffer = self.rawStream.read(self.numBytesMeta)
        parser = EncodingParser(buffer)
        self.seek(buffer, 0)
        return parser.getEncoding()

    def determineNewLines(self):
        # Looks through the stream to find where new lines occur so
        # the position method can tell where it is.
        self.newLines.append(0)
        for i in xrange(len(self.dataStream)):
            if self.dataStream[i] == u"\n":
                self.newLines.append(i)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        # Generate list of new lines first time around
        if not self.newLines:
            self.determineNewLines()
        line = 0
        tell = self.tell
        for pos in self.newLines:
            if pos < tell:
                line += 1
            else:
                break
        col = tell - self.newLines[line-1] - 1
        return (line, col)

    def reset(self):
        """Resets the position in the stream back to the start."""
        self.tell = 0
        line, col = self.line, self.col
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        if self.queue:
            return self.queue.pop(0)
        if not self.queue:
            self.readChunk()
        #If we still don't have a character we have reached EOF
        if not self.queue:
            return EOF

        char = self.queue.pop(0)

        # update position in stream
        if char == '\n':
            self.lineLengths.append(self.col)
            self.line += 1
            self.col = 0
        else:
            try:
                self.tell += 1
                return self.dataStream[self.tell - 1]
            except:
                return EOF
            self.col += 1
        return char

    def readChunk(self, chunkSize=10240):
        data = self.dataStream.read(chunkSize)
        if not data:
            return
        #Replace null characters
        for i in xrange(data.count(u"\u0000")):
            self.errors.append(_('null character found in input stream, '
                                 'replaced with U+FFFD'))
        data = data.replace(u"\u0000", u"\ufffd")
        #Check for CR LF broken across chunks
        if (self._lastChunkEndsWithCR and data[0] == "\n"):
            data = data[1:]
        self._lastChunkEndsWithCR = data[-1] == "\r"
        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        data = unicode(data)
        self.queue.extend([char for char in data])

    def charsUntil(self, characters, opposite = False):
        """ Returns a string of characters from the stream up to but not
            including any character in characters or EOF. characters can be
            any container that supports the in method being called on it.
        """
        charStack = [self.char()]

        # First from the queue
        while charStack[-1] and (charStack[-1] in characters) == opposite \
              and self.queue:
            charStack.append(self.queue.pop(0))
        #This method is currently 40-50% of our total runtime and badly needs
        #optimizing
        #Possible improvements:
        # - use regexp to find characters that match the required character set
        #   (with regexp cache since we do the same searches many many times)
        # - improve EOF handling for fewer if statements

        # Then the rest
        while charStack[-1] and (charStack[-1] in characters) == opposite:
            try:
                self.tell += 1
                charStack.append(self.dataStream[self.tell - 1])
            except:
                charStack.append(EOF)
        if not self.queue:
            self.readChunk()
        #Break if we have reached EOF
        if not self.queue or self.queue[0] == None:
            return u""

        i = 0
        while (self.queue[i] in characters) == opposite:
            i += 1
            if i == len(self.queue):
                self.readChunk()
            #If the queue doesn't grow we have reached EOF
            if i == len(self.queue) or self.queue[i] is EOF:
                break
        #XXX- wallpaper over bug in calculation below
        #Otherwise change the stream position
        if self.queue[i] == '\n':
            self.lineLengths.append(self.col)
            self.line += 1
            self.col = 0
        else:
            self.col += 1

        # Put the character stopped on back to the front of the queue
        # from where it came.
        self.queue.insert(0, charStack.pop())
        return "".join(charStack)
        rv = u"".join(self.queue[:i])
        self.queue = self.queue[i:]

        #Calculate where we now are in the stream
        #One possible optimisation would be to store all read characters and
        #Calculate this on an as-needed basis (perhaps flushing the read data
        #every time we read a new chunk) rather than once per call here and
        #in .char()

        #XXX Temporarily disable this because there is a bug

        #lines = rv.split("\n")
        #
        #if lines:
        #    #Add number of lines passed onto positon
        #    oldCol = self.col
        #    self.line += len(lines)-1
        #    if len(lines) > 1:
        #        self.col = len(lines[-1])
        #    else:
        #        self.col += len(lines[0])
        #
        #    if self.lineLengths and oldCol > 0:
        #        self.lineLengths[-1] += len(lines[0])
        #        lines = lines[1:-1]
        #    else:
        #        lines = lines[:-1]
        #
        #    for line in lines:
        #        self.lineLengths.append(len(line))
        #

        return rv

    def unget(self, chars):
        if chars:
            self.queue = list(chars) + self.queue
            #Alter the current line, col position
            for c in chars[::-1]:
                if c == '\n':
                    self.line -= 1
                    self.col = self.lineLengths[self.line]
                else:
                    self.col -= 1

class EncodingBytes(str):
    """String-like object with an assosiated position and various extra methods
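The new seek() exists because the raw source may not be seekable at all: bytes consumed while sniffing the BOM, running chardet, or scanning for a meta element are pushed back by wrapping the stream in BufferedStream, which replays its buffer before reading on. A standalone sketch of that push-back idea (PushbackStream is a simplified stand-in for the inline BufferedStream above):

    import cStringIO

    class PushbackStream:
        def __init__(self, data, stream):
            self.data, self.stream = data, stream
        def read(self, chars=-1):
            # Serve buffered bytes first, then fall through to the stream.
            result, self.data = self.data, ''
            if chars < 0:
                return result + self.stream.read()
            return result + self.stream.read(chars - len(result))

    raw = cStringIO.StringIO("<html><head>...")
    head = raw.read(6)                   # bytes consumed while sniffing
    stream = PushbackStream(head, raw)
    print stream.read(12)                # "<html><head>": sniffed bytes replayed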
@ -15,35 +15,36 @@ References:
"""

import html5parser
from constants import voidElements
import gettext
_ = gettext.gettext
from constants import voidElements, contentModelFlags

from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape

class XMLParser(html5parser.HTMLParser):
    """ liberal XML parser """

    def __init__(self, *args, **kwargs):
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["initial"] = XmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        if token["type"] == "StartTag" or token["type"] == "EmptyTag":
            # We need to remove the duplicate attributes and convert attributes
            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
        if token["type"] in ("StartTag", "EmptyTag"):
            token["data"] = dict(token["data"][::-1])

        # For EmptyTags, process both a Start and an End tag
        if token["type"] == "EmptyTag":
            self.phase.processStartTag(token["name"], token["data"])
            token["data"] = {}
            token["type"] = "EndTag"
        # For EmptyTags, process both a Start and an End tag
        if token["type"] == "EmptyTag":
            save = self.tokenizer.contentModelFlag
            self.phase.processStartTag(token["name"], token["data"])
            self.tokenizer.contentModelFlag = save
            token["data"] = {}
            token["type"] = "EndTag"

        elif token["type"] == "EndTag":
            if token["data"]:
                self.parseError(_("End tag contains unexpected attributes."))
        elif token["type"] == "Characters":
            # un-escape rcdataElements (e.g. style, script)
            if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
                token["data"] = unescape(token["data"])

        elif token["type"] == "Comment":
            # Rescue CDATA from the comments
@ -54,11 +55,19 @@ class XMLParser(html5parser.HTMLParser):

        return token

    def _parse(self, stream, innerHTML=False, container="div", encoding=None,
               **kwargs):

        html5parser.HTMLParser._parse(self, stream, innerHTML, container,
                                      encoding, lowercaseElementName=False,
                                      lowercaseAttrName=False)

class XHTMLParser(XMLParser):
    """ liberal XMTHML parser """

    def __init__(self, *args, **kwargs):
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["initial"] = XmlInitialPhase(self, self.tree)
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
@ -66,16 +75,21 @@ class XHTMLParser(XMLParser):

        # ensure that non-void XHTML elements have content so that separate
        # open and close tags are emitted
        if token["type"] == "EndTag" and \
           token["name"] not in voidElements and \
           token["name"] == self.tree.openElements[-1].name and \
           not self.tree.openElements[-1].hasContent():
            for e in self.tree.openElements:
                if 'xmlns' in e.attributes.keys():
                    if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
                        break
        if token["type"] == "EndTag":
            if token["name"] in voidElements:
                if not self.tree.openElements or \
                   self.tree.openElements[-1].name != token["name"]:
                    token["type"] = "EmptyTag"
                    if not token.has_key("data"): token["data"] = {}
            else:
                self.tree.insertText('')
                if token["name"] == self.tree.openElements[-1].name and \
                   not self.tree.openElements[-1].hasContent():
                    for e in self.tree.openElements:
                        if 'xmlns' in e.attributes.keys():
                            if e.attributes['xmlns'] != XHTML_NAMESPACE:
                                break
                    else:
                        self.tree.insertText('')

        return token

@ -86,7 +100,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
        self.tree.document.appendChild(element)
        self.parser.phase = self.parser.phases["beforeHead"]

class XmlInitialPhase(html5parser.InitialPhase):
    """ Consume XML Prologs """
    def processComment(self, data):
        if not data.startswith('?xml') or not data.endswith('?'):
            html5parser.InitialPhase.processComment(self, data)

class XmlRootPhase(html5parser.Phase):
    """ Consume XML Prologs """
    def processComment(self, data):
        print repr(data)
        if not data.startswith('?xml') or not data.endswith('?'):
            html5parser.InitialPhase.processComment(self, data)

    """ Prime the Xml parser """
    def __getattr__(self, name):
        self.tree.openElements.append(self.tree.document)
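Planet feeds ill-formed entry content through these liberal parsers, so the XHTML subclass now downgrades an end tag on a void element to an EmptyTag when the open-element stack does not line up, rather than assuming well-formed input. A hedged usage sketch; the tree=treebuilders.getTreeBuilder("dom") call follows the html5lib convention of this era, so treat the exact invocation as an assumption:

    from html5lib import liberalxmlparser, treebuilders

    parser = liberalxmlparser.XHTMLParser(
        tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse('<div>a broken<br>fragment</div>')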
202 planet/vendor/html5lib/sanitizer.py vendored Normal file
@ -0,0 +1,202 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer

class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
        'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
        'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
        'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
        'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
        'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
        'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
        'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
        'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
        'origin', 'overline-position', 'overline-thickness', 'panose-1',
        'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
        'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
        'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
        'stemh', 'stemv', 'stop-color', 'stop-opacity',
        'strikethrough-position', 'strikethrough-thickness', 'stroke',
        'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
        'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
        'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']

    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
        'xlink:href', 'xml:base']

    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
        'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
        'ssh', 'sftp', 'rtsp', 'afs' ]

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_elements:
                if token.has_key("data"):
                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
                    for attr in self.attr_val_is_uri:
                        if not attrs.has_key(attr): continue
                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
                            del attrs[attr]
                    if attrs.has_key('style'):
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token["data"] = [[name,val] for name,val in attrs.items()]
                return token
            else:
                if token["type"] == "EndTag":
                    token["data"] = "</%s>" % token["name"]
                elif token["data"]:
                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                    token["data"] = "<%s%s>" % (token["name"],attrs)
                else:
                    token["data"] = "<%s>" % token["name"]
                if token["type"] == "EmptyTag":
                    token["data"]=token["data"][:-1] + "/>"
                token["type"] = "Characters"

                del token["name"]
                return token
        elif token["type"] == "Comment":
            pass
        else:
            return token

    def sanitize_css(self, style):
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value: continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                      not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True,
                 lowercaseElementName=False, lowercaseAttrName=False):
        #Change case matching defaults as we only output lowercase html anyway
        #This solution doesn't seem ideal...
        HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
                               lowercaseElementName, lowercaseAttrName)

    def __iter__(self):
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
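Because the mixin is bolted directly onto the tokenizer, sanitization happens on the token stream before any tree is built, and disallowed elements are re-emitted as escaped Characters tokens rather than silently eaten. A quick sketch:

    from html5lib.sanitizer import HTMLSanitizer

    tokens = list(HTMLSanitizer('<a href="javascript:evil()">hi</a>'))
    # the href attribute is dropped (its scheme is not in
    # allowed_protocols), while the a element itself survives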
3 planet/vendor/html5lib/serializer/__init__.py vendored Normal file
@ -0,0 +1,3 @@

from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
218 planet/vendor/html5lib/serializer/htmlserializer.py vendored Normal file
@ -0,0 +1,218 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements
|
||||
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
try:
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
except ImportError:
|
||||
unicode_encode_errors = "strict"
|
||||
else:
|
||||
unicode_encode_errors = "htmlentityreplace"
|
||||
|
||||
from html5lib.constants import entities
|
||||
|
||||
encode_entity_map = {}
|
||||
for k, v in entities.items():
|
||||
if v != "&" and encode_entity_map.get(v) != k.lower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
encode_entity_map[v] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
for c in exc.object[exc.start:exc.end]:
|
||||
e = encode_entity_map.get(c)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
||||
return (u"".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
|
||||
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||
|
||||
del register_error
|
||||
|
||||
def encode(text, encoding):
|
||||
return text.encode(encoding, unicode_encode_errors)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
quote_attr_values = False
|
||||
quote_char = '"'
|
||||
use_best_quote_char = True
|
||||
minimize_boolean_attributes = True
|
||||
|
||||
use_trailing_solidus = False
|
||||
space_before_trailing_solidus = True
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
|
||||
inject_meta_charset = True
|
||||
strip_whitespace = False
|
||||
sanitize = False
|
||||
omit_optional_tags = True
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||
"space_before_trailing_solidus", "omit_optional_tags",
|
||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||
"escape_rcdata", 'use_trailing_solidus', "sanitize")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs.has_key('quote_char'):
|
||||
self.use_best_quote_char = False
|
||||
for attr in self.options:
|
||||
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
|
||||
self.errors = []
|
||||
self.strict = False
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
if encoding and self.inject_meta_charset:
|
||||
from html5lib.filters.inject_meta_charset import Filter
|
||||
treewalker = Filter(treewalker, encoding)
|
||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiently of this latter filter
|
||||
if self.strip_whitespace:
|
||||
from html5lib.filters.whitespace import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.sanitize:
|
||||
from html5lib.filters.sanitizer import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
from html5lib.filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
for token in treewalker:
|
||||
            type = token["type"]
            if type == "Doctype":
                doctype = u"<!DOCTYPE %s>" % token["name"]
                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attrs.sort()
                attributes = []
                for k,v in attrs:
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple()) \
                      and k not in booleanAttributes.get("", tuple())):
                        attributes.append("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x,y: x or (y in v),
                                spaceCharacters + "<>\"'", False)
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                end_tag = u"</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = u"<!--%s-->" % token["data"]
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        if encoding:
            return "".join(list(self.serialize(treewalker, encoding)))
        else:
            return u"".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError

class SerializeError(Exception):
    """Error in serialized tree"""
    pass

9  planet/vendor/html5lib/serializer/xhtmlserializer.py  (vendored, new file)
@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer

class XHTMLSerializer(HTMLSerializer):
    quote_attr_values = True
    minimize_boolean_attributes = False
    use_trailing_solidus = True
    escape_lt_in_attrs = True
    omit_optional_tags = False
    escape_rcdata = True
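
For orientation, a minimal usage sketch of the serializer pair above (hedged: the HTMLParser and treewalker entry points are assumed from the html5lib layout of this snapshot and are not shown in this diff):

    # Sketch only: parse a fragment, then re-serialize it as XHTML.
    import html5lib
    from html5lib import treewalkers
    from html5lib.serializer.xhtmlserializer import XHTMLSerializer

    doc = html5lib.HTMLParser().parse("<p class=x>Hello<br>world")
    walker = treewalkers.getTreeWalker("simpletree")
    # render() above joins the serialize() generator into one string
    print XHTMLSerializer().render(walker(doc))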
@ -9,7 +9,7 @@ _ = gettext.gettext

from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF

from inputstream import HTMLInputStream

@ -32,9 +32,14 @@ class HTMLTokenizer(object):

    # XXX need to fix documentation

    def __init__(self, stream, encoding=None, parseMeta=True):
    def __init__(self, stream, encoding=None, parseMeta=True,
                 lowercaseElementName=True, lowercaseAttrName=True,):
        self.stream = HTMLInputStream(stream, encoding, parseMeta)

        #Perform case conversions?
        self.lowercaseElementName = lowercaseElementName
        self.lowercaseAttrName = lowercaseAttrName

        self.states = {
            "data":self.dataState,
            "entityData":self.entityDataState,
@ -50,18 +55,30 @@ class HTMLTokenizer(object):
            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
            "bogusComment":self.bogusCommentState,
            "markupDeclarationOpen":self.markupDeclarationOpenState,
            "commentStart":self.commentStartState,
            "commentStartDash":self.commentStartDashState,
            "comment":self.commentState,
            "commentDash":self.commentDashState,
            "commentEndDash":self.commentEndDashState,
            "commentEnd":self.commentEndState,
            "doctype":self.doctypeState,
            "beforeDoctypeName":self.beforeDoctypeNameState,
            "doctypeName":self.doctypeNameState,
            "afterDoctypeName":self.afterDoctypeNameState,
            "beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
            "doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
            "doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
            "afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
            "beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
            "doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
            "doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
            "afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
            "bogusDoctype":self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.states["data"]

        # The current token being created
@ -77,11 +94,12 @@ class HTMLTokenizer(object):
        to return we yield the token which pauses processing until the next token
        is requested.
        """
        self.stream.reset()
        self.tokenQueue = []
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)

@ -98,11 +116,11 @@ class HTMLTokenizer(object):
            self.currentToken["type"] = "EmptyTag"
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Solidus (/) incorrectly placed in tag.")})
              _(u"Solidus (/) incorrectly placed in tag.")})

        # The character we just consumed need to be put back on the stack so it
        # doesn't get lost...
        self.stream.queue.append(data)
        self.stream.unget(data)

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
@ -119,7 +137,6 @@ class HTMLTokenizer(object):
            allowed = hexDigits
            radix = 16

        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range while making sure we
@ -132,70 +149,75 @@ class HTMLTokenizer(object):
        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # If the integer is between 127 and 160 (so 128 and bigger and 159 and
        # smaller) we need to do the "windows trick".
        if 127 < charAsInt < 160:
            #XXX - removed parse error from windows 1252 entity for now
            #we may want to reenable this later
            #self.tokenQueue.append({"type": "ParseError", "data":
            #  _("Entity used with illegal number (windows-1252 reference).")})
        if charAsInt == 13:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Incorrect CR newline entity. Replaced with LF.")})
            charAsInt = 10
        elif 127 < charAsInt < 160:
            # If the integer is between 127 and 160 (so 128 and bigger and 159
            # and smaller) we need to do the "windows trick".
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Entity used with illegal number (windows-1252 reference).")})

            charAsInt = entitiesWindows1252[charAsInt - 128]

        # 0 is not a good number.
        if charAsInt == 0:
            charAsInt = 65533

        try:
            # XXX We should have a separate function that does "int" to
            # "unicodestring" conversion since this doesn't always work
            # according to hsivonen. Also, unichr has a limitation of 65535
            char = unichr(charAsInt)
        except:
        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
            try:
                # XXX We should have a separate function that does "int" to
                # "unicodestring" conversion since this doesn't always work
                # according to hsivonen. Also, unichr has a limitation of 65535
                char = unichr(charAsInt)
            except:
                try:
                    char = eval("u'\\U%08x'" % charAsInt)
                except:
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
        else:
            char = u"\uFFFD"
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity couldn't be converted to character.")})
              _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != u";":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity didn't end with ';'.")})
            self.stream.queue.append(c)
              _(u"Numeric entity didn't end with ';'.")})
            self.stream.unget(c)

        return char

    def consumeEntity(self):
    def consumeEntity(self, fromAttribute=False):
        char = None
        charStack = [self.stream.char()]
        if charStack[0] == u"#":
        if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
            self.stream.unget(charStack)
        elif charStack[0] == u"#":
            # We might have a number entity here.
            charStack.extend([self.stream.char(), self.stream.char()])
            if EOF in charStack:
            if EOF in charStack[:2]:
                # If we reach the end of the file put everything up to EOF
                # back in the queue
                charStack = charStack[:charStack.index(EOF)]
                self.stream.queue.extend(charStack)
                self.stream.unget(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected. Got end of file instead.")})
                  _(u"Numeric entity expected. Got end of file instead.")})
            else:
                if charStack[1].lower() == u"x" \
                  and charStack[2] in hexDigits:
                    # Hexadecimal entity detected.
                    self.stream.queue.append(charStack[2])
                    self.stream.unget(charStack[2])
                    char = self.consumeNumberEntity(True)
                elif charStack[1] in digits:
                    # Decimal entity detected.
                    self.stream.queue.extend(charStack[1:])
                    self.stream.unget(charStack[1:])
                    char = self.consumeNumberEntity(False)
                else:
                    # No number entity detected.
                    self.stream.queue.extend(charStack)
                    self.stream.unget(charStack)
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Numeric entity expected but none found.")})
        # Break out if we reach the end of the file
        elif charStack[0] == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Entity expected. Got end of file instead.")})
                      _(u"Numeric entity expected but none found.")})
        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
@ -216,7 +238,8 @@ class HTMLTokenizer(object):
            # that may match an entity
            entityName = None

            # Try to find the longest entity the string will match
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            for entityLength in xrange(len(charStack)-1,1,-1):
                possibleEntityName = "".join(charStack[:entityLength])
                if possibleEntityName in entities:
@ -224,24 +247,26 @@ class HTMLTokenizer(object):
                    break

            if entityName is not None:
                char = entities[entityName]

                # Check whether or not the last character returned can be
                # discarded or needs to be put back.
                if not charStack[-1] == ";":
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Named entity didn't end with ';'.")})
                    self.stream.queue.extend(charStack[entityLength:])
                      _(u"Named entity didn't end with ';'.")})
                if entityName[-1] != ";" and fromAttribute and \
                  (charStack[entityLength] in asciiLetters
                  or charStack[entityLength] in digits):
                    self.stream.unget(charStack)
                else:
                    char = entities[entityName]
                    self.stream.unget(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity expected. Got none.")})
                self.stream.queue.extend(charStack)
                  _(u"Named entity expected. Got none.")})
                self.stream.unget(charStack)
        return char

    def processEntityInAttribute(self):
        """This method replaces the need for "entityInAttributeValueState".
        """
        entity = self.consumeEntity()
        entity = self.consumeEntity(True)
        if entity:
            self.currentToken["data"][-1][1] += entity
        else:
@ -252,9 +277,15 @@ class HTMLTokenizer(object):
        the state to "data" because that's what's needed after a token has been
        emitted.
        """

        token = self.currentToken
        # Add token to the queue to be yielded
        self.tokenQueue.append(self.currentToken)
        if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
            if self.lowercaseElementName:
                token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == "EndTag" and token["data"]:
                self.tokenQueue.append({"type":"ParseError",
                  "data":_(u"End tag contains unexpected attributes.")})
        self.tokenQueue.append(token)
        self.state = self.states["data"]

@ -266,12 +297,34 @@ class HTMLTokenizer(object):

    def dataState(self):
        data = self.stream.char()
        if data == u"&" and self.contentModelFlag in\
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):

        # Keep a charbuffer to handle the escapeFlag
        if self.contentModelFlag in\
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
            if len(self.lastFourChars) == 4:
                self.lastFourChars.pop(0)
            self.lastFourChars.append(data)

        # The rest of the logic
        if data == "&" and self.contentModelFlag in\
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
          self.escapeFlag:
            self.state = self.states["entityData"]
        elif data == u"<" and self.contentModelFlag !=\
          contentModelFlags["PLAINTEXT"]:
        elif data == "-" and self.contentModelFlag in\
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
          self.escapeFlag and "".join(self.lastFourChars) == "<!--":
            self.escapeFlag = True
            self.tokenQueue.append({"type": "Characters", "data":data})
        elif data == "<" and (self.contentModelFlag ==\
          contentModelFlags["PCDATA"] or (self.contentModelFlag in
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
          self.escapeFlag == False)):
            self.state = self.states["tagOpen"]
        elif data == ">" and self.contentModelFlag in\
          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
          self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
            self.escapeFlag = False
            self.tokenQueue.append({"type": "Characters", "data":data})
        elif data == EOF:
            # Tokenization ends.
            return False
@ -279,13 +332,11 @@ class HTMLTokenizer(object):
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            # XXX need to check if we don't need a special "spaces" flag on
            # characters.
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
              data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            self.tokenQueue.append({"type": "Characters", "data":
              data + self.stream.charsUntil((u"&", u"<"))})
              data + self.stream.charsUntil(("&", "<", ">", "-"))})
        return True

    def entityDataState(self):
@ -312,23 +363,23 @@ class HTMLTokenizer(object):
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '>' instead.")})
                  _(u"Expected tag name. Got '>' instead.")})
                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
                self.state = self.states["data"]
            elif data == u"?":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '?' instead (HTML doesn't "
                  _(u"Expected tag name. Got '?' instead (HTML doesn't "
                  "support processing instructions).")})
                self.stream.queue.append(data)
                self.stream.unget(data)
                self.state = self.states["bogusComment"]
            else:
                # XXX
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got something else instead")})
                  _(u"Expected tag name. Got something else instead")})
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.append(data)
                self.stream.unget(data)
                self.state = self.states["data"]
        else:
            # We know the content model flag is set to either RCDATA or CDATA
@ -338,7 +389,7 @@ class HTMLTokenizer(object):
                self.state = self.states["closeTagOpen"]
            else:
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.insert(0, data)
                self.stream.unget(data)
                self.state = self.states["data"]
        return True

@ -361,7 +412,7 @@ class HTMLTokenizer(object):

            # Since this is just for checking. We put the characters back on
            # the stack.
            self.stream.queue.extend(charStack)
            self.stream.unget(charStack)

            if self.currentToken \
              and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
@ -372,8 +423,6 @@ class HTMLTokenizer(object):
                # emitting the end tag token.
                self.contentModelFlag = contentModelFlags["PCDATA"]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag after seeing '</'. None found.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]

@ -381,27 +430,25 @@ class HTMLTokenizer(object):
            # method to be walked through.
            return True

        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            data = self.stream.char()
            if data in asciiLetters:
                self.currentToken =\
                  {"type": "EndTag", "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
                self.state = self.states["data"]
            elif data == EOF:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected end of file.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]
            else:
                # XXX data can be _'_...
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected character '" + data + "' found.")})
                self.stream.queue.append(data)
                self.state = self.states["bogusComment"]
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type":"EndTag", "name":data, "data":[]}
            self.state = self.states["tagName"]
        elif data == u">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Expected closing tag. Unexpected end of file.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
            self.stream.unget(data)
            self.state = self.states["bogusComment"]
        return True

    def tagNameState(self):
@ -413,14 +460,9 @@ class HTMLTokenizer(object):
            self.stream.charsUntil(asciiLetters, True)
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character when getting the tag name.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in the tag name.")})
              _(u"Unexpected end of file in the tag name.")})
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
@ -440,14 +482,9 @@ class HTMLTokenizer(object):
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute name instead.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute name instead.")})
              _(u"Unexpected end of file. Expected attribute name instead.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
@ -457,6 +494,7 @@ class HTMLTokenizer(object):
    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data in asciiLetters:
@ -467,23 +505,17 @@ class HTMLTokenizer(object):
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            pass
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.states["afterAttributeName"]
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
              _(u"Unexpected end of file in attribute name.")})
            self.state = self.states["data"]
            emitToken = True
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
@ -492,12 +524,16 @@ class HTMLTokenizer(object):
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            if self.lowercaseAttrName:
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Dropped duplicate attribute on tag.")})
                      _(u"Dropped duplicate attribute on tag.")})
                    break
            # XXX Fix for above XXX
            if data == u">":
            if emitToken:
                self.emitCurrentToken()
        return True

@ -515,14 +551,9 @@ class HTMLTokenizer(object):
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected = or end of tag.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected = or end of tag.")})
              _(u"Unexpected end of file. Expected = or end of tag.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
@ -537,19 +568,14 @@ class HTMLTokenizer(object):
            self.state = self.states["attributeValueDoubleQuoted"]
        elif data == u"&":
            self.state = self.states["attributeValueUnQuoted"]
            self.stream.queue.append(data);
            self.stream.unget(data);
        elif data == u"'":
            self.state = self.states["attributeValueSingleQuoted"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute value.")})
              _(u"Unexpected end of file. Expected attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data
@ -564,7 +590,7 @@ class HTMLTokenizer(object):
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (\").")})
              _(u"Unexpected end of file in attribute value (\").")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
@ -579,7 +605,7 @@ class HTMLTokenizer(object):
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (').")})
              _(u"Unexpected end of file in attribute value (').")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
@ -594,14 +620,9 @@ class HTMLTokenizer(object):
            self.processEntityInAttribute()
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value.")})
              _(u"Unexpected end of file in attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
@ -624,44 +645,83 @@ class HTMLTokenizer(object):
    def markupDeclarationOpenState(self):
        charStack = [self.stream.char(), self.stream.char()]
        if charStack == [u"-", u"-"]:
            self.currentToken = {"type": "Comment", "data": ""}
            self.state = self.states["comment"]
            self.currentToken = {"type": "Comment", "data": u""}
            self.state = self.states["commentStart"]
        else:
            for x in xrange(5):
                charStack.append(self.stream.char())
            # Put in explicit EOF check
            if (not EOF in charStack and
                "".join(charStack).upper() == u"DOCTYPE"):
                self.currentToken =\
                  {"type": "Doctype", "name": "", "data": True}
                self.currentToken = {"type":"Doctype", "name":u"",
                  "publicId":None, "systemId":None, "correct":True}
                self.state = self.states["doctype"]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected '--' or 'DOCTYPE'. Not found.")})
                self.stream.queue.extend(charStack)
                  _(u"Expected '--' or 'DOCTYPE'. Not found.")})
                self.stream.unget(charStack)
                self.state = self.states["bogusComment"]
        return True

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentStartDash"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Incorrect comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentEnd"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Incorrect comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True


    def commentState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentDash"]
            self.state = self.states["commentEndDash"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment.")})
              _(u"Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
        return True

    def commentDashState(self):
    def commentEndDashState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentEnd"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (-)")})
              _(u"Unexpected end of file in comment (-)")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
@ -680,17 +740,17 @@ class HTMLTokenizer(object):
            self.state = self.states["data"]
        elif data == u"-":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected '-' after '--' found in comment.")})
              _(u"Unexpected '-' after '--' found in comment.")})
            self.currentToken["data"] += data
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (--).")})
              _(u"Unexpected end of file in comment (--).")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected character in comment found.")})
              _(u"Unexpected character in comment found.")})
            self.currentToken["data"] += u"--" + data
            self.state = self.states["comment"]
        return True
@ -701,8 +761,8 @@ class HTMLTokenizer(object):
            self.state = self.states["beforeDoctypeName"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("No space after literal string 'DOCTYPE'.")})
            self.stream.queue.append(data)
              _(u"No space after literal string 'DOCTYPE'.")})
            self.stream.unget(data)
            self.state = self.states["beforeDoctypeName"]
        return True

@ -710,19 +770,16 @@ class HTMLTokenizer(object):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data in asciiLowercase:
            self.currentToken["name"] = data.upper()
            self.state = self.states["doctypeName"]
        elif data == u">":
            # Character needs to be consumed per the specification so don't
            # invoke emitCurrentTokenWithParseError with "data" as argument.
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected > character. Expected DOCTYPE name.")})
              _(u"Unexpected > character. Expected DOCTYPE name.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected DOCTYPE name.")})
              _(u"Unexpected end of file. Expected DOCTYPE name.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
@ -732,30 +789,19 @@ class HTMLTokenizer(object):

    def doctypeNameState(self):
        data = self.stream.char()
        needsDoctypeCheck = False
        if data in spaceCharacters:
            self.state = self.states["afterDoctypeName"]
            needsDoctypeCheck = True
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE name.")})
              _(u"Unexpected end of file in DOCTYPE name.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # We can't just uppercase everything that arrives here. For
            # instance, non-ASCII characters.
            if data in asciiLowercase:
                data = data.upper()
            self.currentToken["name"] += data
            needsDoctypeCheck = True

        # After some iterations through this state it should eventually say
        # "HTML". Otherwise there's an error.
        if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
            self.currentToken["data"] = False
        return True

    def afterDoctypeNameState(self):
@ -766,30 +812,196 @@ class HTMLTokenizer(object):
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.currentToken["data"] = True
            # XXX EMIT
            self.stream.queue.append(data)
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE.")})
              _(u"Unexpected end of file in DOCTYPE.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            charStack = [data]
            for x in xrange(5):
                charStack.append(self.stream.char())
            if EOF not in charStack and\
              "".join(charStack).translate(asciiUpper2Lower) == "public":
                self.state = self.states["beforeDoctypePublicIdentifier"]
            elif EOF not in charStack and\
              "".join(charStack).translate(asciiUpper2Lower) == "system":
                self.state = self.states["beforeDoctypeSystemIdentifier"]
            else:
                self.stream.unget(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _(u"Expected space or '>'. Got '%s'") % (data,)})
                self.state = self.states["bogusDoctype"]
        return True

    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = u""
            self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["publicId"] = u""
            self.state = self.states["doctypePublicIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected space or '>'. Got '" + data + "'")})
            self.currentToken["data"] = True
              _(u"Unexpected character in DOCTYPE.")})
            self.state = self.states["bogusDoctype"]
        return True

    def doctypePublicIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterDoctypePublicIdentifier"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["publicId"] += data
        return True

    def doctypePublicIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterDoctypePublicIdentifier"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["publicId"] += data
        return True

    def afterDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected character in DOCTYPE.")})
            self.state = self.states["bogusDoctype"]
        return True

    def beforeDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
        elif data == "'":
            self.currentToken["systemId"] = u""
            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected character in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected character in DOCTYPE.")})
            self.state = self.states["bogusDoctype"]
        return True

    def doctypeSystemIdentifierDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["afterDoctypeSystemIdentifier"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["systemId"] += data
        return True

    def doctypeSystemIdentifierSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["afterDoctypeSystemIdentifier"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["systemId"] += data
        return True

    def afterDoctypeSystemIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected end of file in DOCTYPE.")})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _(u"Unexpected character in DOCTYPE.")})
            self.state = self.states["bogusDoctype"]
        return True

    def bogusDoctypeState(self):
        data = self.stream.char()
        self.currentToken["correct"] = False
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            # XXX EMIT
            self.stream.queue.append(data)
            self.stream.unget(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in bogus doctype.")})
              _(u"Unexpected end of file in bogus doctype.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
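
To make the tokenizer changes above concrete, a hedged sketch of the token stream contract (the iteration protocol and module path are assumed from this vendored layout, not shown in the diff):

    # Sketch only: tokens are dicts with a "type" key; recoverable
    # errors surface as {"type": "ParseError", ...} tokens.
    from html5lib.tokenizer import HTMLTokenizer

    for token in HTMLTokenizer("<P CLASS=FOO>&amp"):
        # With lowercaseElementName/lowercaseAttrName at their new
        # defaults the start tag arrives with name "p", attribute "class";
        # the unterminated "&amp" produces a ParseError token as well.
        print token["type"], token.get("name") or token.get("data")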
65  planet/vendor/html5lib/treebuilders/__init__.py  (vendored, new executable file)
@ -0,0 +1,65 @@
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to do
|
||||
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
_base.treebuilders.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing Node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
|
||||
treeBuilderCache = {}
|
||||
|
||||
def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
"""Get a TreeBuilder class for various types of tree with built-in support
|
||||
|
||||
treeType - the name of the tree type required (case-insensitive). Supported
|
||||
values are "simpletree", "dom", "etree" and "beautifulsoup"
|
||||
|
||||
"simpletree" - a built-in DOM-ish tree type with support for some
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
lxml.etree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeBuilderCache:
|
||||
if treeType in ("dom", "simpletree"):
|
||||
mod = __import__(treeType, globals())
|
||||
treeBuilderCache[treeType] = mod.TreeBuilder
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
return treeBuilderCache.get(treeType)
|
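
A short usage sketch of getTreeBuilder (hedged: the HTMLParser wiring is the conventional html5lib pattern, not part of this file):

    # Sketch only: pick a tree implementation at runtime.
    import html5lib
    from html5lib import treebuilders
    from xml.etree import cElementTree

    builder = treebuilders.getTreeBuilder("etree", cElementTree)
    parser = html5lib.HTMLParser(tree=builder)
    tree = parser.parse("<table><tr><td>cell")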
@ -1,4 +1,4 @@
from constants import scopingElements, tableInsertModeElements
from html5lib.constants import scopingElements, tableInsertModeElements
try:
    frozenset
except NameError:
@ -207,8 +211,11 @@ class TreeBuilder(object):
                return item
        return False

    def insertDoctype(self, name):
        self.document.appendChild(self.doctypeClass(name))
    def insertDoctype(self, name, publicId, systemId):
        doctype = self.doctypeClass(name)
        doctype.publicId = publicId
        doctype.systemId = systemId
        self.document.appendChild(doctype)

    def insertComment(self, data, parent=None):
        if parent is None:
@ -302,6 +309,7 @@ class TreeBuilder(object):

    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
        if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
            and name != exclude):
            self.openElements.pop()
59  planet/html5lib/treebuilders/dom.py → planet/vendor/html5lib/treebuilders/dom.py  (vendored, Executable file → Normal file)
@ -1,8 +1,5 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
from xml.sax.saxutils import escape
from constants import voidElements

import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -44,7 +41,8 @@ class NodeBuilder(_base.Node):
        node.parent = self

    def removeChild(self, node):
        self.element.removeChild(node.element)
        if node.element.parentNode == self.element:
            self.element.removeChild(node.element)
        node.parent = None

    def reparentChildren(self, newParent):
@ -74,15 +72,13 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder):
    def documentClass(self):
        self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
        def hilite(self, encoding):
            print 'foo'
        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
        setattr(self.dom, 'hilite', method)
        return self

    def doctypeClass(self,name):
    def insertDoctype(self, name, publicId, systemId):
        domimpl = minidom.getDOMImplementation()
        return NodeBuilder(domimpl.createDocumentType(name,None,None))
        doctype = domimpl.createDocumentType(name, publicId, systemId)
        self.document.appendChild(NodeBuilder(doctype))
        doctype.ownerDocument = self.dom

    def elementClass(self, name):
        return NodeBuilder(self.dom.createElement(name))
@ -124,10 +120,13 @@ def testSerializer(element):
    rv = []
    def serializeElement(element, indent=0):
        if element.nodeType == Node.DOCUMENT_TYPE_NODE:
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
            if element.name:
                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
            else:
                rv.append("|%s<!DOCTYPE >"%(' '*indent,))
        elif element.nodeType == Node.DOCUMENT_NODE:
            rv.append("#document")
        elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
            rv.append("#document")
        elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
            rv.append("#document-fragment")
        elif element.nodeType == Node.COMMENT_NODE:
            rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
@ -145,32 +144,6 @@ def testSerializer(element):

    return "\n".join(rv)

class HTMLSerializer(object):
    def serialize(self, node):
        rv = self.serializeNode(node)
        for child in node.childNodes:
            rv += self.serialize(child)
        if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
            rv += "</%s>\n"%node.nodeName
        return rv

    def serializeNode(self, node):
        if node.nodeType == Node.TEXT_NODE:
            rv = node.nodeValue
        elif node.nodeType == Node.ELEMENT_NODE:
            rv = "<%s"%node.nodeName
            if node.hasAttributes():
                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
                    node.attributes.items()])
            rv += ">"
        elif node.nodeType == Node.COMMENT_NODE:
            rv = "<!-- %s -->" % escape(node.nodeValue)
        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
            rv = "<!DOCTYPE %s>" % node.name
        else:
            rv = ""
        return rv

def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    if node.nodeType == Node.ELEMENT_NODE:
        if not nsmap:
@ -215,10 +188,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
    elif node.nodeType == Node.DOCUMENT_NODE:
        handler.startDocument()
        for child in node.childNodes: dom2sax(child, handler, nsmap)
        handler.endDocument()

    elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
        for child in node.childNodes: dom2sax(child, handler, nsmap)
        handler.endDocument()

    elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
        for child in node.childNodes: dom2sax(child, handler, nsmap)

    else:
        # ATTRIBUTE_NODE
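
dom2sax above replays a minidom tree into a SAX handler; a minimal, hedged driving sketch (it assumes the namespace-aware SAX callbacks are the ones invoked, which this diff does not show):

    # Sketch only: count element starts while streaming a DOM through SAX.
    from xml.dom import minidom
    from xml.sax.handler import ContentHandler

    class TagCounter(ContentHandler):
        def __init__(self):
            ContentHandler.__init__(self)
            self.count = 0
        def startElementNS(self, name, qname, attrs):
            self.count += 1

    doc = minidom.parseString("<root><a/><b/></root>")
    counter = TagCounter()
    dom2sax(doc, counter)  # dom2sax as defined in this module
    print counter.count    # expected: 3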
266  planet/vendor/html5lib/treebuilders/etree.py  (vendored, new executable file)
@ -0,0 +1,266 @@
import _base
import new

moduleCache = {}

def getETreeModule(ElementTreeImplementation, fullTree=False):
    name = "_" + ElementTreeImplementation.__name__+"builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
        objs = getETreeBuilder(ElementTreeImplementation, fullTree)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod

def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    class Element(_base.Node):
        def __init__(self, name):
            self._element = ElementTree.Element(name)
            self.name = name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _setName(self, name):
            self._element.tag = name

        def _getName(self):
            return self._element.tag

        name = property(_getName, _setName)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            #Delete existing attributes first
            #XXX - there may be a better way to do this...
            for key in self._element.attrib.keys():
                del self._element.attrib[key]
            for key, value in attributes.iteritems():
                self._element.set(key, value)

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes

        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
            for element in value:
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or self._element.getchildren())

        def appendChild(self, node):
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            index = self._element.getchildren().index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self

        def removeChild(self, node):
            self._element.remove(node._element)
            node.parent=None

        def insertText(self, data, insertBefore=None):
            if not(len(self._element)):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
            elif insertBefore is None:
                #Insert the text as the tail of the last child element
                if not self._element[-1].tail:
                    self._element[-1].tail = ""
                self._element[-1].tail += data
            else:
                #Insert the text before the specified node
                children = self._element.getchildren()
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index-1].tail:
                        self._element[index-1].tail = ""
                    self._element[index-1].tail += data
                else:
                    if not self._element.text:
                        self._element.text = ""
                    self._element.text += data

        def cloneNode(self):
            element = Element(self.name)
            for name, value in self.attributes.iteritems():
                element.attributes[name] = value
            return element

        def reparentChildren(self, newParent):
            if newParent.childNodes:
                newParent.childNodes[-1]._element.tail += self._element.text
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if self._element.text is not None:
                    newParent._element.text += self._element.text
            self._element.text = ""
            _base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        def __init__(self, data):
            #Use the superclass constructor to set all properties on the
            #wrapper element
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        def __init__(self, name):
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name

        def _getPublicId(self):
            return self._element.get(u"publicId", None)

        def _setPublicId(self, value):
            if value is not None:
                self._element.set(u"publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get(u"systemId", None)

        def _setSystemId(self, value):
            if value is not None:
                self._element.set(u"systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        def __init__(self):
            Element.__init__(self, "<DOCUMENT_ROOT>")

    class DocumentFragment(Element):
        def __init__(self):
            Element.__init__(self, "<DOCUMENT_FRAGMENT>")

    def testSerializer(element):
        rv = []
        finalText = None
        def serializeElement(element, indent=0):
            if not(hasattr(element, "tag")):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
            elif element.tag == "<DOCUMENT_ROOT>":
                rv.append("#document")
                if element.text:
                    rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
                if element.tail:
                    finalText = element.tail
            elif type(element.tag) == type(ElementTree.Comment):
                rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
            else:
                rv.append("|%s<%s>"%(' '*indent, element.tag))
                if hasattr(element, "attrib"):
                    for name, value in element.attrib.iteritems():
                        rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
            indent += 2
            for child in element.getchildren():
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
        serializeElement(element, 0)

        if finalText is not None:
            rv.append("|%s\"%s\""%(' '*2, finalText))

        return "\n".join(rv)

    def tostring(element):
        """Serialize an element and its child nodes to a string"""
        rv = []
        finalText = None
        def serializeElement(element):
            if type(element) == type(ElementTree.ElementTree):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                rv.append("<!DOCTYPE %s>"%(element.text,))
            elif element.tag == "<DOCUMENT_ROOT>":
                if element.text:
                    rv.append(element.text)
                if element.tail:
                    finalText = element.tail

                for child in element.getchildren():
                    serializeElement(child)

            elif type(element.tag) == type(ElementTree.Comment):
                rv.append("<!--%s-->"%(element.text,))
            else:
                #This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>"%(element.tag,))
                else:
                    attr = " ".join(["%s=\"%s\""%(name, value)
                        for name, value in element.attrib.iteritems()])
                    rv.append("<%s %s>"%(element.tag, attr))
                if element.text:
                    rv.append(element.text)

                for child in element.getchildren():
                    serializeElement(child)

                rv.append("</%s>"%(element.tag,))

            if element.tail:
                rv.append(element.tail)

        serializeElement(element)
|
||||
        if finalText is not None:
            # append any text that trailed the document root
            rv.append(finalText)

        return "".join(rv)

    class TreeBuilder(_base.TreeBuilder):
        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            if fullTree:
                return self.document._element
            else:
                return self.document._element.find("html")

        def getFragment(self):
            return _base.TreeBuilder.getFragment(self)._element

    return locals()
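A minimal sketch of how a builder produced by this factory is typically used (illustrative only, not part of the commit; it assumes html5lib 0.x with this module importable via html5lib.treebuilders):

    # Illustrative only: parse HTML into plain ElementTree elements using
    # the etree treebuilder factory above.
    import xml.etree.ElementTree as ElementTree
    import html5lib
    from html5lib import treebuilders

    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("etree", ElementTree))
    doc = parser.parse("<p>Hello<br>world")  # an ElementTree element tree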
@ -1,5 +1,5 @@
import _base
from constants import voidElements
from html5lib.constants import voidElements
from xml.sax.saxutils import escape

# Really crappy basic implementation of a DOM-core like thing
@ -30,7 +30,7 @@ class Node(_base.Node):
            tree += child.printTree(indent + 2)
        return tree

    def appendChild(self, node, index=None):
    def appendChild(self, node):
        if (isinstance(node, TextNode) and self.childNodes and
            isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
@ -63,8 +63,9 @@ class Node(_base.Node):

    def cloneNode(self):
        newNode = type(self)(self.name)
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        if hasattr(self, 'attributes'):
            for attr, value in self.attributes.iteritems():
                newNode.attributes[attr] = value
        newNode.value = self.value
        return newNode

@ -107,9 +108,11 @@ class DocumentType(Node):
    type = 3
    def __init__(self, name):
        Node.__init__(self, name)
        self.publicId = u""
        self.systemId = u""

    def __unicode__(self):
        return "<!DOCTYPE %s>" % self.name
        return u"<!DOCTYPE %s>" % self.name

    toxml = __unicode__

@ -123,7 +126,7 @@ class TextNode(Node):
        self.value = value

    def __unicode__(self):
        return "\"%s\"" % self.value
        return u"\"%s\"" % self.value

    def toxml(self):
        return escape(self.value)
@ -137,20 +140,20 @@ class Element(Node):
        self.attributes = {}

    def __unicode__(self):
        return "<%s>" % self.name
        return u"<%s>" % self.name

    def toxml(self):
        result = '<' + self.name
        if self.attributes:
            for name,value in self.attributes.iteritems():
                result += ' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
                result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
        if self.childNodes:
            result += '>'
            for child in self.childNodes:
                result += child.toxml()
            result += '</%s>' % self.name
            result += u'</%s>' % self.name
        else:
            result += '/>'
            result += u'/>'
        return result

    def hilite(self):
@ -191,32 +194,6 @@ class CommentNode(Node):
    def hilite(self):
        return '<code class="markup comment">&lt;!--%s--&gt;</code>' % escape(self.data)

class HTMLSerializer(object):
    def serialize(self, node):
        rv = self.serializeNode(node)
        for child in node.childNodes:
            rv += self.serialize(child)
        if node.type == Element.type and node.name not in voidElements:
            rv += "</%s>\n" % node.name
        return rv

    def serializeNode(self, node):
        if node.type == TextNode.type:
            rv = node.value
        elif node.type == Element.type:
            rv = "<%s" % node.name
            if node.attributes:
                rv = rv + "".join([" %s='%s'" % (key, escape(value)) for key, value in
                    node.attributes.iteritems()])
            rv += ">"
        elif node.type == CommentNode.type:
            rv = "<!-- %s -->" % escape(node.data)
        elif node.type == DocumentType.type:
            rv = "<!DOCTYPE %s>" % node.name
        else:
            rv = ""
        return rv

class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
158
planet/vendor/html5lib/treebuilders/soup.py
vendored
Normal file
@ -0,0 +1,158 @@
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration

import _base

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return self.attrs.items().__iter__()
    def __setitem__(self, name, value):
        "set attr", name, value
        self.element[name] = value
    def items(self):
        return self.attrs.items()
    def keys(self):
        return self.attrs.keys()
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in self.attrs.keys()


class Element(_base.Node):
    def __init__(self, element, soup):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            newNode = TextNode(NavigableString(
                self.element.contents[-1]+node.element), self.soup)
            self.element.contents[-1].extract()
            self.appendChild(newNode)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.contents.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            newNode = TextNode(NavigableString(
                self.element.contents[index-1]+node.element), self.soup)
            self.element.contents[index-1].extract()
            self.insertBefore(newNode, refNode)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        node = Element(Tag(self.soup, self.element.name), self.soup)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

class TextNode(Element):
    def __init__(self, element, soup):
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError

class TreeBuilder(_base.TreeBuilder):
    def documentClass(self):
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup)

    def insertDoctype(self, name, publicId, systemId):
        self.soup.insert(0, Declaration(name))

    def elementClass(self, name):
        return Element(Tag(self.soup, name), self.soup)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup)

    def appendChild(self, node):
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element

def testSerializer(element):
    rv = []
    def serializeElement(element, indent=0):
        if isinstance(element, Declaration):
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")
            else:
                rv.append("#document")

        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
        elif isinstance(element, unicode):
            rv.append("|%s\"%s\"" %(' '*indent, element))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.name))
            if element.attrs:
                for name, value in element.attrs:
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        indent += 2
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent)
    serializeElement(element, 0)

    return "\n".join(rv)
52
planet/vendor/html5lib/treewalkers/__init__.py
vendored
Normal file
@ -0,0 +1,52 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to implement
a tree walker object (called TreeWalker by convention) that implements
a 'serialize' method taking a tree as sole argument and returning an
iterator generating tokens.
"""

treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "pulldom" - The xml.dom.pulldom event stream
               "etree" - A generic walker for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "lxml" - Optimized walker for lxml.etree
               "beautifulsoup" - Beautiful soup (if installed)
               "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                     implementing the tree type e.g. xml.etree.ElementTree or
                     cElementTree."""

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "lxml":
            import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
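A short sketch of driving this factory (illustrative only, assuming html5lib 0.x; the token dicts are those produced by the _base helpers in the next file):

    # Illustrative only: walk a parsed document and print its token stream.
    import html5lib
    from html5lib import treebuilders, treewalkers

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse("<p>one<br>two")
    for token in treewalkers.getTreeWalker("dom")(doc):
        print token["type"], token.get("name")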
154
planet/vendor/html5lib/treewalkers/_base.py
vendored
Normal file
@ -0,0 +1,154 @@
import gettext
_ = gettext.gettext

from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

class TreeWalker(object):
    def __init__(self, tree):
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        if not attrs:
            attrs = []
        elif hasattr(attrs, 'items'):
            attrs = attrs.items()
        return [(unicode(name), unicode(value)) for name, value in attrs]

    def emptyTag(self, name, attrs, hasChildren=False):
        yield {"type": "EmptyTag", "name": unicode(name), \
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            yield self.error(_("Void element has children"))

    def startTag(self, name, attrs):
        return {"type": "StartTag", "name": unicode(name), \
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, name):
        return {"type": "EndTag", "name": unicode(name), "data": []}

    def text(self, data):
        data = unicode(data)
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data)-len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        return {"type": "Comment", "data": unicode(data)}

    def doctype(self, name, publicId=None, systemId=None, correct=True):
        return {"type": "Doctype",
                "name": name is not None and unicode(name) or u"",
                "publicId": publicId, "systemId": systemId,
                "correct": correct}

    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)

class RecursiveTreeWalker(TreeWalker):
    def walkChildren(self, node):
        # subclasses must implement this
        raise NotImplementedError

    def element(self, node, name, attrs, hasChildren):
        if name in voidElements:
            for token in self.emptyTag(name, attrs, hasChildren):
                yield token
        else:
            yield self.startTag(name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(name)

from xml.dom import Node

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
UNKNOWN = "<#UNKNOWN#>"

class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(name, attributes, hasChildren):
                        yield token
                    hasChildren = False
                else:
                    yield self.startTag(name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(name)
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    if self.tree is currentNode:
                        currentNode = None
                    else:
                        currentNode = self.getParentNode(currentNode)
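The text() helper above never emits leading or trailing whitespace inside a Characters token; a sketch of the resulting stream (illustrative only):

    # Illustrative only: TreeWalker.text() splits surrounding whitespace
    # into separate SpaceCharacters tokens.
    tokens = list(TreeWalker(None).text("  hello  "))
    # [{'type': 'SpaceCharacters', 'data': u'  '},
    #  {'type': 'Characters',      'data': u'hello'},
    #  {'type': 'SpaceCharacters', 'data': u'  '}]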
37
planet/vendor/html5lib/treewalkers/dom.py
vendored
Normal file
@ -0,0 +1,37 @@
from xml.dom import Node

import gettext
_ = gettext.gettext

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes()

        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)

        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
112
planet/vendor/html5lib/treewalkers/etree.py
vendored
Normal file
@ -0,0 +1,112 @@
import gettext
_ = gettext.gettext

import new
import copy

import _base
from html5lib.constants import voidElements

moduleCache = {}

def getETreeModule(ElementTreeImplementation):
    name = "_" + ElementTreeImplementation.__name__ + "builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = new.module("_" + ElementTreeImplementation.__name__ + "builder")
        objs = getETreeBuilder(ElementTreeImplementation)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod

def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. An Element node serving as *context* (it cannot be called the parent
           node due to the particular ``tail`` text nodes)

        2. Either the string literals ``"text"`` or ``"tail"`` or a child index

        3. A list used as a stack of all ancestor *context nodes*. It is a
           pair tuple whose first item is an Element and second item is a child
           index.
        """

        def getNodeDetails(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                if key in ("text", "tail"):
                    return _base.TEXT, getattr(elt, key)
                else:
                    node = elt[int(key)]

            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return _base.DOCTYPE, node.text

            elif type(node.tag) == type(ElementTree.Comment):
                return _base.COMMENT, node.text

            else:
                #This is assumed to be an ordinary element
                return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text

        def getFirstChild(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                assert key not in ("text", "tail"), "Text nodes have no children"
                parents.append((elt, int(key)))
                node = elt[int(key)]
            else:
                parents = []

            assert len(node) or node.text, "Node has no children"
            if node.text:
                return (node, "text", parents)
            else:
                return (node, 0, parents)

        def getNextSibling(self, node):
            assert isinstance(node, tuple), "Node is not a tuple: " + str(node)

            elt, key, parents = node
            if key == "text":
                key = -1
            elif key == "tail":
                elt, key = parents.pop()
            else:
                # Look for "tail" of the "revisited" node
                child = elt[key]
                if child.tail:
                    parents.append((elt, key))
                    return (child, "tail", parents)

            # case where key was "text" or "tail" or elt[key] had a tail
            key += 1
            if len(elt) > key:
                return (elt, key, parents)
            else:
                return None

        def getParentNode(self, node):
            assert isinstance(node, tuple)
            elt, key, parents = node
            if parents:
                elt, key = parents.pop()
                return elt, key, parents
            else:
                # HACK: We could return ``elt`` but None will stop the algorithm the same way
                return None

    return locals()
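Because ElementTree hangs trailing text off the preceding element's ``tail``, the tuple "nodes" above let text and tail be visited as separate pseudo-nodes. A sketch of the traversal for a small tree (illustrative only):

    # Illustrative only: traversal of <a>x<b/>y</a> in the scheme above.
    import xml.etree.ElementTree as ElementTree
    a = ElementTree.fromstring("<a>x<b/>y</a>")
    b = a[0]  # a.text == "x"; the trailing "y" is stored as b.tail
    # getFirstChild(a)                -> (a, "text", [])        visits "x"
    # getNextSibling((a, "text", [])) -> (a, 0, [])             visits <b>
    # getNextSibling((a, 0, []))      -> (b, "tail", [(a, 0)])  visits "y"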
67
planet/vendor/html5lib/treewalkers/genshistream.py
vendored
Normal file
@ -0,0 +1,67 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
    START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.TreeWalker):
    def __iter__(self):
        depth = 0
        ignore_until = None
        previous = None
        for event in NamespaceFlattener(prefixes={
                'http://www.w3.org/1999/xhtml': ''
                })(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                for token in self.emptyTag(tag, list(attrib), \
                        not next or next[0] != END or next[1] != tag):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(*data)

        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
                START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
52
planet/vendor/html5lib/treewalkers/pulldom.py
vendored
Normal file
@ -0,0 +1,52 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
    COMMENT, IGNORABLE_WHITESPACE, CHARACTERS

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.TreeWalker):
    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            if previous is not None and \
                    (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            if name in voidElements:
                for token in self.emptyTag(name, \
                        node.attributes.items(), not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            if name not in voidElements:
                yield self.endTag(name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)
72
planet/vendor/html5lib/treewalkers/simpletree.py
vendored
Normal file
@ -0,0 +1,72 @@
import gettext
_ = gettext.gettext

import _base

class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.type == 4: # TextNode
            return _base.TEXT, node.value

        elif node.type == 5: # Element
            return _base.ELEMENT, node.name, \
                node.attributes.items(), node.hasContent()

        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data

        else:
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None
36
planet/vendor/html5lib/treewalkers/soup.py
vendored
Normal file
@ -0,0 +1,36 @@
import gettext
_ = gettext.gettext

from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag

import _base

class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            #Slice needed to remove markup added during unicode conversion
            return _base.DOCTYPE, unicode(node.string)[2:-1]

        elif isinstance(node, Comment):
            return _base.COMMENT, unicode(node.string)[4:-3]

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            return _base.ELEMENT, node.name, \
                dict(node.attrs).items(), node.contents
        else:
            return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent
@ -1,5 +1,5 @@
[Planet]
output_theme = genshi_fancy
output_theme = asf
output_dir = tests/work/apply
name = test planet
cache_directory = tests/work/spider/cache
@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
bill_of_materials:
  images/#{face}

[index.html.genshi]
[index.html.xslt]
filters:
  xhtml2html.py>index.html4
  xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4

[tests/data/spider/testfeed0.atom]
name = not found
3
tests/data/config/basic.csv
Normal file
@ -0,0 +1,3 @@
url,name,filters
feed1,one
feed2,two,bar
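The basic.csv fixture above exercises the new csv reading-list support: the first column is the subscription URL and the remaining columns become per-feed options. A sketch of reading such a file with the standard library (illustrative, not necessarily Venus's actual loader):

    # Illustrative only: parse the reading list with csv.DictReader.
    import csv
    for row in csv.DictReader(open('tests/data/config/basic.csv')):
        url = row['url']  # e.g. 'feed1'
        options = dict((key, value) for key, value in row.items()
                       if key != 'url' and value)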
7
tests/data/config/rlist-config.ini
Normal file
@ -0,0 +1,7 @@
[Planet]
name = CSV Test Configuration
cache_directory = tests/work/config/cache
filters = foo

[tests/data/config/subconfig.ini]
content_type = config
7
tests/data/config/rlist-csv.ini
Normal file
@ -0,0 +1,7 @@
[Planet]
name = CSV Test Configuration
cache_directory = tests/work/config/cache
filters = foo

[tests/data/config/basic.csv]
content_type = csv
6
tests/data/config/subconfig.ini
Normal file
@ -0,0 +1,6 @@
[feed1]
name = one

[feed2]
name = two
filters = bar
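The subconfig.ini fixture above backs the new config reading-list type: the reading list is itself an INI file whose sections are feed URLs. A sketch with the standard library (illustrative, not necessarily Venus's actual loader):

    # Illustrative only: read the nested configuration directly.
    import ConfigParser
    parser = ConfigParser.ConfigParser()
    parser.read('tests/data/config/subconfig.ini')
    feeds = parser.sections()           # ['feed1', 'feed2']
    name = parser.get('feed2', 'name')  # 'two'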
@ -10,7 +10,7 @@
  <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>

  <entry>
    <title>Atom-Powered Robots Run Amok</title>
    <title>¡Atom-Powered Robots Run Amok!</title>
    <link href="http://example.org/2003/12/13/atom03"/>
    <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
    <updated>2003-12-13T18:30:02Z</updated>
@ -4,3 +4,4 @@ filters = excerpt.py
[excerpt.py]
width = 100
omit = p
target = planet:excerpt
3
tests/data/filter/minhead.ini
Normal file
@ -0,0 +1,3 @@
[Planet]
filters = minhead.py?min=3
filter_directories = filters
3
tests/data/filter/minhead.xml
Normal file
@ -0,0 +1,3 @@
<div xmlns="http://www.w3.org/1999/xhtml">
<h1>title</h1><h3>mid</h3><h5>bottom</h5>
</div>
13
tests/data/filter/tmpl/content_xhtml2.xml
Normal file
@ -0,0 +1,13 @@
<!--
Description:  xhtml content
Expect:       Items[0]['content'] == '<img src="x.jpg" />'
-->

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <content type="xhtml">
      <div xmlns="http://www.w3.org/1999/xhtml"><img src="x.jpg"/></div>
    </content>
  </entry>
</feed>
13
tests/data/reconstitute/author_noname.xml
Normal file
@ -0,0 +1,13 @@
<!--
Description:  author name
Expect:       author_detail.name == ''
-->

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <author>
      <email>john@example.com</email>
    </author>
  </entry>
</feed>
11
tests/data/reconstitute/category_blank_term.xml
Normal file
@ -0,0 +1,11 @@
<!--
Description:  category term
Expect:       not globals().has_key('tags')
-->

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <category term=""/>
  </entry>
</feed>
@ -5,7 +5,7 @@ Expect: tags[0].label == 'Inbox'

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <category label="Inbox"/>
    <category term='x' label="Inbox"/>
  </entry>
</feed>

@ -5,7 +5,7 @@ Expect: tags[0].scheme == 'http://example.com/categories'

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <category scheme="http://example.com/categories"/>
    <category term='x' scheme="http://example.com/categories"/>
  </entry>
</feed>

10
tests/data/reconstitute/empty_title.xml
Normal file
@ -0,0 +1,10 @@
<!--
Description:  empty title
Expect:       title_detail.value == ''
-->

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title/>
  </entry>
</feed>
9
tests/data/reconstitute/missing_title.xml
Normal file
@ -0,0 +1,9 @@
<!--
Description:  missing title
Expect:       title_detail.value == ''
-->

<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
  </entry>
</feed>
41
tests/data/spider/testfeed4.atom
Normal file
@ -0,0 +1,41 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link rel="self" href="http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom"/>
  <id>tag:planet.intertwingly.net,2006:testfeed1</id>

  <title>Sam Ruby</title>
  <subtitle>It’s just data</subtitle>
  <author>
    <name>Sam Ruby</name>
    <email>rubys@intertwingly.net</email>
    <uri>http://www.intertwingly.net/blog/</uri>
  </author>
  <updated>2006-06-16T20:15:18-04:00</updated>
  <link href="http://www.intertwingly.net/blog/"/>

  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/1"/>
    <title>Mercury</title>
    <content>one</content>
    <updated>2006-01-01T00:00:00Z</updated>
  </entry>

  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/3"/>
    <title>Earth</title>
    <content>three</content>
    <updated>2006-01-03T00:00:00Z</updated>
  </entry>

  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/2"/>
    <title>Venus</title>
    <content>two</content>
    <updated>2006-01-02T00:00:00Z</updated>
  </entry>

</feed>
@ -4,13 +4,13 @@ venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,venus_base)

if __name__ == "__main__":
    import planet
    planet.getLogger('WARN',None)

    hide_planet_ns = True

    while len(sys.argv) > 1:
        if sys.argv[1] == '-v' or sys.argv[1] == '--verbose':
            import planet
            planet.getLogger('DEBUG',None)
            del sys.argv[1]
        elif sys.argv[1] == '-p' or sys.argv[1] == '--planet':
            hide_planet_ns = False
@ -41,7 +41,7 @@ if __name__ == "__main__":
        from planet import spider
        spider.spiderPlanet(only_if_new=False)

        from planet import feedparser
        import feedparser
        for source in glob.glob(os.path.join(work, 'sources/*')):
            feed = feedparser.parse(source).feed
            if feed.has_key('title'):
@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
    def tearDown(self):
        shutil.rmtree(os.path.split(workdir)[0])

    def test_apply_asf(self):
        config.load(configfile % 'asf')
    def apply_asf(self):
        splice.apply(self.feeddata)

        # verify that selected files are there
@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
        self.assertEqual(12, content)
        self.assertEqual(3, lang)

    def test_apply_asf(self):
        config.load(configfile % 'asf')
        self.apply_asf()

    def test_apply_classic_fancy(self):
        config.load(configfile % 'fancy')
        self.apply_fancy()
@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):

    def test_apply_filter_html(self):
        config.load(configfile % 'html')
        self.apply_fancy()
        self.apply_asf()

        output = open(os.path.join(workdir, 'index.html')).read()
        self.assertTrue(output.find('/>')>=0)
@ -100,10 +103,18 @@ class ApplyTest(unittest.TestCase):
        html = open(os.path.join(workdir, 'index.html')).read()
        self.assertTrue(html.find(' href="http://example.com/default.css"')>=0)

import test_filter_genshi
for method in dir(test_filter_genshi.GenshiFilterTests):
    if method.startswith('test_'): break
else:
    delattr(ApplyTest,'test_apply_genshi_fancy')

try:
    import libxml2
except ImportError:

    delattr(ApplyTest,'test_apply_filter_mememe')

try:
    import win32pipe
    (stdin,stdout) = win32pipe.popen4('xsltproc -V', 't')
@ -122,10 +133,3 @@ except ImportError:
    logger.warn("xsltproc is not available => can't test XSLT templates")
    for method in dir(ApplyTest):
        if method.startswith('test_'): delattr(ApplyTest,method)

import test_filter_genshi
for method in dir(test_filter_genshi.GenshiFilterTests):
    if method.startswith('test_'): break
else:
    delattr(ApplyTest,'test_apply_genshi_fancy')
    delattr(ApplyTest,'test_apply_filter_html')
25
tests/test_config_csv.py
Normal file
@ -0,0 +1,25 @@
#!/usr/bin/env python

import os, shutil, unittest
from planet import config

workdir = os.path.join('tests', 'work', 'config', 'cache')

class ConfigCsvTest(unittest.TestCase):
    def setUp(self):
        config.load('tests/data/config/rlist-csv.ini')

    def tearDown(self):
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])

    # administrivia

    def test_feeds(self):
        feeds = config.subscriptions()
        feeds.sort()
        self.assertEqual(['feed1', 'feed2'], feeds)

    def test_filters(self):
        self.assertEqual(['foo','bar'], config.filters('feed2'))
        self.assertEqual(['foo'], config.filters('feed1'))
@ -15,7 +15,7 @@ configfile = 'tests/data/expunge/config.ini'
class ExpungeTest(unittest.TestCase):
    def setUp(self):
        # silence errors
        planet.logger = None
        self.original_logger = planet.logger
        planet.getLogger('CRITICAL',None)

        try:
@ -29,6 +29,7 @@ class ExpungeTest(unittest.TestCase):
    def tearDown(self):
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])
        planet.logger = self.original_logger

    def test_expunge(self):
        config.load(configfile)
@ -18,13 +18,13 @@ class DjangoFilterTests(unittest.TestCase):
        results = dj.tmpl.template_info("<feed/>")
        self.assertEqual(type(results['date']), datetime.datetime)

    def test_django_item_title(self):
    def test_django_entry_title(self):
        config.load('tests/data/filter/django/test.ini')
        feed = open('tests/data/filter/django/test.xml')
        input = feed.read(); feed.close()
        results = dj.run(
            os.path.realpath('tests/data/filter/django/title.html.dj'), input)
        self.assertEqual(results, "Atom-Powered Robots Run Amok\n")
        self.assertEqual(results, "\xc2\xa1Atom-Powered Robots Run Amok!\n")

    def test_django_config_context(self):
        config.load('tests/data/filter/django/test.ini')
@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
        self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
        self.assertTrue(output.find('</script>')>=0)

    def test_xhtml2html_filter(self):
        testfile = 'tests/data/filter/index.html'
        filter = 'xhtml2html.py'
        output = shell.run(filter, open(testfile).read(), mode="filter")
        self.assertTrue(output.find('/>')<0)
        self.assertTrue(output.find('</script>')>=0)

try:
    import genshi
except:
@ -54,6 +54,21 @@ class FilterTests(unittest.TestCase):
            u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
            u'in \u2026', excerpt.firstChild.firstChild.nodeValue)

    def test_excerpt_lorem_ipsum_summary(self):
        testfile = 'tests/data/filter/excerpt-lorem-ipsum.xml'
        config.load('tests/data/filter/excerpt-lorem-ipsum.ini')
        config.parser.set('excerpt.py', 'target', 'atom:summary')

        output = open(testfile).read()
        for filter in config.filters():
            output = shell.run(filter, output, mode="filter")

        dom = xml.dom.minidom.parseString(output)
        excerpt = dom.getElementsByTagName('summary')[0]
        self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
            u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
            u'in \u2026', excerpt.firstChild.firstChild.nodeValue)

    def test_stripAd_yahoo(self):
        testfile = 'tests/data/filter/stripAd-yahoo.xml'
        config.load('tests/data/filter/stripAd-yahoo.ini')
@ -130,6 +145,13 @@ class FilterTests(unittest.TestCase):

        self.assertEqual('', output)

    def test_xhtml2html_filter(self):
        testfile = 'tests/data/filter/index.html'
        filter = 'xhtml2html.plugin?quote_attr_values=True'
        output = shell.run(filter, open(testfile).read(), mode="filter")
        self.assertTrue(output.find('/>')<0)
        self.assertTrue(output.find('</script>')>=0)

try:
    from subprocess import Popen, PIPE

@ -1,18 +1,18 @@
#!/usr/bin/env python

import unittest
from planet import idindex, config, logger
import unittest, planet
from planet import idindex, config

class idIndexTest(unittest.TestCase):

    def setUp(self):
        # silence errors
        import planet
        planet.logger = None
        self.original_logger = planet.logger
        planet.getLogger('CRITICAL',None)

    def tearDown(self):
        idindex.destroy()
        planet.logger = self.original_logger

    def test_unicode(self):
        from planet.spider import filename
@ -69,6 +69,6 @@ class idIndexTest(unittest.TestCase):
try:
    module = 'dbhash'
except ImportError:
    logger.warn("dbhash is not available => can't test id index")
    planet.logger.warn("dbhash is not available => can't test id index")
    for method in dir(idIndexTest):
        if method.startswith('test_'): delattr(idIndexTest,method)
@ -26,6 +26,9 @@ feed = '''

configData = '''
[testfeed]
ignore_in_feed =
future_dates =

name_type = html
title_type = html
summary_type = html
@ -37,16 +40,21 @@ class ScrubTest(unittest.TestCase):
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)

        self.assertTrue(base.entries[0].has_key('author'))
        self.assertTrue(base.entries[0].has_key('author_detail'))
        self.assertTrue(base.entries[0].has_key('id'))
        self.assertTrue(base.entries[0].has_key('updated'))
        self.assertTrue(base.entries[0].has_key('updated_parsed'))
        self.assertTrue(base.entries[0].summary_detail.has_key('language'))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
        config.parser.set('testfeed', 'ignore_in_feed',
            'author id updated xml:lang')
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertFalse(data.entries[0].has_key('author'))
        self.assertFalse(data.entries[0].has_key('author_detail'))
        self.assertFalse(data.entries[0].has_key('id'))
        self.assertFalse(data.entries[0].has_key('updated'))
        self.assertFalse(data.entries[0].has_key('updated_parsed'))
@ -12,7 +12,7 @@ configfile = 'tests/data/spider/config.ini'
class SpiderTest(unittest.TestCase):
    def setUp(self):
        # silence errors
        planet.logger = None
        self.original_logger = planet.logger
        planet.getLogger('CRITICAL',None)

        try:
@ -24,6 +24,7 @@ class SpiderTest(unittest.TestCase):
    def tearDown(self):
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])
        planet.logger = self.original_logger

    def test_filename(self):
        self.assertEqual(os.path.join('.', 'example.com,index.html'),
@ -87,6 +88,14 @@ class SpiderTest(unittest.TestCase):
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def test_spiderFeedUpdatedEntries(self):
        config.load(configfile)
        self.spiderFeed(testfeed % '4')
        self.assertEqual(2, len(glob.glob(workdir+"/*")))
        data = feedparser.parse(workdir +
            '/planet.intertwingly.net,2006,testfeed4')
        self.assertEqual(u'three', data.entries[0].content[0].value)

    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

8
tests/test_subconfig.py
Normal file
@ -0,0 +1,8 @@
#!/usr/bin/env python

from test_config_csv import ConfigCsvTest
from planet import config

class SubConfigTest(ConfigCsvTest):
    def setUp(self):
        config.load('tests/data/config/rlist-config.ini')
@ -38,6 +38,10 @@ a.rising {
  font-weight: bold;
}

a[rel~='license'] {
  text-decoration: none;
}

body > h1 {
  font-size: x-large;
  text-transform: uppercase;
@ -4,7 +4,7 @@
  xmlns:planet="http://planet.intertwingly.net/"
  xmlns="http://www.w3.org/1999/xhtml"
  exclude-result-prefixes="atom planet xhtml">

  <xsl:output method="xml" omit-xml-declaration="yes"/>

  <xsl:template match="atom:feed">
@ -158,7 +158,7 @@
    </body>
  </html>
</xsl:template>

<xsl:template match="atom:entry">
  <!-- date header -->
  <xsl:variable name="date" select="substring(atom:updated,1,10)"/>
@ -216,19 +216,57 @@
          <xsl:apply-templates select="atom:summary"/>
        </xsl:otherwise>
      </xsl:choose>

      <!-- entry footer -->
      <xsl:text> </xsl:text>
      <div class="permalink">
        <xsl:if test="atom:link[@rel='license'] or
                      atom:source/atom:link[@rel='license'] or
                      atom:rights or atom:source/atom:rights">
          <a>
            <xsl:if test="atom:source/atom:link[@rel='license']/@href">
              <xsl:attribute name="rel">license</xsl:attribute>
              <xsl:attribute name="href">
                <xsl:value-of select="atom:source/atom:link[@rel='license']/@href"/>
              </xsl:attribute>
            </xsl:if>
            <xsl:if test="atom:link[@rel='license']/@href">
              <xsl:attribute name="rel">license</xsl:attribute>
              <xsl:attribute name="href">
                <xsl:value-of select="atom:link[@rel='license']/@href"/>
              </xsl:attribute>
            </xsl:if>
            <xsl:if test="atom:source/atom:rights">
              <xsl:attribute name="title">
                <xsl:value-of select="atom:source/atom:rights"/>
              </xsl:attribute>
            </xsl:if>
            <xsl:if test="atom:rights">
              <xsl:attribute name="title">
                <xsl:value-of select="atom:rights"/>
              </xsl:attribute>
            </xsl:if>
            <xsl:text>©</xsl:text>
          </a>
          <xsl:text> </xsl:text>
        </xsl:if>
        <a href="{atom:link[@rel='alternate']/@href}">
          <xsl:choose>
            <xsl:when test="atom:author/atom:name">
              <xsl:text>by </xsl:text>
              <xsl:if test="not(atom:link[@rel='license'] or
                                atom:source/atom:link[@rel='license'] or
                                atom:rights or atom:source/atom:rights)">
                <xsl:text>by </xsl:text>
              </xsl:if>
              <xsl:value-of select="atom:author/atom:name"/>
              <xsl:text> at </xsl:text>
            </xsl:when>
            <xsl:when test="atom:source/atom:author/atom:name">
              <xsl:text>by </xsl:text>
              <xsl:if test="not(atom:link[@rel='license'] or
                                atom:source/atom:link[@rel='license'] or
                                atom:rights or atom:source/atom:rights)">
                <xsl:text>by </xsl:text>
              </xsl:if>
              <xsl:value-of select="atom:source/atom:author/atom:name"/>
              <xsl:text> at </xsl:text>
            </xsl:when>
@ -5,11 +5,26 @@

<xsl:output indent="yes" method="xml"/>

<xsl:template name="rfc822" xmlns:date="http://exslt.org/dates-and-times">
  <xsl:param name="date"/>
  <!-- http://www.trachtenberg.com/blog/2005/03/03/xslt-cookbook-generating-an-rfc-822-date/ -->
  <xsl:value-of select="concat(date:day-abbreviation($date), ', ',
    format-number(date:day-in-month($date), '00'), ' ',
    date:month-abbreviation($date), ' ', date:year($date), ' ',
    format-number(date:hour-in-day($date), '00'), ':',
    format-number(date:minute-in-hour($date), '00'), ':',
    format-number(date:second-in-minute($date), '00'), ' GMT')"/>
</xsl:template>

<xsl:template match="atom:feed">
  <opml version="1.1">
    <head>
      <title><xsl:value-of select="atom:title"/></title>
      <dateModified><xsl:value-of select="atom:updated/@planet:format"/></dateModified>
      <dateModified>
        <xsl:call-template name="rfc822">
          <xsl:with-param name="date" select="atom:updated"/>
        </xsl:call-template>
      </dateModified>
      <ownerName><xsl:value-of select="atom:author/atom:name"/></ownerName>
      <ownerEmail><xsl:value-of select="atom:author/atom:email"/></ownerEmail>
    </head>