diff --git a/README b/README
index 3014d89..59924c3 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ also actively being maintained.
It uses Mark Pilgrim's Universal Feed Parser to read from CDF, RDF, RSS and
Atom feeds; Leonard Richardson's Beautiful Soup to correct markup issues;
-and either Tomas Styblo's templating engine Daniel Veillard's implementation
+and either Tomas Styblo's templating engine or Daniel Veillard's implementation
of XSLT to output static files in any format you can dream up.
To get started, check out the documentation in the docs directory. If you have
diff --git a/docs/config.html b/docs/config.html
index 4a08ed7..0ed6e59 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -139,9 +139,10 @@ you are free to include as few or as many parameters as you like, most of
the predefined themes presume that at least name is defined.
title_type, summary_type, and content_type allow you to override the type
diff --git a/examples/opml-top100.ini b/examples/opml-top100.ini
index 01b210d..5ba6771 100644
--- a/examples/opml-top100.ini
+++ b/examples/opml-top100.ini
@@ -31,13 +31,18 @@ activity_threshold = 90
# filters to be run
filters = excerpt.py
+bill_of_materials:
+ .htaccess
+ favicon.ico
+ robots.txt
+
# filter parameters
[excerpt.py]
omit = img p br
width = 500
# add memes to output
-[index.html.tmpl]
+[index.html.xslt]
filters = mememe.plugin
[mememe.plugin]
diff --git a/filters/excerpt.py b/filters/excerpt.py
index 99126b8..c1d4e9a 100644
--- a/filters/excerpt.py
+++ b/filters/excerpt.py
@@ -4,6 +4,7 @@ Generate an excerpt from either the summary or a content of an entry.
Parameters:
width: maximum number of characters in the excerpt. Default: 500
omit: whitespace delimited list of html tags to remove. Default: none
+ target: name of element created. Default: planet:excerpt
Notes:
 * if 'img' is in the list of tags to be omitted, <img> tags are replaced with
   hypertext links labeled with the image's alt or title text
@@ -23,6 +24,7 @@ args = dict(zip([name.lstrip('-') for name in sys.argv[1::2]], sys.argv[2::2]))
wrapper = textwrap.TextWrapper(width=int(args.get('width','500')))
omit = args.get('omit', '').split()
+target = args.get('target', 'planet:excerpt')
class copy:
""" recursively copy a source to a target, up to a given width """
@@ -94,10 +96,14 @@ if not source:
# if present, recursively copy it to a planet:excerpt element
if source:
- dom.documentElement.setAttribute('xmlns:planet', planetNS)
- target = dom.createElementNS(planetNS, 'planet:excerpt')
- source[0].parentNode.appendChild(target)
- copy(dom, source[0], target)
+ if target.startswith('planet:'):
+ dom.documentElement.setAttribute('xmlns:planet', planetNS)
+ if target.startswith('atom:'): target = target.split(':',1)[1]
+ excerpt = dom.createElementNS(planetNS, target)
+ source[0].parentNode.appendChild(excerpt)
+ copy(dom, source[0], excerpt)
+ if source[0].nodeName == excerpt.nodeName:
+ source[0].parentNode.removeChild(source[0])
# print out results
print dom.toxml('utf-8')
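The new target parameter can be exercised from a planet configuration; a minimal sketch (section name and values are hypothetical), e.g. to have the excerpt replace the entry's own summary:

    filters = excerpt.py

    [excerpt.py]
    omit = img p br
    width = 300
    target = atom:summary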
diff --git a/filters/html2xhtml.plugin b/filters/html2xhtml.plugin
index 456df48..3ab7a8c 100644
--- a/filters/html2xhtml.plugin
+++ b/filters/html2xhtml.plugin
@@ -1,5 +1,5 @@
import sys
-from planet import html5lib
+import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)
diff --git a/filters/mememe.plugin b/filters/mememe.plugin
index 36dea83..eb347c7 100644
--- a/filters/mememe.plugin
+++ b/filters/mememe.plugin
@@ -23,9 +23,10 @@ from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
-from planet import config, feedparser
+from planet import config
from planet.spider import filename
-log = planet.getLogger(config.log_level(),config.log_format())
+import feedparser
+log = planet.logger
options = config.filter_options(sys.argv[0])
MEMES_ATOM = os.path.join(config.output_dir(),'memes.atom')
@@ -64,6 +65,7 @@ def cache_meme(url, headers):
file.close()
urlmap = {}
+revmap = {}
def canonicalize(url):
url = urlmap.get(url,url)
parts = list(urlparse.urlparse(url))
@@ -73,7 +75,10 @@ def canonicalize(url):
if parts[1].startswith('www.'): parts[1]=parts[1][4:]
if not parts[2]: parts[2] = '/'
parts[-1] = ''
- return urlparse.urlunparse(parts)
+
+ canonurl = urlparse.urlunparse(parts)
+ revmap[canonurl] = url
+ return canonurl
log.debug("Loading cached data")
for name in glob.glob(os.path.join(cache, '*')):
@@ -125,7 +130,7 @@ for name in glob.glob(os.path.join(cache, '*')):
# identify the unique links
entry_links = []
- for node in doc.xpathEval("//*[@href and not(@rel='source')]"):
+ for node in doc.xpathEval("//*[@href and not(@rel='source') and not(@rel='license')]"):
parent = node.parent
while parent:
if parent.name == 'source': break
@@ -309,7 +314,7 @@ meme_feed.newTextChild(None, 'updated',
# parse the input
log.debug("Parse input")
-doc=libxml2.parseDoc(sys.stdin.read())
+doc=libxml2.readDoc(sys.stdin.read(), '', 'utf-8', libxml2.XML_PARSE_NONET)
# find the sidebar/footer
sidebar = options.get('sidebar','//*[@class="sidebar"]')
@@ -340,7 +345,7 @@ while child:
if not title: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a', title)
- a.setProp('href', entry)
+ a.setProp('href', revmap.get(entry,entry))
link_count = link_count + 1
if link_count >= 10: break
if link_count > 0: state = None
@@ -388,7 +393,7 @@ for i in range(0,len(weighted_links)):
# otherwise, parse the html
if not title:
- title = html(link).title
+ title = html(revmap.get(link,link)).title
# dehtmlize
title = re.sub('&(\w+);',
@@ -421,7 +426,7 @@ for i in range(0,len(weighted_links)):
# main link
a = li.newTextChild(None, 'a', title.strip().encode('utf-8'))
- a.setProp('href',link)
+ a.setProp('href',revmap.get(link,link))
if (((i==0) or (updated>=weighted_links[i-1][2])) and
(i+1==len(weighted_links) or (updated>=weighted_links[i+1][2]))):
rank = 0
@@ -437,7 +442,7 @@ for i in range(0,len(weighted_links)):
if entry in voters: continue
li2 = ul2.newChild(None, 'li', None)
a = li2.newTextChild(None, 'a' , author)
- a.setProp('href',entry)
+ a.setProp('href',revmap.get(entry,entry))
if title: a.setProp('title',title)
voters.append(entry)
diff --git a/filters/minhead.py b/filters/minhead.py
new file mode 100644
index 0000000..b9c225e
--- /dev/null
+++ b/filters/minhead.py
@@ -0,0 +1,36 @@
+#
+# Ensure that no heading is more prominent than a permissible maximum (like h3).
+# If one is, all heading levels are shifted down to conform.
+# Note: this may create "illegal" heading levels, like h7 and beyond.
+#
+
+import sys
+from xml.dom import minidom, XHTML_NAMESPACE
+
+# determine the permissible minimum heading
+if '--min' in sys.argv:
+ minhead = int(sys.argv[sys.argv.index('--min')+1])
+else:
+ minhead=3
+
+# parse input stream
+doc = minidom.parse(sys.stdin)
+
+# search for headings below the permissible minimum
+first=minhead
+for i in range(1,minhead):
+ if doc.getElementsByTagName('h%d' % i):
+ first=i
+ break
+
+# if found, bump all headings so that the top is the permissible minimum
+if first < minhead:
+ for i in range(6,0,-1):
+ for oldhead in doc.getElementsByTagName('h%d' % i):
+ newhead = doc.createElementNS(XHTML_NAMESPACE, 'h%d' % (i+minhead-first))
+      for child in list(oldhead.childNodes):  # copy the list: appendChild reparents nodes
+        newhead.appendChild(child)
+ oldhead.parentNode.replaceChild(newhead, oldhead)
+
+# return (possibly modified) document
+print doc.toxml('utf-8')
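For instance, a standalone run over a hypothetical input:

    echo '<div><h1>a</h1><h4>b</h4></div>' | python filters/minhead.py --min 3

finds h1 (first=1) above the permissible h3, so every heading is bumped by minhead-first = 2 levels, emitting h3 and h6 in place of h1 and h4.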
diff --git a/filters/xhtml2html.plugin b/filters/xhtml2html.plugin
new file mode 100644
index 0000000..3f13d3c
--- /dev/null
+++ b/filters/xhtml2html.plugin
@@ -0,0 +1,24 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = {}
+for name,value in zip(sys.argv[1::2],sys.argv[2::2]):
+ name = name.lstrip('-')
+ try: opts[name] = eval(value)
+ except: opts[name] = value
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+ sys.stdout.write(text)
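For reference, a sketch of what the option parsing above yields for the example usages in the header (values hypothetical): the string 'True' is eval'ed to the boolean True, while the quoted quote character survives eval as the one-character string "'", so the call amounts to:

    serializer.HTMLSerializer(quote_attr_values=True, quote_char="'")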
diff --git a/filters/xhtml2html.py b/filters/xhtml2html.py
deleted file mode 100644
index 9c2073e..0000000
--- a/filters/xhtml2html.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import sys
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
diff --git a/planet/__init__.py b/planet/__init__.py
index 61ac2a2..f90dfe9 100644
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -1,6 +1,7 @@
xmlns = 'http://planet.intertwingly.net/'
logger = None
+loggerParms = None
import os, sys, re
import config
@@ -11,8 +12,8 @@ from urlparse import urljoin
def getLogger(level, format):
""" get a logger with the specified log level """
- global logger
- if logger: return logger
+ global logger, loggerParms
+ if logger and loggerParms == (level,format): return logger
try:
import logging
@@ -21,16 +22,19 @@ def getLogger(level, format):
import compat_logging as logging
logging.basicConfig(format=format)
- logging.getLogger().setLevel(logging.getLevelName(level))
logger = logging.getLogger("planet.runner")
+ logger.setLevel(logging.getLevelName(level))
try:
logger.warning
except:
logger.warning = logger.warn
+ loggerParms = (level,format)
return logger
+sys.path.insert(1, os.path.join(os.path.dirname(__file__),'vendor'))
+
# Configure feed parser
-from planet import feedparser
+import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0
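A minimal sketch of the memoization the loggerParms change buys (format string hypothetical):

    import planet
    a = planet.getLogger('DEBUG', '%(levelname)s %(message)s')
    b = planet.getLogger('DEBUG', '%(levelname)s %(message)s')  # same params: cached
    c = planet.getLogger('INFO', '%(levelname)s %(message)s')   # params differ: reconfigured
    assert a is b   # c is the same underlying "planet.runner" logger, re-leveled to INFO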
diff --git a/planet/config.py b/planet/config.py
index fb436e8..e1325d1 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -138,8 +138,10 @@ def load(config_file):
parser.read(config_file)
import config, planet
- from planet import opml, foaf
- log = planet.getLogger(config.log_level(),config.log_format())
+ from planet import opml, foaf, csv_config
+ log = planet.logger
+ if not log:
+ log = planet.getLogger(config.log_level(),config.log_format())
# Theme support
theme = config.output_theme()
@@ -191,18 +193,22 @@ def load(config_file):
os.makedirs(config.cache_lists_directory())
def data2config(data, cached_config):
- if content_type(list).find('opml')>=0:
- opml.opml2config(data, cached_config)
- elif content_type(list).find('foaf')>=0:
- foaf.foaf2config(data, cached_config)
- else:
- from planet import shell
- import StringIO
- cached_config.readfp(StringIO.StringIO(shell.run(
- content_type(list), data.getvalue(), mode="filter")))
+ if content_type(list).find('opml')>=0:
+ opml.opml2config(data, cached_config)
+ elif content_type(list).find('foaf')>=0:
+ foaf.foaf2config(data, cached_config)
+ elif content_type(list).find('csv')>=0:
+ csv_config.csv2config(data, cached_config)
+ elif content_type(list).find('config')>=0:
+ cached_config.readfp(data)
+ else:
+ from planet import shell
+ import StringIO
+ cached_config.readfp(StringIO.StringIO(shell.run(
+ content_type(list), data.getvalue(), mode="filter")))
- if cached_config.sections() in [[], [list]]:
- raise Exception
+ if cached_config.sections() in [[], [list]]:
+ raise Exception
for list in reading_lists:
downloadReadingList(list, parser, data2config)
@@ -344,7 +350,9 @@ def reading_lists():
for section in parser.sections():
if parser.has_option(section, 'content_type'):
type = parser.get(section, 'content_type')
- if type.find('opml')>=0 or type.find('foaf')>=0 or type.find('.')>=0:
+ if type.find('opml')>=0 or type.find('foaf')>=0 or \
+ type.find('csv')>=0 or type.find('config')>=0 or \
+ type.find('.')>=0:
result.append(section)
return result
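With csv and config now recognized as reading-list content types, a subscription list can be pulled straight from a CSV file; a hypothetical stanza:

    [http://example.com/feeds.csv]
    content_type = csv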
diff --git a/planet/csv_config.py b/planet/csv_config.py
new file mode 100755
index 0000000..ba3be61
--- /dev/null
+++ b/planet/csv_config.py
@@ -0,0 +1,29 @@
+from ConfigParser import ConfigParser
+import csv
+
+# input = csv, output = ConfigParser
+def csv2config(input, config=None):
+
+ if not hasattr(input, 'read'):
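+        # accept raw strings too; csv.StringIO works because Python 2's
+        # csv module imports StringIO at module level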
+ input = csv.StringIO(input)
+
+ if not config:
+ config = ConfigParser()
+
+ reader = csv.DictReader(input)
+ for row in reader:
+ section = row[reader.fieldnames[0]]
+ config.add_section(section)
+ for name, value in row.items():
+ if value and name != reader.fieldnames[0]:
+ config.set(section, name, value)
+
+ return config
+
+if __name__ == "__main__":
+ # small main program which converts CSV into config.ini format
+ import sys, urllib
+ config = ConfigParser()
+ for input in sys.argv[1:]:
+ csv2config(urllib.urlopen(input), config)
+ config.write(sys.stdout)
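A minimal sketch of csv2config in action (feed URL and headers hypothetical); the first column names the section and the remaining columns become options:

    from planet.csv_config import csv2config
    config = csv2config('url,name\nhttp://example.com/feed.atom,Example\n')
    print config.get('http://example.com/feed.atom', 'name')   # Example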
diff --git a/planet/expunge.py b/planet/expunge.py
index 9f890b9..de7e511 100644
--- a/planet/expunge.py
+++ b/planet/expunge.py
@@ -5,8 +5,7 @@ from spider import filename
def expungeCache():
""" Expunge old entries from a cache of entries """
- import planet
- log = planet.getLogger(config.log_level(),config.log_format())
+ log = planet.logger
log.info("Determining feed subscriptions")
entry_count = {}
diff --git a/planet/foaf.py b/planet/foaf.py
index eb981d1..6149c1f 100644
--- a/planet/foaf.py
+++ b/planet/foaf.py
@@ -35,13 +35,13 @@ def load_model(rdf, base_uri):
return model
# input = foaf, output = ConfigParser
-def foaf2config(rdf, config, subject=None):
+def foaf2config(rdf, config, subject=None, section=None):
if not config or not config.sections():
return
# there should be only be 1 section
- section = config.sections().pop()
+ if not section: section = config.sections().pop()
try:
from RDF import Model, NS, Parser, Statement
@@ -191,6 +191,7 @@ if __name__ == "__main__":
for uri in sys.argv[1:]:
config.add_section(uri)
- foaf2config(urllib.urlopen(uri), config)
+ foaf2config(urllib.urlopen(uri), config, section=uri)
+ config.remove_section(uri)
config.write(sys.stdout)
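The new section keyword lets a caller pick the section the FOAF data lands in instead of relying on the config holding exactly one; a hedged sketch mirroring __main__ above (URL hypothetical, librdf required):

    import urllib
    from ConfigParser import ConfigParser
    from planet.foaf import foaf2config
    uri = 'http://example.com/foaf.rdf'
    config = ConfigParser()
    config.add_section(uri)              # placeholder section
    foaf2config(urllib.urlopen(uri), config, section=uri)
    config.remove_section(uri)           # keep only the discovered feeds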
diff --git a/planet/html5lib/treebuilders/__init__.py b/planet/html5lib/treebuilders/__init__.py
deleted file mode 100755
index 9470145..0000000
--- a/planet/html5lib/treebuilders/__init__.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""A collection of modules for building different kinds of tree from
-HTML documents.
-
-To create a treebuilder for a new type of tree, you need to
-implement several things:
-
-1) A set of classes for various types of elements: Document, Doctype,
-Comment, Element. These must implement the interface of
-_base.treebuilders.Node (although comment nodes have a different
-signature for their constructor, see treebuilders.simpletree.Comment)
-Textual content may also be implemented as another node type, or not, as
-your tree implementation requires.
-
-2) A treebuilder object (called TreeBuilder by convention) that
-inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
-documentClass - the class to use for the bottommost node of a document
-elementClass - the class to use for HTML Elements
-commentClass - the class to use for comments
-doctypeClass - the class to use for doctypes
-It also has one required method:
-getDocument - Returns the root node of the complete document tree
-
-3) If you wish to run the unit tests, you must also create a
-testSerializer method on your treebuilder which accepts a node and
-returns a string containing Node and its children serialized according
-to the format used in the unittests
-
-The supplied simpletree module provides a python-only implementation
-of a full treebuilder and is a useful reference for the semantics of
-the various methods.
-"""
-
-import os.path
-__path__.append(os.path.dirname(__path__[0]))
-
-import dom
-import simpletree
-
-try:
- import etree
-except:
- pass
diff --git a/planet/html5lib/treebuilders/etree.py b/planet/html5lib/treebuilders/etree.py
deleted file mode 100755
index 5af468b..0000000
--- a/planet/html5lib/treebuilders/etree.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import etreefull
-
-class TreeBuilder(etreefull.TreeBuilder):
- def getDocument(self):
- return self.document._element.find("html")
diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py
deleted file mode 100644
index 2629664..0000000
--- a/planet/html5lib/treebuilders/etreefull.py
+++ /dev/null
@@ -1,227 +0,0 @@
-try:
- from xml.etree import ElementTree
-except ImportError:
- try:
- from elementtree import ElementTree
- except:
- pass
-
-import _base
-
-class Element(_base.Node):
- def __init__(self, name):
- self._element = ElementTree.Element(name)
- self.name = name
- self.parent = None
- self._childNodes = []
- self._flags = []
-
- def _setName(self, name):
- self._element.tag = name
-
- def _getName(self):
- return self._element.tag
-
- name = property(_getName, _setName)
-
- def _getAttributes(self):
- return self._element.attrib
-
- def _setAttributes(self, attributes):
- #Delete existing attributes first
- #XXX - there may be a better way to do this...
- for key in self._element.attrib.keys():
- del self._element.attrib[key]
- for key, value in attributes.iteritems():
- self._element.set(key, value)
-
- attributes = property(_getAttributes, _setAttributes)
-
- def _getChildNodes(self):
- return self._childNodes
-
- def _setChildNodes(self, value):
- del self._element[:]
- self._childNodes = []
- for element in value:
- self.insertChild(element)
-
- childNodes = property(_getChildNodes, _setChildNodes)
-
- def hasContent(self):
- """Return true if the node has children or text"""
- return bool(self._element.text or self._element.getchildren())
-
- def appendChild(self, node):
- self._childNodes.append(node)
- self._element.append(node._element)
- node.parent = self
-
- def insertBefore(self, node, refNode):
- index = self._element.getchildren().index(refNode._element)
- self._element.insert(index, node._element)
- node.parent = self
-
- def removeChild(self, node):
- self._element.remove(node._element)
- node.parent=None
-
- def insertText(self, data, insertBefore=None):
- if not(len(self._element)):
- if not self._element.text:
- self._element.text = ""
- self._element.text += data
- elif insertBefore is None:
- #Insert the text as the tail of the last child element
- if not self._element[-1].tail:
- self._element[-1].tail = ""
- self._element[-1].tail += data
- else:
- #Insert the text before the specified node
- children = self._element.getchildren()
- index = children.index(insertBefore._element)
- if index > 0:
- if not self._element[index-1].tail:
- self._element[index-1].tail = ""
- self._element[index-1].tail += data
- else:
- if not self._element.text:
- self._element.text = ""
- self._element.text += data
-
- def cloneNode(self):
- element = Element(self.name)
- element.attributes = self.attributes
- return element
-
- def reparentChildren(self, newParent):
- if newParent.childNodes:
- newParent.childNodes[-1]._element.tail += self._element.text
- else:
- if not newParent._element.text:
- newParent._element.text = ""
- if self._element.text is not None:
- newParent._element.text += self._element.text
- self._element.text = ""
- _base.Node.reparentChildren(self, newParent)
-
-class Comment(Element):
- def __init__(self, data):
- #Use the superclass constructor to set all properties on the
- #wrapper element
- Element.__init__(self, None)
- self._element = ElementTree.Comment(data)
-
- def _getData(self):
- return self._element.text
-
- def _setData(self, value):
- self._element.text = value
-
- data = property(_getData, _setData)
-
-class DocumentType(Element):
- def __init__(self, name):
- Element.__init__(self, DocumentType)
- self._element.text = name
-
-class Document(Element):
- def __init__(self):
- Element.__init__(self, Document)
-
-class DocumentFragment(Element):
- def __init__(self):
- Element.__init__(self, DocumentFragment)
-
-def testSerializer(element):
- rv = []
- finalText = None
- def serializeElement(element, indent=0):
- if element.tag is DocumentType:
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
- elif element.tag is Document:
- rv.append("#document")
- if element.text:
- rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
- if element.tail:
- finalText = element.tail
- elif element.tag is ElementTree.Comment:
-            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
- else:
- rv.append("|%s<%s>"%(' '*indent, element.tag))
- if hasattr(element, "attrib"):
- for name, value in element.attrib.iteritems():
- rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
- if element.text:
- rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
- indent += 2
- for child in element.getchildren():
- serializeElement(child, indent)
- if element.tail:
- rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
- serializeElement(element, 0)
-
- if finalText is not None:
- rv.append("|%s\"%s\""%(' '*2, finalText))
-
- return "\n".join(rv)
-
-def tostring(element):
- """Serialize an element and its child nodes to a string"""
- rv = []
- finalText = None
- def serializeElement(element):
- if element.tag is DocumentType:
-            rv.append("<!DOCTYPE %s>"%(element.text,))
- elif element.tag is Document:
- if element.text:
- rv.append(element.text)
- if element.tail:
- finalText = element.tail
-
- for child in element.getchildren():
- serializeElement(child)
-
- elif element.tag is ElementTree.Comment:
-            rv.append("<!--%s-->"%(element.text,))
- else:
- #This is assumed to be an ordinary element
- if not element.attrib:
- rv.append("<%s>"%(element.tag,))
- else:
- attr = " ".join(["%s=\"%s\""%(name, value)
- for name, value in element.attrib.iteritems()])
- rv.append("<%s %s>"%(element.tag, attr))
- if element.text:
- rv.append(element.text)
-
- for child in element.getchildren():
- serializeElement(child)
-
-        rv.append("</%s>"%(element.tag,))
-
- if element.tail:
- rv.append(element.tail)
-
- serializeElement(element)
-
- if finalText is not None:
- rv.append("%s\""%(' '*2, finalText))
-
- return "".join(rv)
-
-class TreeBuilder(_base.TreeBuilder):
- documentClass = Document
- doctypeClass = DocumentType
- elementClass = Element
- commentClass = Comment
- fragmentClass = DocumentFragment
-
- def testSerializer(self, element):
- return testSerializer(element)
-
- def getDocument(self):
- return self.document._element
-
- def getFragment(self):
- return _base.TreeBuilder.getFragment(self)._element
diff --git a/planet/idindex.py b/planet/idindex.py
index b0eacf9..87daa0f 100644
--- a/planet/idindex.py
+++ b/planet/idindex.py
@@ -1,5 +1,5 @@
from glob import glob
-import os, sys, dbhash
+import os, sys
if __name__ == '__main__':
rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -13,6 +13,7 @@ def open():
cache = config.cache_directory()
index=os.path.join(cache,'index')
if not os.path.exists(index): return None
+ import dbhash
return dbhash.open(filename(index, 'id'),'w')
except Exception, e:
if e.__class__.__name__ == 'DBError': e = e.args[-1]
@@ -34,6 +35,7 @@ def create():
cache = config.cache_directory()
index=os.path.join(cache,'index')
if not os.path.exists(index): os.makedirs(index)
+ import dbhash
index = dbhash.open(filename(index, 'id'),'c')
try:
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 1a914ec..2af0a37 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -16,7 +16,8 @@ Todo:
import re, time, md5, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
-from planet.html5lib import liberalxmlparser, treebuilders
+from html5lib import liberalxmlparser
+from html5lib.treebuilders import dom
import planet, config
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -106,12 +107,12 @@ def date(xentry, name, parsed):
formatted = time.strftime("%Y-%m-%dT%H:%M:%SZ", parsed)
xdate = createTextElement(xentry, name, formatted)
formatted = time.strftime(config.date_format(), parsed)
- xdate.setAttribute('planet:format', formatted)
+ xdate.setAttribute('planet:format', formatted.decode('utf-8'))
def category(xentry, tag):
xtag = xentry.ownerDocument.createElement('category')
- if tag.has_key('term') and tag.term:
- xtag.setAttribute('term', tag.get('term'))
+ if not tag.has_key('term') or not tag.term: return
+ xtag.setAttribute('term', tag.get('term'))
if tag.has_key('scheme') and tag.scheme:
xtag.setAttribute('scheme', tag.get('scheme'))
if tag.has_key('label') and tag.label:
@@ -124,7 +125,11 @@ def author(xentry, name, detail):
xdoc = xentry.ownerDocument
xauthor = xdoc.createElement(name)
- createTextElement(xauthor, 'name', detail.get('name', None))
+ if detail.get('name', None):
+ createTextElement(xauthor, 'name', detail.get('name'))
+ else:
+ xauthor.appendChild(xdoc.createElement('name'))
+
createTextElement(xauthor, 'email', detail.get('email', None))
createTextElement(xauthor, 'uri', detail.get('href', None))
@@ -150,7 +155,7 @@ def content(xentry, name, detail, bozo):
data = minidom.parseString(xdiv % detail.value).documentElement
xcontent.setAttribute('type', 'xhtml')
else:
- parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
+ parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue
@@ -232,7 +237,7 @@ def reconstitute(feed, entry):
links(xentry, entry)
bozo = feed.bozo
- if not entry.has_key('title'):
+ if not entry.has_key('title') or not entry.title:
xentry.appendChild(xdoc.createElement('title'))
content(xentry, 'title', entry.get('title_detail',None), bozo)
diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py
index dd2abd3..49b8557 100644
--- a/planet/shell/__init__.py
+++ b/planet/shell/__init__.py
@@ -6,7 +6,7 @@ logged_modes = []
def run(template_file, doc, mode='template'):
""" select a template module based on file extension and execute it """
- log = planet.getLogger(planet.config.log_level(),planet.config.log_format())
+ log = planet.logger
if mode == 'template':
dirs = planet.config.template_directories()
diff --git a/planet/shell/dj.py b/planet/shell/dj.py
index 2ce316a..05baa62 100644
--- a/planet/shell/dj.py
+++ b/planet/shell/dj.py
@@ -40,7 +40,9 @@ def run(script, doc, output_file=None, options={}):
reluri = os.path.splitext(os.path.basename(output_file))[0]
context['url'] = urlparse.urljoin(config.link(),reluri)
f = open(output_file, 'w')
- f.write(t.render(context))
+ ss = t.render(context)
+ if isinstance(ss,unicode): ss=ss.encode('utf-8')
+ f.write(ss)
f.close()
else:
# @@this is useful for testing purposes, but does it
diff --git a/planet/shell/tmpl.py b/planet/shell/tmpl.py
index 4f4d822..05fb0cf 100644
--- a/planet/shell/tmpl.py
+++ b/planet/shell/tmpl.py
@@ -1,6 +1,10 @@
from xml.sax.saxutils import escape
-import sgmllib, time, os, sys, new, urlparse
-from planet import config, feedparser, htmltmpl
+import sgmllib, time, os, sys, new, urlparse, re
+from planet import config, feedparser
+import htmltmpl
+
+voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
+empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))
class stripHtml(sgmllib.SGMLParser):
"remove all tags from the data"
@@ -130,9 +134,12 @@ def tmpl_mapper(source, rules):
node = source
for path in rule[2:]:
if isinstance(path, str) and path in node:
- if path == 'value' and node.get('type','')=='text/plain':
- node['value'] = escape(node['value'])
- node['type'] = 'text/html'
+ if path == 'value':
+ if node.get('type','')=='text/plain':
+ node['value'] = escape(node['value'])
+ node['type'] = 'text/html'
+ elif node.get('type','')=='application/xhtml+xml':
+ node['value'] = empty.sub(r"<\1 />", node['value'])
node = node[path]
elif isinstance(path, int):
node = node[path]
diff --git a/planet/shell/xslt.py b/planet/shell/xslt.py
index d667598..70e86a5 100644
--- a/planet/shell/xslt.py
+++ b/planet/shell/xslt.py
@@ -52,6 +52,7 @@ def run(script, doc, output_file=None, options={}):
cmdopts = []
for key,value in options.items():
+ if value.find("'")>=0 and value.find('"')>=0: continue
cmdopts += ['--stringparam', key, quote(value, apos=r"\'")]
os.system('xsltproc %s %s %s > %s' %
diff --git a/planet/spider.py b/planet/spider.py
index b18a787..7e72343 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -80,16 +80,23 @@ def writeCache(feed_uri, feed_info, data):
# process based on the HTTP status code
if data.status == 200 and data.has_key("url"):
- data.feed['planet_http_location'] = data.url
- if feed_uri == data.url:
+ feed_info.feed['planet_http_location'] = data.url
+ if data.has_key("entries") and len(data.entries) == 0:
+ log.warning("No data %s", feed_uri)
+ feed_info.feed['planet_message'] = 'no data'
+ elif feed_uri == data.url:
log.info("Updating feed %s", feed_uri)
else:
log.info("Updating feed %s @ %s", feed_uri, data.url)
elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
data.feed['planet_http_location'] = data.url
- elif data.status == 304:
- log.info("Feed %s unchanged", feed_uri)
+ elif data.status == 304 and data.has_key("url"):
+ feed_info.feed['planet_http_location'] = data.url
+ if feed_uri == data.url:
+ log.info("Feed %s unchanged", feed_uri)
+ else:
+ log.info("Feed %s unchanged @ %s", feed_uri, data.url)
if not feed_info.feed.has_key('planet_message'):
if feed_info.feed.has_key('planet_updated'):
@@ -99,7 +106,9 @@ def writeCache(feed_uri, feed_info, data):
else:
if feed_info.feed.planet_message.startswith("no activity in"):
return
- del feed_info.feed['planet_message']
+ if not feed_info.feed.planet_message.startswith("duplicate") and \
+ not feed_info.feed.planet_message.startswith("no data"):
+ del feed_info.feed['planet_message']
elif data.status == 410:
log.info("Feed %s gone", feed_uri)
@@ -154,15 +163,28 @@ def writeCache(feed_uri, feed_info, data):
from planet import idindex
global index
if index != None: index = idindex.open()
-
- # write each entry to the cache
- cache = config.cache_directory()
+
+ # select latest entry for each unique id
+ ids = {}
for entry in data.entries:
# generate an id, if none is present
if not entry.has_key('id') or not entry.id:
entry['id'] = reconstitute.id(None, entry)
if not entry['id']: continue
+ # determine updated date for purposes of selection
+ updated = ''
+ if entry.has_key('published'): updated=entry.published
+ if entry.has_key('updated'): updated=entry.updated
+
+ # if not seen or newer than last seen, select it
+ if updated >= ids.get(entry.id,('',))[0]:
+ ids[entry.id] = (updated, entry)
+
+ # write each entry to the cache
+ cache = config.cache_directory()
+ for updated, entry in ids.values():
+
# compute cache file name based on the id
cache_file = filename(cache, entry.id)
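A sketch of why the selection pass keeps only the newest copy of a duplicated id (dates hypothetical); the >= is a plain string compare, which orders RFC 3339 timestamps chronologically:

    ids = {}
    for updated, eid in [('2007-10-01T00:00:00Z', 'tag:e1'),
                         ('2007-10-03T00:00:00Z', 'tag:e1')]:
        if updated >= ids.get(eid, ('',))[0]:
            ids[eid] = (updated, eid)
    assert ids['tag:e1'][0] == '2007-10-03T00:00:00Z'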
@@ -329,7 +351,7 @@ def httpThread(thread_index, input_queue, output_queue, log):
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
- log = planet.getLogger(config.log_level(),config.log_format())
+ log = planet.logger
global index
index = True
@@ -340,7 +362,7 @@ def spiderPlanet(only_if_new = False):
log.info("Socket timeout set to %d seconds", timeout)
except:
try:
- from planet import timeoutsocket
+ import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(float(timeout))
log.info("Socket timeout set to %d seconds", timeout)
except:
@@ -392,6 +414,7 @@ def spiderPlanet(only_if_new = False):
fetch_queue.put(item=(None, None))
# Process the results as they arrive
+ feeds_seen = {}
while fetch_queue.qsize() or parse_queue.qsize() or threads:
while parse_queue.qsize() == 0 and threads:
time.sleep(0.1)
@@ -415,8 +438,33 @@ def spiderPlanet(only_if_new = False):
else:
data = feedparser.FeedParserDict({'version': None,
'headers': feed.headers, 'entries': [], 'feed': {},
- 'bozo': 0, 'status': int(feed.headers.status)})
+ 'href': feed.url, 'bozo': 0,
+ 'status': int(feed.headers.status)})
+ # duplicate feed?
+ id = data.feed.get('id', None)
+ if not id: id = feed_info.feed.get('id', None)
+
+ href=uri
+ if data.has_key('href'): href=data.href
+
+ duplicate = None
+ if id and id in feeds_seen:
+ duplicate = id
+ elif href and href in feeds_seen:
+ duplicate = href
+
+ if duplicate:
+ feed_info.feed['planet_message'] = \
+ 'duplicate subscription: ' + feeds_seen[duplicate]
+ log.warn('Duplicate subscription: %s and %s' %
+ (uri, feeds_seen[duplicate]))
+ if href: feed_info.feed['planet_http_location'] = href
+
+ if id: feeds_seen[id] = uri
+ if href: feeds_seen[href] = uri
+
+ # complete processing for the feed
writeCache(uri, feed_info, data)
except Exception, e:
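A condensed sketch of the duplicate bookkeeping above (names mirror the patch; id and URIs hypothetical):

    feeds_seen = {}
    feeds_seen['tag:example.com,2007:feed'] = 'http://example.com/a.atom'  # first sub wins
    # a later subscription resolving to the same id or href is flagged:
    duplicate = 'tag:example.com,2007:feed' in feeds_seen   # True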
diff --git a/planet/splice.py b/planet/splice.py
index ccc55e0..f751975 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -9,7 +9,7 @@ from planet import idindex
def splice():
""" Splice together a planet from a cache of entries """
import planet
- log = planet.getLogger(config.log_level(),config.log_format())
+ log = planet.logger
log.info("Loading cached data")
cache = config.cache_directory()
@@ -109,7 +109,7 @@ def splice():
def apply(doc):
output_dir = config.output_dir()
if not os.path.exists(output_dir): os.makedirs(output_dir)
- log = planet.getLogger(config.log_level(),config.log_format())
+ log = planet.logger
planet_filters = config.filters('Planet')
diff --git a/planet/compat_logging/__init__.py b/planet/vendor/compat_logging/__init__.py
similarity index 100%
rename from planet/compat_logging/__init__.py
rename to planet/vendor/compat_logging/__init__.py
diff --git a/planet/compat_logging/config.py b/planet/vendor/compat_logging/config.py
similarity index 100%
rename from planet/compat_logging/config.py
rename to planet/vendor/compat_logging/config.py
diff --git a/planet/compat_logging/handlers.py b/planet/vendor/compat_logging/handlers.py
similarity index 100%
rename from planet/compat_logging/handlers.py
rename to planet/vendor/compat_logging/handlers.py
diff --git a/planet/feedparser.py b/planet/vendor/feedparser.py
similarity index 99%
rename from planet/feedparser.py
rename to planet/vendor/feedparser.py
index 9244646..bbc2ed7 100755
--- a/planet/feedparser.py
+++ b/planet/vendor/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec
"""
-__version__ = "4.2-pre-" + "$Revision: 262 $"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 270 $"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2007, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@@ -466,6 +466,7 @@ class _FeedParserMixin:
self.baseuri = baseuri or ''
self.lang = baselang or None
self.svgOK = 0
+ self.hasTitle = 0
if baselang:
self.feeddata['language'] = baselang.replace('_','-')
@@ -478,6 +479,11 @@ class _FeedParserMixin:
# track xml:base and xml:lang
attrsD = dict(attrs)
baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
+ if type(baseuri) != type(u''):
+ try:
+ baseuri = unicode(baseuri, self.encoding)
+ except:
+ baseuri = unicode(baseuri, 'iso-8859-1')
self.baseuri = _urljoin(self.baseuri, baseuri)
lang = attrsD.get('xml:lang', attrsD.get('lang'))
if lang == '':
@@ -502,6 +508,7 @@ class _FeedParserMixin:
# track inline content
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
# element declared itself as escaped markup, but it isn't really
self.contentparams['type'] = 'application/xhtml+xml'
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
@@ -569,6 +576,7 @@ class _FeedParserMixin:
# track inline content
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
# element declared itself as escaped markup, but it isn't really
+ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
self.contentparams['type'] = 'application/xhtml+xml'
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
tag = tag.split(':')[-1]
@@ -794,6 +802,9 @@ class _FeedParserMixin:
# categories/tags/keywords/whatever are handled in _end_category
if element == 'category':
return output
+
+ if element == 'title' and self.hasTitle:
+ return output
# store output in appropriate place(s)
if self.inentry and not self.insource:
@@ -960,6 +971,7 @@ class _FeedParserMixin:
context = self._getContext()
context.setdefault('image', FeedParserDict())
self.inimage = 1
+ self.hasTitle = 0
self.push('image', 0)
def _end_image(self):
@@ -970,6 +982,7 @@ class _FeedParserMixin:
context = self._getContext()
context.setdefault('textinput', FeedParserDict())
self.intextinput = 1
+ self.hasTitle = 0
self.push('textinput', 0)
_start_textInput = _start_textinput
@@ -1182,6 +1195,7 @@ class _FeedParserMixin:
self.push('item', 0)
self.inentry = 1
self.guidislink = 0
+ self.hasTitle = 0
id = self._getAttribute(attrsD, 'rdf:about')
if id:
context = self._getContext()
@@ -1376,8 +1390,13 @@ class _FeedParserMixin:
value = self.popContent('title')
if not value: return
context = self._getContext()
+ self.hasTitle = 1
_end_dc_title = _end_title
- _end_media_title = _end_title
+
+ def _end_media_title(self):
+ hasTitle = self.hasTitle
+ self._end_title()
+ self.hasTitle = hasTitle
def _start_description(self, attrsD):
context = self._getContext()
@@ -1466,6 +1485,7 @@ class _FeedParserMixin:
def _start_source(self, attrsD):
self.insource = 1
+ self.hasTitle = 0
def _end_source(self):
self.insource = 0
@@ -2287,7 +2307,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
'xml:lang']
- unacceptable_elements_with_end_tag = ['script', 'applet']
+ unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
@@ -2410,7 +2430,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
acceptable_attributes = self.svg_attributes
tag = self.svg_elem_map.get(tag,tag)
keymap = self.svg_attr_map
- else:
+ elif not tag in self.acceptable_elements:
return
# declare xlink namespace, if needed
@@ -3290,11 +3310,15 @@ def _stripDoctype(data):
rss_version may be 'rss091n' or None
stripped_data is the same XML document, minus the DOCTYPE
'''
-    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
- entity_results=entity_pattern.findall(data)
- data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
- doctype_results = doctype_pattern.findall(data)
+ start = re.search('<\w',data)
+ start = start and start.start() or -1
+ head,data = data[:start+1], data[start+1:]
+
+    entity_pattern = re.compile(r'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
+ entity_results=entity_pattern.findall(head)
+ head = entity_pattern.sub('', head)
+    doctype_pattern = re.compile(r'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE)
+ doctype_results = doctype_pattern.findall(head)
doctype = doctype_results and doctype_results[0] or ''
if doctype.lower().count('netscape'):
version = 'rss091n'
@@ -3308,7 +3332,7 @@ def _stripDoctype(data):
safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
if safe_entities:
            replacement='<!DOCTYPE feed [\n<!ENTITY %s>\n]>' % '>\n<!ENTITY '.join(safe_entities)
diff --git a/planet/vendor/html5lib/filters/inject_meta_charset.py b/planet/vendor/html5lib/filters/inject_meta_charset.py
new file mode 100644
+import _base
+
+class Filter(_base.Filter):
+    def __init__(self, source, encoding):
+        _base.Filter.__init__(self, source)
+        self.encoding = encoding
+
+    def __iter__(self):
+        state = "pre_head"
+        meta_found = (self.encoding is None)
+        pending = []
+
+        for token in _base.Filter.__iter__(self):
+            type = token["type"]
+            if type == "StartTag":
+                if token["name"].lower() == "head":
+                    state = "in_head"
+
+            elif type == "EmptyTag":
+                if token["name"].lower() == "meta":
+                    # replace charset with actual encoding
+                    has_http_equiv_content_type = False
+                    content_index = -1
+                    for i, (name, value) in enumerate(token["data"]):
+                        if name.lower() == 'charset':
+                            token["data"][i] = (u'charset', self.encoding)
+                            meta_found = True
+                            break
+                        elif name == 'http-equiv' and value.lower() == 'content-type':
+                            has_http_equiv_content_type = True
+                        elif name.lower() == 'content':
+                            content_index = i
+                    else:
+                        if has_http_equiv_content_type and content_index >= 0:
+ token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
+ meta_found = True
+
+                elif token["name"].lower() == "head" and not meta_found:
+ # insert meta into empty head
+ yield {"type": "StartTag", "name": "head",
+ "data": token["data"]}
+ yield {"type": "EmptyTag", "name": "meta",
+ "data": [["charset", self.encoding]]}
+ yield {"type": "EndTag", "name": "head"}
+ meta_found = True
+ continue
+
+ elif type == "EndTag":
+ if token["name"].lower() == "head" and pending:
+ # insert meta into head (if necessary) and flush pending queue
+ yield pending.pop(0)
+ if not meta_found:
+ yield {"type": "EmptyTag", "name": "meta",
+ "data": [["charset", self.encoding]]}
+ while pending:
+ yield pending.pop(0)
+ meta_found = True
+ state = "post_head"
+
+ if state == "in_head":
+ pending.append(token)
+ else:
+ yield token
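A hedged usage sketch, following the parser/treewalker pattern of xhtml2html.plugin earlier in this patch and the Filter signature shown above (input markup hypothetical):

    from html5lib import html5parser, treebuilders, treewalkers
    from html5lib.filters import inject_meta_charset
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse('<html><head></head><body>hi</body></html>')
    tokens = treewalkers.getTreeWalker('dom')(doc)
    for token in inject_meta_charset.Filter(tokens, 'utf-8'):
        pass   # feed into a serializer; <head> now carries a charset meta tag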
diff --git a/planet/vendor/html5lib/filters/lint.py b/planet/vendor/html5lib/filters/lint.py
new file mode 100644
index 0000000..ea5c619
--- /dev/null
+++ b/planet/vendor/html5lib/filters/lint.py
@@ -0,0 +1,88 @@
+from gettext import gettext
+_ = gettext
+
+import _base
+from html5lib.constants import cdataElements, rcdataElements, voidElements
+
+from html5lib.constants import spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+class LintError(Exception): pass
+
+class Filter(_base.Filter):
+ def __iter__(self):
+ open_elements = []
+ contentModelFlag = "PCDATA"
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type in ("StartTag", "EmptyTag"):
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_(u"Empty tag name"))
+ if type == "StartTag" and name in voidElements:
+ raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+ elif type == "EmptyTag" and name not in voidElements:
+ raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+ if type == "StartTag":
+ open_elements.append(name)
+                # use distinct loop names so `name` (the tag) survives for the checks below
+                for attr_name, attr_value in token["data"]:
+                    if not isinstance(attr_name, unicode):
+                        raise LintError(_("Attribute name is not a string: %r") % attr_name)
+                    if not attr_name:
+                        raise LintError(_(u"Empty attribute name"))
+                    if not isinstance(attr_value, unicode):
+                        raise LintError(_("Attribute value is not a string: %r") % attr_value)
+ if name in cdataElements:
+ contentModelFlag = "CDATA"
+ elif name in rcdataElements:
+ contentModelFlag = "RCDATA"
+ elif name == "plaintext":
+ contentModelFlag = "PLAINTEXT"
+
+ elif type == "EndTag":
+ name = token["name"]
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_(u"Empty tag name"))
+ if name in voidElements:
+ raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+ start_name = open_elements.pop()
+ if start_name != name:
+ raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+ contentModelFlag = "PCDATA"
+
+ elif type == "Comment":
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Comment not in PCDATA content model flag"))
+
+ elif type in ("Characters", "SpaceCharacters"):
+ data = token["data"]
+ if not isinstance(data, unicode):
+                    raise LintError(_("Token data is not a string: %r") % data)
+ if not data:
+ raise LintError(_(u"%s token with empty data") % type)
+ if type == "SpaceCharacters":
+ data = data.strip(spaceCharacters)
+ if data:
+                    raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: %s") % data)
+
+ elif type == "Doctype":
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, unicode):
+ raise LintError(_(u"Tag name is not a string: %r") % name)
+ # XXX: what to do with token["data"] ?
+
+ elif type in ("ParseError", "SerializeError"):
+ pass
+
+ else:
+ raise LintError(_(u"Unknown token type: %s") % type)
+
+ yield token
diff --git a/planet/vendor/html5lib/filters/optionaltags.py b/planet/vendor/html5lib/filters/optionaltags.py
new file mode 100644
index 0000000..73da96c
--- /dev/null
+++ b/planet/vendor/html5lib/filters/optionaltags.py
@@ -0,0 +1,175 @@
+import _base
+
+class Filter(_base.Filter):
+ def slider(self):
+ previous1 = previous2 = None
+ for token in self.source:
+ if previous1 is not None:
+ yield previous2, previous1, token
+ previous2 = previous1
+ previous1 = token
+ yield previous2, previous1, None
+
+ def __iter__(self):
+ for previous, token, next in self.slider():
+ type = token["type"]
+ if type == "StartTag":
+ if token["data"] or not self.is_optional_start(token["name"], previous, next):
+ yield token
+ elif type == "EndTag":
+ if not self.is_optional_end(token["name"], next):
+ yield token
+ else:
+ yield token
+
+ def is_optional_start(self, tagname, previous, next):
+ type = next and next["type"] or None
+        if tagname == 'html':
+ # An html element's start tag may be omitted if the first thing
+ # inside the html element is not a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname == 'head':
+ # A head element's start tag may be omitted if the first thing
+ # inside the head element is an element.
+ return type == "StartTag"
+ elif tagname == 'body':
+ # A body element's start tag may be omitted if the first thing
+ # inside the body element is not a space character or a comment,
+ # except if the first thing inside the body element is a script
+ # or style element and the node immediately preceding the body
+ # element is a head element whose end tag has been omitted.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we do not look at the preceding event, so we never omit
+ # the body element's start tag if it's followed by a script or
+ # a style element.
+ return next["name"] not in ('script', 'style')
+ else:
+ return True
+ elif tagname == 'colgroup':
+ # A colgroup element's start tag may be omitted if the first thing
+ # inside the colgroup element is a col element, and if the element
+        # is not immediately preceded by another colgroup element whose
+ # end tag has been omitted.
+ if type == "StartTag":
+ # XXX: we do not look at the preceding event, so instead we never
+ # omit the colgroup element's end tag when it is immediately
+ # followed by another colgroup element. See is_optional_end.
+ return next["name"] == "col"
+ else:
+ return False
+ elif tagname == 'tbody':
+ # A tbody element's start tag may be omitted if the first thing
+ # inside the tbody element is a tr element, and if the element is
+        # not immediately preceded by a tbody, thead, or tfoot element
+ # whose end tag has been omitted.
+ if type == "StartTag":
+ # omit the thead and tfoot elements' end tag when they are
+ # immediately followed by a tbody element. See is_optional_end.
+ if previous and previous['type'] == 'EndTag' and \
+ previous['name'] in ('tbody','thead','tfoot'):
+ return False
+ return next["name"] == 'tr'
+ else:
+ return False
+ return False
+
+ def is_optional_end(self, tagname, next):
+ type = next and next["type"] or None
+ if tagname in ('html', 'head', 'body'):
+ # An html element's end tag may be omitted if the html element
+ # is not immediately followed by a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname in ('li', 'optgroup', 'option', 'tr'):
+ # A li element's end tag may be omitted if the li element is
+ # immediately followed by another li element or if there is
+ # no more content in the parent element.
+ # An optgroup element's end tag may be omitted if the optgroup
+ # element is immediately followed by another optgroup element,
+ # or if there is no more content in the parent element.
+ # An option element's end tag may be omitted if the option
+ # element is immediately followed by another option element,
+ # or if there is no more content in the parent element.
+ # A tr element's end tag may be omitted if the tr element is
+ # immediately followed by another tr element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] == tagname
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('dt', 'dd'):
+ # A dt element's end tag may be omitted if the dt element is
+ # immediately followed by another dt element or a dd element.
+ # A dd element's end tag may be omitted if the dd element is
+ # immediately followed by another dd element or a dt element,
+ # or if there is no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('dt', 'dd')
+ elif tagname == 'dd':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'p':
+ # A p element's end tag may be omitted if the p element is
+ # immediately followed by an address, blockquote, dl, fieldset,
+ # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+ # or ul element, or if there is no more content in the parent
+ # element.
+ if type == "StartTag":
+ return next["name"] in ('address', 'blockquote', \
+ 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
+ 'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'colgroup':
+ # A colgroup element's end tag may be omitted if the colgroup
+ # element is not immediately followed by a space character or
+ # a comment.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we also look for an immediately following colgroup
+ # element. See is_optional_start.
+ return next["name"] != 'colgroup'
+ else:
+ return True
+ elif tagname in ('thead', 'tbody'):
+ # A thead element's end tag may be omitted if the thead element
+ # is immediately followed by a tbody or tfoot element.
+ # A tbody element's end tag may be omitted if the tbody element
+ # is immediately followed by a tbody or tfoot element, or if
+ # there is no more content in the parent element.
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] in ['tbody', 'tfoot']
+ elif tagname == 'tbody':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'tfoot':
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] == 'tbody'
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('td', 'th'):
+ # A td element's end tag may be omitted if the td element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ # A th element's end tag may be omitted if the th element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('td', 'th')
+ else:
+ return type == "EndTag" or type is None
+ return False
diff --git a/planet/vendor/html5lib/filters/whitespace.py b/planet/vendor/html5lib/filters/whitespace.py
new file mode 100644
index 0000000..74d6f4d
--- /dev/null
+++ b/planet/vendor/html5lib/filters/whitespace.py
@@ -0,0 +1,41 @@
+try:
+ frozenset
+except NameError:
+ # Import from the sets module for python 2.3
+ from sets import ImmutableSet as frozenset
+
+import re
+
+import _base
+from html5lib.constants import rcdataElements, spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
+
+class Filter(_base.Filter):
+
+ spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+ def __iter__(self):
+ preserve = 0
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag" \
+ and (preserve or token["name"] in self.spacePreserveElements):
+ preserve += 1
+
+ elif type == "EndTag" and preserve:
+ preserve -= 1
+
+ elif not preserve and type == "SpaceCharacters" and token["data"]:
+ # Test on token["data"] above to not introduce spaces where there were not
+ token["data"] = u" "
+
+ elif not preserve and type == "Characters":
+ token["data"] = collapse_spaces(token["data"])
+
+ yield token
+
+def collapse_spaces(text):
+ return SPACES_REGEX.sub(' ', text)
+
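A one-line check of the collapsing behavior:

    from html5lib.filters.whitespace import collapse_spaces
    assert collapse_spaces(u"a \t\n  b") == u"a b"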
diff --git a/planet/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
similarity index 79%
rename from planet/html5lib/html5parser.py
rename to planet/vendor/html5lib/html5parser.py
index 898ec9f..1c0fd3e 100644
--- a/planet/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -1,16 +1,8 @@
-
-# Differences from the current specification (23 December 2006) are as follows:
+# Differences from the current specification are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
#   always exist.
-# * We also deal with content when there's no DOCTYPE.
-# It is expected that the specification will catch up with us in due course ;-)
-#
-# It should be trivial to add the following cases. However, we should probably
-# also look into comment handling and such then...
-# * A <p> element end tag creates an empty <p> element when there's no <p>
-#   element in scope.
-# * A <br> element end tag creates an empty <br> element.
+
try:
frozenset
@@ -20,6 +12,7 @@ except NameError:
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
+import sys
import tokenizer
@@ -30,32 +23,39 @@ from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
-from constants import headingElements, tableInsertModeElements, voidElements
+from constants import headingElements, tableInsertModeElements
+from constants import cdataElements, rcdataElements, voidElements
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
- def __init__(self, strict = False, tree=simpletree.TreeBuilder):
+ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
+ tokenizer=tokenizer.HTMLTokenizer):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
- returned. This class is almost always a subclass of
- html5lib.treebuilders._base.TreeBuilder
+ returned. Built in treebuilders can be accessed through
+ html5lib.treebuilders.getTreeBuilder(treeType)
"""
# Raise an exception on the first error encountered
self.strict = strict
self.tree = tree()
+ self.tokenizer_class = tokenizer
self.errors = []
+ # "quirks" / "almost-standards" / "standards"
+ self.quirksMode = "standards"
+
self.phases = {
"initial": InitialPhase(self, self.tree),
"rootElement": RootElementPhase(self, self.tree),
"beforeHead": BeforeHeadPhase(self, self.tree),
"inHead": InHeadPhase(self, self.tree),
+ # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
"afterHead": AfterHeadPhase(self, self.tree),
"inBody": InBodyPhase(self, self.tree),
"inTable": InTablePhase(self, self.tree),
@@ -72,21 +72,21 @@ class HTMLParser(object):
}
def _parse(self, stream, innerHTML=False, container="div",
- encoding=None):
+ encoding=None, **kwargs):
self.tree.reset()
self.firstStartTag = False
self.errors = []
- self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
- parseMeta=innerHTML)
+ self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+ parseMeta=not innerHTML, **kwargs)
if innerHTML:
self.innerHTML = container.lower()
- if self.innerHTML in ('title', 'textarea'):
+ if self.innerHTML in cdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
- elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
+ elif self.innerHTML in rcdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
@@ -113,10 +113,12 @@ class HTMLParser(object):
method = getattr(self.phase, "process%s" % type, None)
if type in ("Characters", "SpaceCharacters", "Comment"):
method(token["data"])
- elif type in ("StartTag", "Doctype"):
+ elif type == "StartTag":
method(token["name"], token["data"])
elif type == "EndTag":
method(token["name"])
+ elif type == "Doctype":
+ method(token["name"], token["publicId"], token["systemId"], token["correct"])
else:
self.parseError(token["data"])
@@ -158,10 +160,6 @@ class HTMLParser(object):
if self.strict:
raise ParseError
- def atheistParseError(self):
- """This error is not an error"""
- pass
-
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@@ -171,34 +169,17 @@ class HTMLParser(object):
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
- if token["name"] in voidElements:
- self.atheistParseError()
- else:
- self.parseError(_("Solidus (/) incorrectly placed in tag."))
+ if token["name"] not in voidElements:
+ self.parseError(_(u"Solidus (/) incorrectly placed in tag."))
token["type"] = "StartTag"
if token["type"] == "StartTag":
- token["name"] = token["name"].translate(asciiUpper2Lower)
-
- # We need to remove the duplicate attributes and convert attributes
- # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
-
- # AT When Python 2.4 is widespread we should use
- # dict(reversed(token.data))
- if token["data"]:
- token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
- for attr,value in token["data"][::-1]])
- else:
- token["data"] = {}
-
- elif token["type"] == "EndTag":
- if token["data"]:
- self.parseError(_("End tag contains unexpected attributes."))
- token["name"] = token["name"].lower()
+ token["data"] = dict(token["data"][::-1])
return token
+
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
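
The dict(token["data"][::-1]) one-liner earlier in this hunk keeps the old
duplicate-attribute rule: reversing the pair list before building the dict
lets earlier pairs overwrite later duplicates, so the first occurrence of an
attribute wins. The attribute and tag-name lowercasing removed here has
presumably moved into the tokenizer (an inference from this hunk alone).
For example:

    >>> dict([("x", "y"), ("x", "z")][::-1])
    {'x': 'y'}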
@@ -265,17 +246,17 @@ class Phase(object):
def processEOF(self):
self.tree.generateImpliedEndTags()
if len(self.tree.openElements) > 2:
- self.parser.parseError(_("Unexpected end of file. "
+ self.parser.parseError(_(u"Unexpected end of file. "
u"Missing closing tags."))
elif len(self.tree.openElements) == 2 and\
self.tree.openElements[1].name != "body":
# This happens for framesets or something?
- self.parser.parseError(_("Unexpected end of file. Expected end "
- u"tag (" + self.tree.openElements[1].name + u") first."))
+ self.parser.parseError(_(u"Unexpected end of file. Expected end "
+ u"tag (%s) first.") % (self.tree.openElements[1].name,))
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
# XXX This is not what the specification says. Not sure what to do
# here.
- self.parser.parseError(_("XXX innerHTML EOF"))
+ self.parser.parseError(_(u"XXX innerHTML EOF"))
# Betting ends.
def processComment(self, data):
@@ -283,8 +264,8 @@ class Phase(object):
# overridden.
self.tree.insertComment(data, self.tree.openElements[-1])
- def processDoctype(self, name, error):
- self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
+ def processDoctype(self, name, publicId, systemId, correct):
+ self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data):
self.tree.insertText(data)
@@ -294,7 +275,7 @@ class Phase(object):
def startTagHtml(self, name, attributes):
if self.parser.firstStartTag == False and name == "html":
- self.parser.parseError(_("html needs to be the first start tag."))
+ self.parser.parseError(_(u"html needs to be the first start tag."))
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in attributes.iteritems():
@@ -319,14 +300,105 @@ class InitialPhase(Phase):
def processComment(self, data):
self.tree.insertComment(data, self.tree.document)
- def processDoctype(self, name, error):
- if error:
- self.parser.parseError(_("Erroneous DOCTYPE."))
- self.tree.insertDoctype(name)
+ def processDoctype(self, name, publicId, systemId, correct):
+ nameLower = name.translate(asciiUpper2Lower)
+ if nameLower != "html" or publicId != None or\
+ systemId != None:
+ self.parser.parseError(_(u"Erroneous DOCTYPE."))
+ # XXX need to update DOCTYPE tokens
+ self.tree.insertDoctype(name, publicId, systemId)
+
+ if publicId == None:
+ publicId = ""
+ if publicId != "":
+ publicId = publicId.translate(asciiUpper2Lower)
+
+ if nameLower != "html":
+ # XXX quirks mode
+ pass
+ else:
+ if publicId in\
+ ("+//silmaril//dtd html pro v0r11 19970101//en",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
+ "-//as//dtd html 3.0 aswedit + extensions//en",
+ "-//ietf//dtd html 2.0 level 1//en",
+ "-//ietf//dtd html 2.0 level 2//en",
+ "-//ietf//dtd html 2.0 strict level 1//en",
+ "-//ietf//dtd html 2.0 strict level 2//en",
+ "-//ietf//dtd html 2.0 strict//en",
+ "-//ietf//dtd html 2.0//en",
+ "-//ietf//dtd html 2.1e//en",
+ "-//ietf//dtd html 3.0//en",
+ "-//ietf//dtd html 3.0//en//",
+ "-//ietf//dtd html 3.2 final//en",
+ "-//ietf//dtd html 3.2//en",
+ "-//ietf//dtd html 3//en",
+ "-//ietf//dtd html level 0//en",
+ "-//ietf//dtd html level 0//en//2.0",
+ "-//ietf//dtd html level 1//en",
+ "-//ietf//dtd html level 1//en//2.0",
+ "-//ietf//dtd html level 2//en",
+ "-//ietf//dtd html level 2//en//2.0",
+ "-//ietf//dtd html level 3//en",
+ "-//ietf//dtd html level 3//en//3.0",
+ "-//ietf//dtd html strict level 0//en",
+ "-//ietf//dtd html strict level 0//en//2.0",
+ "-//ietf//dtd html strict level 1//en",
+ "-//ietf//dtd html strict level 1//en//2.0",
+ "-//ietf//dtd html strict level 2//en",
+ "-//ietf//dtd html strict level 2//en//2.0",
+ "-//ietf//dtd html strict level 3//en",
+ "-//ietf//dtd html strict level 3//en//3.0",
+ "-//ietf//dtd html strict//en",
+ "-//ietf//dtd html strict//en//2.0",
+ "-//ietf//dtd html strict//en//3.0",
+ "-//ietf//dtd html//en",
+ "-//ietf//dtd html//en//2.0",
+ "-//ietf//dtd html//en//3.0",
+ "-//metrius//dtd metrius presentational//en",
+ "-//microsoft//dtd internet explorer 2.0 html strict//en",
+ "-//microsoft//dtd internet explorer 2.0 html//en",
+ "-//microsoft//dtd internet explorer 2.0 tables//en",
+ "-//microsoft//dtd internet explorer 3.0 html strict//en",
+ "-//microsoft//dtd internet explorer 3.0 html//en",
+ "-//microsoft//dtd internet explorer 3.0 tables//en",
+ "-//netscape comm. corp.//dtd html//en",
+ "-//netscape comm. corp.//dtd strict html//en",
+ "-//o'reilly and associates//dtd html 2.0//en",
+ "-//o'reilly and associates//dtd html extended 1.0//en",
+ "-//spyglass//dtd html 2.0 extended//en",
+ "-//sq//dtd html 2.0 hotmetal + extensions//en",
+ "-//sun microsystems corp.//dtd hotjava html//en",
+ "-//sun microsystems corp.//dtd hotjava strict html//en",
+ "-//w3c//dtd html 3 1995-03-24//en",
+ "-//w3c//dtd html 3.2 draft//en",
+ "-//w3c//dtd html 3.2 final//en",
+ "-//w3c//dtd html 3.2//en",
+ "-//w3c//dtd html 3.2s draft//en",
+ "-//w3c//dtd html 4.0 frameset//en",
+ "-//w3c//dtd html 4.0 transitional//en",
+ "-//w3c//dtd html experimental 19960712//en",
+ "-//w3c//dtd html experimental 970421//en",
+ "-//w3c//dtd w3 html//en",
+ "-//w3o//dtd w3 html 3.0//en",
+ "-//w3o//dtd w3 html 3.0//en//",
+ "-//w3o//dtd w3 html strict 3.0//en//",
+ "-//webtechs//dtd mozilla html 2.0//en",
+ "-//webtechs//dtd mozilla html//en",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")\
+ or (publicId in\
+ ("-//w3c//dtd html 4.01 frameset//EN",
+ "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
+ or (systemId != None and\
+ systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ #XXX quirks mode
+ pass
+
self.parser.phase = self.parser.phases["rootElement"]
def processSpaceCharacters(self, data):
- self.tree.insertText(data, self.tree.document)
+ pass
def processCharacters(self, data):
self.parser.parseError(_(u"Unexpected non-space characters. "
@@ -335,14 +407,12 @@ class InitialPhase(Phase):
self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes):
- self.parser.parseError(_(u"Unexpected start tag (" + name +\
- u"). Expected DOCTYPE."))
+ self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processStartTag(name, attributes)
def processEndTag(self, name):
- self.parser.parseError(_(u"Unexpected end tag (" + name +\
- "). Expected DOCTYPE."))
+ self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processEndTag(name)
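
Every quirks branch in InitialPhase.processDoctype above currently ends in
pass, so for now only the parse error and the stored doctype take effect and
self.quirksMode stays at its "standards" default. A sketch of how the
placeholders might eventually be wired up; setQuirks and QUIRKY_PUBLIC_IDS
are hypothetical names, not part of this patch:

    # abridged; the full lowercase list is the one matched above
    QUIRKY_PUBLIC_IDS = frozenset((
        "-//ietf//dtd html 2.0//en",
        "-//w3c//dtd html 3.2//en",
        "html",
    ))

    def setQuirks(parser, nameLower, publicId, systemId):
        # hypothetical replacement for the "XXX quirks mode" placeholders
        if (nameLower != "html" or publicId in QUIRKY_PUBLIC_IDS or
            systemId ==
                "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
            parser.quirksMode = "quirks"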
@@ -364,7 +434,7 @@ class RootElementPhase(Phase):
self.tree.insertComment(data, self.tree.document)
def processSpaceCharacters(self, data):
- self.tree.insertText(data, self.tree.document)
+ pass
def processCharacters(self, data):
self.insertHtmlElement()
@@ -392,7 +462,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
- ("html", self.endTagHtml)
+ (("html", "head", "body", "br", "p"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
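
The phases above all route tags through utils.MethodDispatcher, where a
tuple key maps several tag names to one handler and .default catches the
rest. A minimal sketch of the idea (the real class lives in html5lib's utils
module; this is a simplification, not its exact code):

    class MethodDispatcher(dict):
        def __init__(self, items):
            dict.__init__(self)
            for name, handler in items:
                # a tuple key registers the same handler for every name in it
                if isinstance(name, tuple):
                    for n in name:
                        self[n] = handler
                else:
                    self[name] = handler
            self.default = None

        def __getitem__(self, key):
            # fall back to .default for unregistered tag names
            return dict.get(self, key, self.default)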
@@ -413,13 +483,12 @@ class BeforeHeadPhase(Phase):
self.startTagHead("head", {})
self.parser.phase.processStartTag(name, attributes)
- def endTagHtml(self, name):
+ def endTagImplyHead(self, name):
self.startTagHead("head", {})
self.parser.phase.processEndTag(name)
def endTagOther(self, name):
- self.parser.parseError(_("Unexpected end tag (" + name +\
- ") after the (implied) root element."))
+ self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,))
class InHeadPhase(Phase):
def __init__(self, parser, tree):
@@ -429,6 +498,7 @@ class InHeadPhase(Phase):
("html", self.startTagHtml),
("title", self.startTagTitle),
("style", self.startTagStyle),
+ ("noscript", self.startTagNoScript),
("script", self.startTagScript),
(("base", "link", "meta"), self.startTagBaseLinkMeta),
("head", self.startTagHead)
@@ -437,8 +507,9 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead),
- ("html", self.endTagHtml),
- (("title", "style", "script"), self.endTagTitleStyleScript)
+ (("html", "body", "br", "p"), self.endTagImplyAfterHead),
+ (("title", "style", "script", "noscript"),
+ self.endTagTitleStyleScriptNoScript)
])
self.endTagHandler.default = self.endTagOther
@@ -454,13 +525,14 @@ class InHeadPhase(Phase):
def processEOF(self):
if self.tree.openElements[-1].name in ("title", "style", "script"):
self.parser.parseError(_(u"Unexpected end of file. "
- u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
+ u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
self.tree.openElements.pop()
self.anythingElse()
self.parser.phase.processEOF()
def processCharacters(self, data):
- if self.tree.openElements[-1].name in ("title", "style", "script"):
+ if self.tree.openElements[-1].name in\
+ ("title", "style", "script", "noscript"):
self.tree.insertText(data)
else:
self.anythingElse()
@@ -485,6 +557,17 @@ class InHeadPhase(Phase):
self.tree.openElements.append(element)
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+ def startTagNoScript(self, name, attributes):
+ # XXX Need to decide whether to implement the scripting disabled case.
+ element = self.tree.createElement(name, attributes)
+ if self.tree.headPointer is not None and\
+ self.parser.phase == self.parser.phases["inHead"]:
+ self.appendToHead(element)
+ else:
+ self.tree.openElements[-1].appendChild(element)
+ self.tree.openElements.append(element)
+ self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+
def startTagScript(self, name, attributes):
#XXX Inner HTML case may be wrong
element = self.tree.createElement(name, attributes)
@@ -499,7 +582,11 @@ class InHeadPhase(Phase):
def startTagBaseLinkMeta(self, name, attributes):
element = self.tree.createElement(name, attributes)
- self.appendToHead(element)
+ if (self.tree.headPointer is not None and
+ self.parser.phase == self.parser.phases["inHead"]):
+ self.appendToHead(element)
+ else:
+ self.tree.openElements[-1].appendChild(element)
def startTagOther(self, name, attributes):
self.anythingElse()
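
startTagNoScript and startTagBaseLinkMeta now repeat the same guard: append
to the head element only when the head pointer exists and the parser really
is in the inHead phase, and otherwise fall back to the current open element.
A hypothetical helper restating the shared pattern (appendToHeadOrCurrent is
not in the patch):

    def appendToHeadOrCurrent(self, element):
        if (self.tree.headPointer is not None and
                self.parser.phase == self.parser.phases["inHead"]):
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)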
@@ -509,23 +596,21 @@ class InHeadPhase(Phase):
if self.tree.openElements[-1].name == "head":
self.tree.openElements.pop()
else:
- self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
+ self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head')
self.parser.phase = self.parser.phases["afterHead"]
- def endTagHtml(self, name):
+ def endTagImplyAfterHead(self, name):
self.anythingElse()
self.parser.phase.processEndTag(name)
- def endTagTitleStyleScript(self, name):
+ def endTagTitleStyleScriptNoScript(self, name):
if self.tree.openElements[-1].name == name:
self.tree.openElements.pop()
else:
- self.parser.parseError(_(u"Unexpected end tag (" + name +\
- "). Ignored."))
+ self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
def endTagOther(self, name):
- self.parser.parseError(_(u"Unexpected end tag (" + name +\
- "). Ignored."))
+ self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
def anythingElse(self):
if self.tree.openElements[-1].name == "head":
@@ -533,6 +618,11 @@ class InHeadPhase(Phase):
else:
self.parser.phase = self.parser.phases["afterHead"]
+# XXX If we implement a parser for which scripting is disabled we need to
+# implement this phase.
+#
+# class InHeadNoScriptPhase(Phase):
+
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
@@ -563,8 +653,7 @@ class AfterHeadPhase(Phase):
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, name, attributes):
- self.parser.parseError(_(u"Unexpected start tag (" + name +\
- ") that can be in head. Moved."))
+ self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,))
self.parser.phase = self.parser.phases["inHead"]
self.parser.phase.processStartTag(name, attributes)
@@ -592,9 +681,9 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
- (("script", "style"), self.startTagScriptStyle),
- (("base", "link", "meta", "title"),
- self.startTagFromHead),
+ (("base", "link", "meta", "script", "style"),
+ self.startTagProcessInHead),
+ ("title", self.startTagTitle),
("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl",
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@@ -604,8 +693,9 @@ class InBodyPhase(Phase):
("plaintext",self.startTagPlaintext),
(headingElements, self.startTagHeading),
("a", self.startTagA),
- (("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
- "strong", "tt", "u"),self.startTagFormatting),
+ (("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
+ "tt", "u"),self.startTagFormatting),
+ ("nobr", self.startTagNobr),
("button", self.startTagButton),
(("marquee", "object"), self.startTagMarqueeObject),
("xmp", self.startTagXmp),
@@ -642,7 +732,8 @@ class InBodyPhase(Phase):
(("head", "frameset", "select", "optgroup", "option", "table",
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"td", "th"), self.endTagMisplaced),
- (("area", "basefont", "bgsound", "br", "embed", "hr", "image",
+ ("br", self.endTagBr),
+ (("area", "basefont", "bgsound", "embed", "hr", "image",
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
@@ -659,13 +750,16 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
- def processSpaceCharactersPre(self, data):
- #Sometimes (start of <pre> blocks) we want to drop leading newlines
+ def processSpaceCharactersDropNewline(self, data):
+ # Sometimes (start of <pre> and <textarea> blocks) we want to drop