From 9aba1dbfc7ad273cc57a1f56143ab31fa2f48a87 Mon Sep 17 00:00:00 2001
From: Sam Ruby tags are replaced with
@@ -23,6 +24,7 @@ args = dict(zip([name.lstrip('-') for name in sys.argv[1::2]], sys.argv[2::2]))
wrapper = textwrap.TextWrapper(width=int(args.get('width','500')))
omit = args.get('omit', '').split()
+target = args.get('target', 'planet:excerpt')
class copy:
""" recursively copy a source to a target, up to a given width """
@@ -94,10 +96,14 @@ if not source:
# if present, recursively copy it to a planet:excerpt element
if source:
- dom.documentElement.setAttribute('xmlns:planet', planetNS)
- target = dom.createElementNS(planetNS, 'planet:excerpt')
- source[0].parentNode.appendChild(target)
- copy(dom, source[0], target)
+ if target.startswith('planet:'):
+ dom.documentElement.setAttribute('xmlns:planet', planetNS)
+ if target.startswith('atom:'): target = target.split(':',1)[1]
+ excerpt = dom.createElementNS(planetNS, target)
+ source[0].parentNode.appendChild(excerpt)
+ copy(dom, source[0], excerpt)
+ if source[0].nodeName == excerpt.nodeName:
+ source[0].parentNode.removeChild(source[0])
# print out results
print dom.toxml('utf-8')
diff --git a/tests/data/filter/excerpt-lorem-ipsum.ini b/tests/data/filter/excerpt-lorem-ipsum.ini
index 85bbac8..610b764 100644
--- a/tests/data/filter/excerpt-lorem-ipsum.ini
+++ b/tests/data/filter/excerpt-lorem-ipsum.ini
@@ -4,3 +4,4 @@ filters = excerpt.py
[excerpt.py]
width = 100
omit = p
+target = planet:excerpt
diff --git a/tests/test_filters.py b/tests/test_filters.py
index e8b9488..d03b3b4 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -54,6 +54,21 @@ class FilterTests(unittest.TestCase):
u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+ def test_excerpt_lorem_ipsum_summary(self):
+ testfile = 'tests/data/filter/excerpt-lorem-ipsum.xml'
+ config.load('tests/data/filter/excerpt-lorem-ipsum.ini')
+ config.parser.set('excerpt.py', 'target', 'atom:summary')
+
+ output = open(testfile).read()
+ for filter in config.filters():
+ output = shell.run(filter, output, mode="filter")
+
+ dom = xml.dom.minidom.parseString(output)
+ excerpt = dom.getElementsByTagName('summary')[0]
+ self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
+ u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
+ u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
def test_stripAd_yahoo(self):
testfile = 'tests/data/filter/stripAd-yahoo.xml'
config.load('tests/data/filter/stripAd-yahoo.ini')
From b25df8b9d4da2352dd7d04e7e8dcffc99a9fa49d Mon Sep 17 00:00:00 2001
From: Sam Ruby
title
mid
bottom
+
ignore_in_feed allows you to list any number of elements
or attributes which are to be ignored in feeds. This is often handy in the
-case of feeds where the id, updated or
-xml:lang values can't be trusted.
+case of feeds where the author, id,
+updated or xml:lang values can't be trusted.
title_type, summary_type,
content_type allow you to override the type
diff --git a/tests/reconstitute.py b/tests/reconstitute.py
index 26931dc..4af8e77 100644
--- a/tests/reconstitute.py
+++ b/tests/reconstitute.py
@@ -4,13 +4,13 @@ venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,venus_base)
if __name__ == "__main__":
+ import planet
+ planet.getLogger('WARN',None)
hide_planet_ns = True
while len(sys.argv) > 1:
if sys.argv[1] == '-v' or sys.argv[1] == '--verbose':
- import planet
- planet.getLogger('DEBUG',None)
del sys.argv[1]
elif sys.argv[1] == '-p' or sys.argv[1] == '--planet':
hide_planet_ns = False
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 8a16d65..a759b96 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -26,6 +26,9 @@ feed = '''
configData = '''
[testfeed]
+ignore_in_feed =
+future_dates =
+
name_type = html
title_type = html
summary_type = html
@@ -37,16 +40,21 @@ class ScrubTest(unittest.TestCase):
def test_scrub_ignore(self):
base = feedparser.parse(feed)
+ self.assertTrue(base.entries[0].has_key('author'))
+ self.assertTrue(base.entries[0].has_key('author_detail'))
self.assertTrue(base.entries[0].has_key('id'))
self.assertTrue(base.entries[0].has_key('updated'))
self.assertTrue(base.entries[0].has_key('updated_parsed'))
self.assertTrue(base.entries[0].summary_detail.has_key('language'))
config.parser.readfp(StringIO.StringIO(configData))
- config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
+ config.parser.set('testfeed', 'ignore_in_feed',
+ 'author id updated xml:lang')
data = deepcopy(base)
scrub('testfeed', data)
+ self.assertFalse(data.entries[0].has_key('author'))
+ self.assertFalse(data.entries[0].has_key('author_detail'))
self.assertFalse(data.entries[0].has_key('id'))
self.assertFalse(data.entries[0].has_key('updated'))
self.assertFalse(data.entries[0].has_key('updated_parsed'))
From fc90da7fc07cd966b94f3212cc92ab8fb75f3219 Mon Sep 17 00:00:00 2001
From: Sam Ruby element end tag creates an empty
element when there's no
# element in scope.
-# * A
element end tag creates an empty
element.
try:
frozenset
@@ -20,6 +20,7 @@ except NameError:
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
+import sys
import tokenizer
@@ -30,27 +31,32 @@ from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
-from constants import headingElements, tableInsertModeElements, voidElements
+from constants import headingElements, tableInsertModeElements
+from constants import cdataElements, rcdataElements, voidElements
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
- def __init__(self, strict = False, tree=simpletree.TreeBuilder):
+ def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
- returned. This class is almost always a subclass of
- html5lib.treebuilders._base.TreeBuilder
+ returned. Built in treebuilders can be accessed through
+ html5lib.treebuilders.getTreeBuilder(treeType)
"""
# Raise an exception on the first error encountered
self.strict = strict
self.tree = tree()
+ self.tokenizer_class = tokenizer
self.errors = []
+ # "quirks" / "almost-standards" / "standards"
+ self.quirksMode = "standards"
+
self.phases = {
"initial": InitialPhase(self, self.tree),
"rootElement": RootElementPhase(self, self.tree),
@@ -78,15 +84,15 @@ class HTMLParser(object):
self.firstStartTag = False
self.errors = []
- self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
- parseMeta=innerHTML)
+ self.tokenizer = self.tokenizer_class(stream, encoding,
+ parseMeta=not innerHTML)
if innerHTML:
self.innerHTML = container.lower()
- if self.innerHTML in ('title', 'textarea'):
+ if self.innerHTML in cdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
- elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
+ elif self.innerHTML in rcdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
@@ -113,10 +119,12 @@ class HTMLParser(object):
method = getattr(self.phase, "process%s" % type, None)
if type in ("Characters", "SpaceCharacters", "Comment"):
method(token["data"])
- elif type in ("StartTag", "Doctype"):
+ elif type == "StartTag":
method(token["name"], token["data"])
elif type == "EndTag":
method(token["name"])
+ elif type == "Doctype":
+ method(token["name"], token["publicId"], token["systemId"], token["correct"])
else:
self.parseError(token["data"])
@@ -158,10 +166,6 @@ class HTMLParser(object):
if self.strict:
raise ParseError
- def atheistParseError(self):
- """This error is not an error"""
- pass
-
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@@ -171,9 +175,7 @@ class HTMLParser(object):
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
- if token["name"] in voidElements:
- self.atheistParseError()
- else:
+ if token["name"] not in voidElements:
self.parseError(_("Solidus (/) incorrectly placed in tag."))
token["type"] = "StartTag"
@@ -283,7 +285,7 @@ class Phase(object):
# overridden.
self.tree.insertComment(data, self.tree.openElements[-1])
- def processDoctype(self, name, error):
+ def processDoctype(self, name, publicId, systemId, correct):
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data):
@@ -319,10 +321,101 @@ class InitialPhase(Phase):
def processComment(self, data):
self.tree.insertComment(data, self.tree.document)
- def processDoctype(self, name, error):
- if error:
+ def processDoctype(self, name, publicId, systemId, correct):
+ nameLower = name.translate(asciiUpper2Lower)
+ if nameLower != "html" or publicId != None or\
+ systemId != None:
self.parser.parseError(_("Erroneous DOCTYPE."))
+ # XXX need to update DOCTYPE tokens
self.tree.insertDoctype(name)
+
+ if publicId == None:
+ publicId = ""
+ if publicId != "":
+ publicId = publicId.translate(asciiUpper2Lower)
+
+ if nameLower != "html":
+ # XXX quirks mode
+ pass
+ else:
+ if publicId in\
+ ("+//silmaril//dtd html pro v0r11 19970101//en",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
+ "-//as//dtd html 3.0 aswedit + extensions//en",
+ "-//ietf//dtd html 2.0 level 1//en",
+ "-//ietf//dtd html 2.0 level 2//en",
+ "-//ietf//dtd html 2.0 strict level 1//en",
+ "-//ietf//dtd html 2.0 strict level 2//en",
+ "-//ietf//dtd html 2.0 strict//en",
+ "-//ietf//dtd html 2.0//en",
+ "-//ietf//dtd html 2.1e//en",
+ "-//ietf//dtd html 3.0//en",
+ "-//ietf//dtd html 3.0//en//",
+ "-//ietf//dtd html 3.2 final//en",
+ "-//ietf//dtd html 3.2//en",
+ "-//ietf//dtd html 3//en",
+ "-//ietf//dtd html level 0//en",
+ "-//ietf//dtd html level 0//en//2.0",
+ "-//ietf//dtd html level 1//en",
+ "-//ietf//dtd html level 1//en//2.0",
+ "-//ietf//dtd html level 2//en",
+ "-//ietf//dtd html level 2//en//2.0",
+ "-//ietf//dtd html level 3//en",
+ "-//ietf//dtd html level 3//en//3.0",
+ "-//ietf//dtd html strict level 0//en",
+ "-//ietf//dtd html strict level 0//en//2.0",
+ "-//ietf//dtd html strict level 1//en",
+ "-//ietf//dtd html strict level 1//en//2.0",
+ "-//ietf//dtd html strict level 2//en",
+ "-//ietf//dtd html strict level 2//en//2.0",
+ "-//ietf//dtd html strict level 3//en",
+ "-//ietf//dtd html strict level 3//en//3.0",
+ "-//ietf//dtd html strict//en",
+ "-//ietf//dtd html strict//en//2.0",
+ "-//ietf//dtd html strict//en//3.0",
+ "-//ietf//dtd html//en",
+ "-//ietf//dtd html//en//2.0",
+ "-//ietf//dtd html//en//3.0",
+ "-//metrius//dtd metrius presentational//en",
+ "-//microsoft//dtd internet explorer 2.0 html strict//en",
+ "-//microsoft//dtd internet explorer 2.0 html//en",
+ "-//microsoft//dtd internet explorer 2.0 tables//en",
+ "-//microsoft//dtd internet explorer 3.0 html strict//en",
+ "-//microsoft//dtd internet explorer 3.0 html//en",
+ "-//microsoft//dtd internet explorer 3.0 tables//en",
+ "-//netscape comm. corp.//dtd html//en",
+ "-//netscape comm. corp.//dtd strict html//en",
+ "-//o'reilly and associates//dtd html 2.0//en",
+ "-//o'reilly and associates//dtd html extended 1.0//en",
+ "-//spyglass//dtd html 2.0 extended//en",
+ "-//sq//dtd html 2.0 hotmetal + extensions//en",
+ "-//sun microsystems corp.//dtd hotjava html//en",
+ "-//sun microsystems corp.//dtd hotjava strict html//en",
+ "-//w3c//dtd html 3 1995-03-24//en",
+ "-//w3c//dtd html 3.2 draft//en",
+ "-//w3c//dtd html 3.2 final//en",
+ "-//w3c//dtd html 3.2//en",
+ "-//w3c//dtd html 3.2s draft//en",
+ "-//w3c//dtd html 4.0 frameset//en",
+ "-//w3c//dtd html 4.0 transitional//en",
+ "-//w3c//dtd html experimental 19960712//en",
+ "-//w3c//dtd html experimental 970421//en",
+ "-//w3c//dtd w3 html//en",
+ "-//w3o//dtd w3 html 3.0//en",
+ "-//w3o//dtd w3 html 3.0//en//",
+ "-//w3o//dtd w3 html strict 3.0//en//",
+ "-//webtechs//dtd mozilla html 2.0//en",
+ "-//webtechs//dtd mozilla html//en",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")\
+ or (publicId in\
+ ("-//w3c//dtd html 4.01 frameset//EN",
+ "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
+ or (systemId != None and\
+ systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ #XXX quirks mode
+ pass
+
self.parser.phase = self.parser.phases["rootElement"]
def processSpaceCharacters(self, data):
@@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
- ("html", self.endTagHtml)
+ (("html", "head", "body", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase):
self.startTagHead("head", {})
self.parser.phase.processStartTag(name, attributes)
- def endTagHtml(self, name):
+ def endTagImplyHead(self, name):
self.startTagHead("head", {})
self.parser.phase.processEndTag(name)
@@ -437,7 +530,7 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead),
- ("html", self.endTagHtml),
+ (("html", "body", "br"), self.endTagImplyAfterHead),
(("title", "style", "script"), self.endTagTitleStyleScript)
])
self.endTagHandler.default = self.endTagOther
@@ -499,7 +592,11 @@ class InHeadPhase(Phase):
def startTagBaseLinkMeta(self, name, attributes):
element = self.tree.createElement(name, attributes)
- self.appendToHead(element)
+ if (self.tree.headPointer is not None and
+ self.parser.phase == self.parser.phases["inHead"]):
+ self.appendToHead(element)
+ else:
+ self.tree.openElements[-1].appendChild(element)
def startTagOther(self, name, attributes):
self.anythingElse()
@@ -512,7 +609,7 @@ class InHeadPhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
self.parser.phase = self.parser.phases["afterHead"]
- def endTagHtml(self, name):
+ def endTagImplyAfterHead(self, name):
self.anythingElse()
self.parser.phase.processEndTag(name)
@@ -592,9 +689,9 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
- (("script", "style"), self.startTagScriptStyle),
- (("base", "link", "meta", "title"),
- self.startTagFromHead),
+ (("base", "link", "meta", "script", "style"),
+ self.startTagProcessInHead),
+ ("title", self.startTagTitle),
("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl",
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@@ -604,8 +701,9 @@ class InBodyPhase(Phase):
("plaintext",self.startTagPlaintext),
(headingElements, self.startTagHeading),
("a", self.startTagA),
- (("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
- "strong", "tt", "u"),self.startTagFormatting),
+ (("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
+ "tt", "u"),self.startTagFormatting),
+ ("nobr", self.startTagNobr),
("button", self.startTagButton),
(("marquee", "object"), self.startTagMarqueeObject),
("xmp", self.startTagXmp),
@@ -642,7 +740,8 @@ class InBodyPhase(Phase):
(("head", "frameset", "select", "optgroup", "option", "table",
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"td", "th"), self.endTagMisplaced),
- (("area", "basefont", "bgsound", "br", "embed", "hr", "image",
+ ("br", self.endTagBr),
+ (("area", "basefont", "bgsound", "embed", "hr", "image",
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
@@ -659,11 +758,13 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
- def processSpaceCharactersPre(self, data):
- #Sometimes (start of
blocks) we want to drop leading newlines + def processSpaceCharactersDropNewline(self, data): + # Sometimes (start ofand
element end tag creates an empty
element when there's no
-# element in scope. + try: frozenset @@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase): self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ - (("html", "head", "body", "br"), self.endTagImplyHead) + (("html", "head", "body", "br", "p"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther @@ -530,7 +524,7 @@ class InHeadPhase(Phase): self. endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), - (("html", "body", "br"), self.endTagImplyAfterHead), + (("html", "body", "br", "p"), self.endTagImplyAfterHead), (("title", "style", "script"), self.endTagTitleStyleScript) ]) self.endTagHandler.default = self.endTagOther @@ -994,9 +988,13 @@ class InBodyPhase(Phase): if self.tree.elementInScope("p"): self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": - self.parser.parseError("Unexpected end tag (p).") - while self.tree.elementInScope("p"): - self.tree.openElements.pop() + self.parser.parseError(_("Unexpected end tag (p).")) + if self.tree.elementInScope("p"): + while self.tree.elementInScope("p"): + self.tree.openElements.pop() + else: + self.startTagCloseP("p", {}) + self.endTagP("p") def endTagBody(self, name): # XXX Need to take open
tags into account here. We shouldn't imply
@@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
if inScope:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
- self.parser.parseError((u"End tag (" + name + ") seen too "
+ self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if inScope:
node = self.tree.openElements.pop()
@@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
node = self.tree.openElements.pop()
def endTagForm(self, name):
- self.endTagBlock(name)
+ if self.tree.elementInScope(name):
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != name:
+ self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
+ else:
+ self.tree.openElements.pop()
self.tree.formPointer = None
def endTagListItem(self, name):
@@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name:
- self.parser.parseError((u"End tag (" + name + ") seen too "
+ self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if self.tree.elementInScope(name):
@@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
self.tree.generateImpliedEndTags()
break
if self.tree.openElements[-1].name != name:
- self.parser.parseError((u"Unexpected end tag (" + name + "). "
+ self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
u"Expected other end tag."))
for item in headingElements:
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index e97214f..31b83a9 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -53,6 +53,7 @@ class HTMLInputStream(object):
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
self.queue = []
+ self.errors = []
self.line = self.col = 0
self.lineLengths = []
@@ -214,7 +215,10 @@ class HTMLInputStream(object):
return EOF
# Normalize newlines and null characters
- if c == '\x00': c = u'\uFFFD'
+ if c == '\x00':
+ self.errors.append('null character found in input stream, '
+ 'replaced with U+FFFD')
+ c = u'\uFFFD'
if c == '\r':
c = self.dataStream.read(1, 1)
if c != '\n':
diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py
index 947f3d9..fdea914 100644
--- a/planet/vendor/html5lib/liberalxmlparser.py
+++ b/planet/vendor/html5lib/liberalxmlparser.py
@@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
# For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag":
+ save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"])
+ self.tokenizer.contentModelFlag = save
token["data"] = {}
token["type"] = "EndTag"
@@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
+ self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
@@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
+class XmlInitialPhase(html5parser.InitialPhase):
+ """ Consume XML Prologs """
+ def processComment(self, data):
+ if not data.startswith('?xml') or not data.endswith('?'):
+ html5parser.InitialPhase.processComment(self, data)
+
class XmlRootPhase(html5parser.Phase):
+ """ Consume XML Prologs """
+ def processComment(self, data):
+ print repr(data)
+ if not data.startswith('?xml') or not data.endswith('?'):
+ html5parser.InitialPhase.processComment(self, data)
+
""" Prime the Xml parser """
def __getattr__(self, name):
self.tree.openElements.append(self.tree.document)
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index 4668d28..af27ead 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -2,7 +2,7 @@ import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
-class HTMLSanitizer(HTMLTokenizer):
+class HTMLSanitizerMixin:
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
# => <script> do_nasty_stuff() </script>
# sanitize_html('Click here for $100')
# => Click here for $100
- def __iter__(self):
- for token in HTMLTokenizer.__iter__(self):
- if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
- if token["name"] in self.allowed_elements:
- if token.has_key("data"):
- attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
- for attr in self.attr_val_is_uri:
- if not attrs.has_key(attr): continue
- val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
- if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
- del attrs[attr]
- if attrs.has_key('style'):
- attrs['style'] = self.sanitize_css(attrs['style'])
- token["data"] = [[name,val] for name,val in attrs.items()]
- yield token
- else:
- if token["type"] == "EndTag":
- token["data"] = "%s>" % token["name"]
- elif token["data"]:
- attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
- token["data"] = "<%s%s>" % (token["name"],attrs)
- else:
- token["data"] = "<%s>" % token["name"]
- if token["type"] == "EmptyTag":
- token["data"]=token["data"][:-1] + "/>"
- token["type"] = "Characters"
- del token["name"]
- yield token
- elif token["type"] == "Comment":
- pass
+ def sanitize_token(self, token):
+ if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+ if token["name"] in self.allowed_elements:
+ if token.has_key("data"):
+ attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+ for attr in self.attr_val_is_uri:
+ if not attrs.has_key(attr): continue
+ val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
+ if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+ del attrs[attr]
+ if attrs.has_key('style'):
+ attrs['style'] = self.sanitize_css(attrs['style'])
+ token["data"] = [[name,val] for name,val in attrs.items()]
+ return token
else:
- yield token
+ if token["type"] == "EndTag":
+ token["data"] = "%s>" % token["name"]
+ elif token["data"]:
+ attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+ token["data"] = "<%s%s>" % (token["name"],attrs)
+ else:
+ token["data"] = "<%s>" % token["name"]
+ if token["type"] == "EmptyTag":
+ token["data"]=token["data"][:-1] + "/>"
+ token["type"] = "Characters"
+ del token["name"]
+ return token
+ elif token["type"] == "Comment":
+ pass
+ else:
+ return token
def sanitize_css(self, style):
# disallow urls
@@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
+
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+ def __iter__(self):
+ for token in HTMLTokenizer.__iter__(self):
+ token = self.sanitize_token(token)
+ if token: yield token
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index 0191774..308788a 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -7,10 +7,6 @@ except NameError:
import gettext
_ = gettext.gettext
-from html5lib.filters.whitespace import Filter as WhitespaceFilter
-from html5lib.filters.optionaltags import Filter as OptionalTagFilter
-from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
-
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements
@@ -67,17 +63,16 @@ class HTMLSerializer(object):
escape_lt_in_attrs = False
escape_rcdata = False
- omit_optional_tags = True
-
- strip_whitespace = False
-
inject_meta_charset = True
+ strip_whitespace = False
+ sanitize = False
+ omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
- "escape_rcdata")
+ "escape_rcdata", 'use_trailing_solidus', "sanitize")
def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ class HTMLSerializer(object):
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
- treewalker = InjectMetaCharsetFilter(treewalker, encoding)
+ from html5lib.filters.inject_meta_charset import Filter
+ treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter
if self.strip_whitespace:
- treewalker = WhitespaceFilter(treewalker)
+ from html5lib.filters.whitespace import Filter
+ treewalker = Filter(treewalker)
+ if self.sanitize:
+ from html5lib.filters.sanitizer import Filter
+ treewalker = Filter(treewalker)
if self.omit_optional_tags:
- treewalker = OptionalTagFilter(treewalker)
+ from html5lib.filters.optionaltags import Filter
+ treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index 0bb4b54..151a489 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -93,6 +93,8 @@ class HTMLTokenizer(object):
# Start processing. When EOF is reached self.state will return False
# instead of True and the loop will terminate.
while self.state():
+ while self.stream.errors:
+ yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
while self.tokenQueue:
yield self.tokenQueue.pop(0)
@@ -130,7 +132,6 @@ class HTMLTokenizer(object):
allowed = hexDigits
radix = 16
- char = u"\uFFFD"
charStack = []
# Consume all the characters that are in range while making sure we
@@ -155,8 +156,8 @@ class HTMLTokenizer(object):
charAsInt = entitiesWindows1252[charAsInt - 128]
- # 0 is not a good number, neither are illegal Unicode code points.
- if charAsInt > 0 and charAsInt <= 1114111:
+ # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
+ if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
@@ -167,7 +168,11 @@ class HTMLTokenizer(object):
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity couldn't be converted to character.")})
+ _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+ else:
+ char = u"\uFFFD"
+ self.tokenQueue.append({"type": "ParseError", "data":
+ _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index 0700543..f9b580d 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
- def hilite(self, encoding):
- print 'foo'
- method = new.instancemethod(hilite, self.dom, self.dom.__class__)
- setattr(self.dom, 'hilite', method)
return self
def insertDoctype(self, name):
diff --git a/tests/data/apply/config-html.ini b/tests/data/apply/config-html.ini
index 635b552..7356ed9 100644
--- a/tests/data/apply/config-html.ini
+++ b/tests/data/apply/config-html.ini
@@ -1,5 +1,5 @@
[Planet]
-output_theme = genshi_fancy
+output_theme = asf
output_dir = tests/work/apply
name = test planet
cache_directory = tests/work/spider/cache
@@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
bill_of_materials:
images/#{face}
-[index.html.genshi]
+[index.html.xslt]
filters:
- xhtml2html.py>index.html4
+ xhtml2html.plugin?quote_attr_values=True"e_char="'">index.html4
[tests/data/spider/testfeed0.atom]
name = not found
diff --git a/tests/test_apply.py b/tests/test_apply.py
index fdfbadf..5a726a7 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
def tearDown(self):
shutil.rmtree(os.path.split(workdir)[0])
- def test_apply_asf(self):
- config.load(configfile % 'asf')
+ def apply_asf(self):
splice.apply(self.feeddata)
# verify that selected files are there
@@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
self.assertEqual(12, content)
self.assertEqual(3, lang)
+ def test_apply_asf(self):
+ config.load(configfile % 'asf')
+ self.apply_asf()
+
def test_apply_classic_fancy(self):
config.load(configfile % 'fancy')
self.apply_fancy()
@@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
def test_apply_filter_html(self):
config.load(configfile % 'html')
- self.apply_fancy()
+ self.apply_asf()
output = open(os.path.join(workdir, 'index.html')).read()
self.assertTrue(output.find('/>')>=0)
@@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
if method.startswith('test_'): break
else:
delattr(ApplyTest,'test_apply_genshi_fancy')
- delattr(ApplyTest,'test_apply_filter_html')
try:
import libxml2
diff --git a/tests/test_filter_genshi.py b/tests/test_filter_genshi.py
index 769778e..c7a8baf 100644
--- a/tests/test_filter_genshi.py
+++ b/tests/test_filter_genshi.py
@@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
self.assertTrue(output.find('')>=0)
- def test_xhtml2html_filter(self):
- testfile = 'tests/data/filter/index.html'
- filter = 'xhtml2html.py'
- output = shell.run(filter, open(testfile).read(), mode="filter")
- self.assertTrue(output.find('/>')<0)
- self.assertTrue(output.find('')>=0)
-
try:
import genshi
except:
diff --git a/tests/test_filters.py b/tests/test_filters.py
index d03b3b4..b756c86 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
self.assertEqual('', output)
+ def test_xhtml2html_filter(self):
+ testfile = 'tests/data/filter/index.html'
+ filter = 'xhtml2html.plugin?quote_attr_values=True'
+ output = shell.run(filter, open(testfile).read(), mode="filter")
+ self.assertTrue(output.find('/>')<0)
+ self.assertTrue(output.find('')>=0)
+
try:
from subprocess import Popen, PIPE
From 1e2b6a18fefd5328be9ba35a97aa51a25c87f0e6 Mon Sep 17 00:00:00 2001
From: Sam Ruby name
is defined.
The content_type parameter can be defined to indicate that
this subscription is a reading list, i.e., is an external list
-of subscriptions. At the moment, two formats of reading lists are supported:
-opml and foaf. In the future, support for formats
-like xoxo could be added.
+of subscriptions. At the moment, three formats of reading lists are supported:
+opml, foaf, and csv. In the future,
+support for formats like xoxo could be added.
Normalization overrides can also be defined here.
diff --git a/planet/config.py b/planet/config.py index 5ffb1cb..9209bc9 100644 --- a/planet/config.py +++ b/planet/config.py @@ -138,7 +138,7 @@ def load(config_file): parser.read(config_file) import config, planet - from planet import opml, foaf + from planet import opml, foaf, csv_config log = planet.logger if not log: log = planet.getLogger(config.log_level(),config.log_format()) @@ -197,6 +197,8 @@ def load(config_file): opml.opml2config(data, cached_config) elif content_type(list).find('foaf')>=0: foaf.foaf2config(data, cached_config) + elif content_type(list).find('csv')>=0: + csv_config.csv2config(data, cached_config) else: from planet import shell import StringIO @@ -346,7 +348,8 @@ def reading_lists(): for section in parser.sections(): if parser.has_option(section, 'content_type'): type = parser.get(section, 'content_type') - if type.find('opml')>=0 or type.find('foaf')>=0 or type.find('.')>=0: + if type.find('opml')>=0 or type.find('foaf')>=0 or \ + type.find('csv')>=0 or type.find('.')>=0: result.append(section) return result diff --git a/planet/csv_config.py b/planet/csv_config.py new file mode 100755 index 0000000..717b1eb --- /dev/null +++ b/planet/csv_config.py @@ -0,0 +1,28 @@ +import csv + +# input = csv, output = ConfigParser +def csv2config(input, config=None): + + if not hasattr(input, 'read'): + input = csv.StringIO(input) + + if not config: + config = ConfigParser() + + reader = csv.DictReader(input) + for entry in reader: + section = entry[reader.fieldnames[0]] + config.add_section(section) + for name, value in entry.items(): + if value and name != reader.fieldnames[0]: + config.set(section, name, value) + + return config + +if __name__ == "__main__": + # small main program which converts OPML into config.ini format + import sys, urllib + config = ConfigParser() + for input in sys.argv[1:]: + csv2config(urllib.urlopen(input), config) + config.write(sys.stdout) diff --git a/tests/data/config/basic.csv b/tests/data/config/basic.csv new file 
mode 100644 index 0000000..b7e4178 --- /dev/null +++ b/tests/data/config/basic.csv @@ -0,0 +1,3 @@ +url,name,filters +feed1,one +feed2,two,bar diff --git a/tests/data/config/rlist-csv.ini b/tests/data/config/rlist-csv.ini new file mode 100644 index 0000000..2ecf300 --- /dev/null +++ b/tests/data/config/rlist-csv.ini @@ -0,0 +1,7 @@ +[Planet] +name = CSV Test Configuration +cache_directory = tests/work/config/cache +filters = foo + +[tests/data/config/basic.csv] +content_type = csv diff --git a/tests/test_config_csv.py b/tests/test_config_csv.py new file mode 100644 index 0000000..945e27d --- /dev/null +++ b/tests/test_config_csv.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +import unittest +from planet import config + +class ConfigCsvTest(unittest.TestCase): + def setUp(self): + config.load('tests/data/config/rlist-csv.ini') + + # administrivia + + def test_feeds(self): + feeds = config.subscriptions() + feeds.sort() + self.assertEqual(['feed1', 'feed2'], feeds) + + def test_filters(self): + self.assertEqual(['foo','bar'], config.filters('feed2')) + self.assertEqual(['foo'], config.filters('feed1')) From 291faf2d8ff3fb2ee1b8cfa1671ddb8e16ac81fa Mon Sep 17 00:00:00 2001 From: Sam Rubyand