diff --git a/docs/normalization.html b/docs/normalization.html
index 39dd279..33883a9 100644
--- a/docs/normalization.html
+++ b/docs/normalization.html
@@ -95,6 +95,13 @@ attributes on these elements.
   <li>Anything else (i.e., the default) will leave the date as is, causing
   entries that contain these dates to sort to the top of the planet until
   the time passes.</li>
   </ul></li>
+
+  <li><code>xml_base</code> will adjust the <code>xml:base</code> values in
+  effect for each of the text constructs in the feed (things like
+  <code>title</code>, <code>summary</code>, and <code>content</code>).
+  Other elements in the feed (most notably, <code>link</code>) are not
+  affected by this value.</li>
+
 </ul>
 
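Note (illustration only, not part of the patch): a feed section in a planet
config simply gains an xml_base option, exposed per feed by the accessor added
in planet/config.py below. The feed URI here is made up, and a configuration is
assumed to have been loaded the usual way:

    from planet import config

    # register a hypothetical feed section and give it the new option
    config.parser.add_section('http://example.com/feed.xml')
    config.parser.set('http://example.com/feed.xml', 'xml_base', 'feed_alternate')

    # per-feed accessor created by define_tmpl('xml_base', '')
    print config.xml_base('http://example.com/feed.xml')   # feed_alternate

Besides feed_alternate and entry_alternate, any other non-empty value is
treated as a URI reference and joined against the construct's current
xml:base (see planet/scrub.py below).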
diff --git a/planet/__init__.py b/planet/__init__.py
index 3375432..61ac2a2 100644
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -30,5 +30,7 @@ def getLogger(level, format):
 
     return logger
 
-
-
+# Configure feed parser
+from planet import feedparser
+feedparser.SANITIZE_HTML=0
+feedparser.RESOLVE_RELATIVE_URIS=0
diff --git a/planet/config.py b/planet/config.py
index 1960295..0d3b605 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -125,6 +125,7 @@ def __init__():
     define_tmpl('summary_type', '')
     define_tmpl('content_type', '')
     define_tmpl('future_dates', 'keep')
+    define_tmpl('xml_base', '')
 
 def load(config_file):
     """ initialize and load a configuration"""
diff --git a/planet/feedparser.py b/planet/feedparser.py
index b3b2467..e562d1f 100755
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec
 """
 
-__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -65,6 +65,14 @@ TIDY_MARKUP = 0
 # if TIDY_MARKUP = 1
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
 
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
 # ---------- required modules (should come with any Python distribution) ----------
 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
 try:
@@ -732,7 +740,7 @@ class _FeedParserMixin:
         is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
 
         # resolve relative URIs within embedded markup
-        if is_htmlish:
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
             if element in self.can_contain_relative_uris:
                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
@@ -753,7 +761,7 @@ class _FeedParserMixin:
             self._getContext()['vcard'] = vcard
 
         # sanitize embedded markup
-        if is_htmlish:
+        if is_htmlish and SANITIZE_HTML:
             if element in self.can_contain_dangerous_markup:
                 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
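Note: the two new feedparser switches exist so the planet package can take over
this work itself — planet/__init__.py turns them off at import time, and the
scrub stage below re-applies resolution and sanitization under per-feed
control. A minimal sketch of the effect (the feed URL is hypothetical; this
snippet is illustrative, not part of the patch):

    from planet import feedparser

    # with both switches off, parse() leaves embedded markup untouched:
    # relative URIs stay relative and potentially unsafe HTML is kept
    feedparser.RESOLVE_RELATIVE_URIS = 0
    feedparser.SANITIZE_HTML = 0

    data = feedparser.parse('http://example.com/feed.xml')
    # entries now carry the markup exactly as it appeared in the feed;
    # planet/scrub.py (below) resolves and sanitizes it afterwards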
diff --git a/planet/scrub.py b/planet/scrub.py
index 42d75ae..ced4a5c 100644
--- a/planet/scrub.py
+++ b/planet/scrub.py
@@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
 import time
 # Planet modules
 import planet, config, shell
+from planet import feedparser
 
 type_map = {'text': 'text/plain', 'html': 'text/html',
     'xhtml': 'application/xhtml+xml'}
@@ -92,3 +93,40 @@ def scrub(feed_uri, data):
             or entry['published_parsed'] <= now)
             and (not entry.has_key('updated_parsed') or not
             entry['updated_parsed'] or entry['updated_parsed'] <= now)]
+
+    scrub_xmlbase = config.xml_base(feed_uri)
+
+    # resolve relative URIs and sanitize
+    for entry in data.entries + [data.feed]:
+        for key in entry.keys():
+            if key == 'content':
+                node = entry.content[0]
+            elif key.endswith('_detail'):
+                node = entry[key]
+            else:
+                continue
+
+            if not node.has_key('type'): continue
+            if not 'html' in node['type']: continue
+            if not node.has_key('value'): continue
+
+            if node.has_key('base'):
+                if scrub_xmlbase:
+                    if scrub_xmlbase == 'feed_alternate':
+                        if entry.has_key('source') and \
+                            entry.source.has_key('link'):
+                            node['base'] = entry.source.link
+                        elif data.feed.has_key('link'):
+                            node['base'] = data.feed.link
+                    elif scrub_xmlbase == 'entry_alternate':
+                        if entry.has_key('link'):
+                            node['base'] = entry.link
+                    else:
+                        node['base'] = feedparser._urljoin(
+                            node['base'], scrub_xmlbase)
+
+                node['value'] = feedparser._resolveRelativeURIs(
+                    node.value, node.base, 'utf-8', node.type)
+
+            node['value'] = feedparser._sanitizeHTML(
+                node.value, 'utf-8', node.type)
diff --git a/tests/test_reconstitute.py b/tests/test_reconstitute.py
index ecae6e7..754ea7a 100644
--- a/tests/test_reconstitute.py
+++ b/tests/test_reconstitute.py
@@ -3,6 +3,7 @@ import unittest, os, sys, glob, new, re, StringIO, time
 
 from planet import feedparser
 from planet.reconstitute import reconstitute
+from planet.scrub import scrub
 
 testfiles = 'tests/data/reconstitute/%s.xml'
 
@@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
         # parse and reconstitute to a string
         work = StringIO.StringIO()
         results = feedparser.parse(data)
+        scrub(testfiles%name, results)
         reconstitute(results, results.entries[0]).writexml(work)
 
         # verify the results
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 17874a3..8a16d65 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -6,7 +6,7 @@ from planet.scrub import scrub
 from planet import feedparser, config
 
 feed = '''
-<feed xmlns='http://www.w3.org/2005/Atom'>
+<feed xmlns='http://www.w3.org/2005/Atom' xml:base='http://example.com/'>
   <author><name>F&ouml;o</name></author>
   <entry xml:lang='en'>
     <id>ignoreme</id>
@@ -15,7 +15,9 @@ feed = '''
     <author><name>F&ouml;o</name></author>
     <title>F&ouml;o</title>
     <summary>F&ouml;o</summary>
+    <link href='entry/1/'/>
+    <source><link href='feed/'/></source>
     <content>F&ouml;o</content>
   </entry>
 </feed>
@@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
         data = deepcopy(base)
         scrub('testfeed', data)
         self.assertEqual(0, len(data.entries))
+
+    def test_scrub_xmlbase(self):
+        base = feedparser.parse(feed)
+        self.assertEqual('http://example.com/',
+            base.entries[0].title_detail.base)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'xml_base', 'feed_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/feed/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'entry_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/entry/1/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'base/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/base/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.org/data/',
+            data.entries[0].title_detail.base)
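Note: a short sketch of the URI arithmetic in the catch-all branch of the new
scrub code, with values borrowed from test_scrub_xmlbase above (_urljoin is
the private feedparser helper the patch calls; illustrative, not part of the
patch):

    from planet import feedparser

    base = 'http://example.com/'   # xml:base in effect on a text construct

    # a relative xml_base setting is joined against the current base ...
    assert feedparser._urljoin(base, 'base/') == 'http://example.com/base/'

    # ... while an absolute one replaces it outright
    assert feedparser._urljoin(base, 'http://example.org/data/') == \
        'http://example.org/data/'

The feed_alternate and entry_alternate settings take the other branches: they
copy the alternate link of the feed (or of the entry's source) and of the
entry itself, respectively.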
diff --git a/themes/common/validate.html.xslt b/themes/common/validate.html.xslt
index 2d1a2e4..0cabdcc 100644
--- a/themes/common/validate.html.xslt
+++ b/themes/common/validate.html.xslt
@@ -35,7 +35,7 @@
           <th>Name</th>
           <th>Format</th>
-            //planet:*[contains(local-name(),'_type')]">
+            //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
           <th>Notes</th>
@@ -128,12 +128,12 @@
-          planet:*[contains(local-name(),'_type')]">
+          planet:xml_base | planet:*[contains(local-name(),'_type')]">