xml_base overrides

This commit is contained in:
Sam Ruby 2007-01-22 13:46:45 -05:00
parent 631dd44ff0
commit 77d15d22cf
8 changed files with 99 additions and 9 deletions

View File

@ -95,6 +95,13 @@ attributes on these elements.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
<ul style="margin:0">
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> value in each text construct will be adjusted separately using the specified value.</li>
</ul>
</li>
</ul>
</body>
</html>

View File

@ -30,5 +30,7 @@ def getLogger(level, format):
return logger
# Configure feed parser
from planet import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0

View File

@ -125,6 +125,7 @@ def __init__():
define_tmpl('summary_type', '')
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
def load(config_file):
""" initialize and load a configuration"""

View File

@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
@ -732,7 +740,7 @@ class _FeedParserMixin:
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
if is_htmlish:
if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
@ -753,7 +761,7 @@ class _FeedParserMixin:
self._getContext()['vcard'] = vcard
# sanitize embedded markup
if is_htmlish:
if is_htmlish and SANITIZE_HTML:
if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

View File

@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
import time
# Planet modules
import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'}
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)]
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
if key == 'content':
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)

View File

@ -3,6 +3,7 @@
import unittest, os, sys, glob, new, re, StringIO, time
from planet import feedparser
from planet.reconstitute import reconstitute
from planet.scrub import scrub
testfiles = 'tests/data/reconstitute/%s.xml'
@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
# parse and reconstitute to a string
work = StringIO.StringIO()
results = feedparser.parse(data)
scrub(testfiles%name, results)
reconstitute(results, results.entries[0]).writexml(work)
# verify the results

View File

@ -6,7 +6,7 @@ from planet.scrub import scrub
from planet import feedparser, config
feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'>
<feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
<author><name>F&amp;ouml;o</name></author>
<entry xml:lang="en">
<id>ignoreme</id>
@ -15,7 +15,9 @@ feed = '''
<title>F&amp;ouml;o</title>
<summary>F&amp;ouml;o</summary>
<content>F&amp;ouml;o</content>
<link href="http://example.com/entry/1/"/>
<source>
<link href="http://example.com/feed/"/>
<author><name>F&amp;ouml;o</name></author>
</source>
</entry>
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual(0, len(data.entries))
def test_scrub_xmlbase(self):
    """Verify the base left on the entry title for each xml_base setting."""
    parsed = feedparser.parse(feed)

    # Before any scrubbing, the title carries the xml:base declared on
    # the fixture's feed element.
    self.assertEqual('http://example.com/',
        parsed.entries[0].title_detail.base)

    config.parser.readfp(StringIO.StringIO(configData))

    # (xml_base setting, base expected on the entry title after scrub)
    expectations = (
        ('feed_alternate', 'http://example.com/feed/'),
        ('entry_alternate', 'http://example.com/entry/1/'),
        ('base/', 'http://example.com/base/'),
        ('http://example.org/data/', 'http://example.org/data/'),
    )

    for setting, expected_base in expectations:
        config.parser.set('testfeed', 'xml_base', setting)
        # Scrub a fresh copy each time so one setting's result cannot
        # leak into the next check.
        scrubbed = deepcopy(parsed)
        scrub('testfeed', scrubbed)
        self.assertEqual(expected_base,
            scrubbed.entries[0].title_detail.base)

View File

@ -35,7 +35,7 @@
<th>Name</th>
<th>Format</th>
<xsl:if test="//planet:ignore_in_feed | //planet:filters |
//planet:*[contains(local-name(),'_type')]">
//planet:xml_base | //planet:*[contains(local-name(),'_type')]">
<th>Notes</th>
</xsl:if>
</tr>
@ -128,12 +128,12 @@
</a>
</td>
<td><xsl:value-of select="planet:format"/></td>
<xsl:if test="planet:ignore_in_feed | planet:filters |
<xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
planet:*[contains(local-name(),'_type')]">
<td>
<dl>
<xsl:for-each select="planet:ignore_in_feed | planet:filters |
planet:*[contains(local-name(),'_type')]">
planet:xml_base | planet:*[contains(local-name(),'_type')]">
<xsl:sort select="local-name()"/>
<dt><xsl:value-of select="local-name()"/></dt>
<dd><xsl:value-of select="."/></dd>