xml_base overrides
This commit is contained in:
parent
631dd44ff0
commit
77d15d22cf
@ -95,6 +95,13 @@ attributes on these elements.</li>
|
||||
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
|
||||
<ul style="margin:0">
|
||||
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
|
||||
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
|
||||
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -30,5 +30,7 @@ def getLogger(level, format):
|
||||
|
||||
return logger
|
||||
|
||||
|
||||
|
||||
# Configure feed parser
|
||||
from planet import feedparser
|
||||
feedparser.SANITIZE_HTML=0
|
||||
feedparser.RESOLVE_RELATIVE_URIS=0
|
||||
|
@ -125,6 +125,7 @@ def __init__():
|
||||
define_tmpl('summary_type', '')
|
||||
define_tmpl('content_type', '')
|
||||
define_tmpl('future_dates', 'keep')
|
||||
define_tmpl('xml_base', '')
|
||||
|
||||
def load(config_file):
|
||||
""" initialize and load a configuration"""
|
||||
|
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
|
||||
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
|
||||
"""
|
||||
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
|
||||
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
|
||||
# if TIDY_MARKUP = 1
|
||||
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
|
||||
|
||||
# If you want feedparser to automatically resolve all relative URIs, set this
|
||||
# to 1.
|
||||
RESOLVE_RELATIVE_URIS = 1
|
||||
|
||||
# If you want feedparser to automatically sanitize all potentially unsafe
|
||||
# HTML content, set this to 1.
|
||||
SANITIZE_HTML = 1
|
||||
|
||||
# ---------- required modules (should come with any Python distribution) ----------
|
||||
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
|
||||
try:
|
||||
@ -732,7 +740,7 @@ class _FeedParserMixin:
|
||||
|
||||
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
||||
# resolve relative URIs within embedded markup
|
||||
if is_htmlish:
|
||||
if is_htmlish and RESOLVE_RELATIVE_URIS:
|
||||
if element in self.can_contain_relative_uris:
|
||||
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
|
||||
@ -753,7 +761,7 @@ class _FeedParserMixin:
|
||||
self._getContext()['vcard'] = vcard
|
||||
|
||||
# sanitize embedded markup
|
||||
if is_htmlish:
|
||||
if is_htmlish and SANITIZE_HTML:
|
||||
if element in self.can_contain_dangerous_markup:
|
||||
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
|
||||
|
@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
|
||||
import time
|
||||
# Planet modules
|
||||
import planet, config, shell
|
||||
from planet import feedparser
|
||||
|
||||
type_map = {'text': 'text/plain', 'html': 'text/html',
|
||||
'xhtml': 'application/xhtml+xml'}
|
||||
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
|
||||
or entry['published_parsed'] <= now) and
|
||||
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
|
||||
or entry['updated_parsed'] <= now)]
|
||||
|
||||
scrub_xmlbase = config.xml_base(feed_uri)
|
||||
|
||||
# resolve relative URIs and sanitize
|
||||
for entry in data.entries + [data.feed]:
|
||||
for key in entry.keys():
|
||||
if key == 'content':
|
||||
node = entry.content[0]
|
||||
elif key.endswith('_detail'):
|
||||
node = entry[key]
|
||||
else:
|
||||
continue
|
||||
|
||||
if not node.has_key('type'): continue
|
||||
if not 'html' in node['type']: continue
|
||||
if not node.has_key('value'): continue
|
||||
|
||||
if node.has_key('base'):
|
||||
if scrub_xmlbase:
|
||||
if scrub_xmlbase == 'feed_alternate':
|
||||
if entry.has_key('source') and \
|
||||
entry.source.has_key('link'):
|
||||
node['base'] = entry.source.link
|
||||
elif data.feed.has_key('link'):
|
||||
node['base'] = data.feed.link
|
||||
elif scrub_xmlbase == 'entry_alternate':
|
||||
if entry.has_key('link'):
|
||||
node['base'] = entry.link
|
||||
else:
|
||||
node['base'] = feedparser._urljoin(
|
||||
node['base'], scrub_xmlbase)
|
||||
|
||||
node['value'] = feedparser._resolveRelativeURIs(
|
||||
node.value, node.base, 'utf-8', node.type)
|
||||
|
||||
node['value'] = feedparser._sanitizeHTML(
|
||||
node.value, 'utf-8', node.type)
|
||||
|
@ -3,6 +3,7 @@
|
||||
import unittest, os, sys, glob, new, re, StringIO, time
|
||||
from planet import feedparser
|
||||
from planet.reconstitute import reconstitute
|
||||
from planet.scrub import scrub
|
||||
|
||||
testfiles = 'tests/data/reconstitute/%s.xml'
|
||||
|
||||
@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
|
||||
# parse and reconstitute to a string
|
||||
work = StringIO.StringIO()
|
||||
results = feedparser.parse(data)
|
||||
scrub(testfiles%name, results)
|
||||
reconstitute(results, results.entries[0]).writexml(work)
|
||||
|
||||
# verify the results
|
||||
|
@ -6,7 +6,7 @@ from planet.scrub import scrub
|
||||
from planet import feedparser, config
|
||||
|
||||
feed = '''
|
||||
<feed xmlns='http://www.w3.org/2005/Atom'>
|
||||
<feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
|
||||
<author><name>F&ouml;o</name></author>
|
||||
<entry xml:lang="en">
|
||||
<id>ignoreme</id>
|
||||
@ -15,7 +15,9 @@ feed = '''
|
||||
<title>F&ouml;o</title>
|
||||
<summary>F&ouml;o</summary>
|
||||
<content>F&ouml;o</content>
|
||||
<link href="http://example.com/entry/1/"/>
|
||||
<source>
|
||||
<link href="http://example.com/feed/"/>
|
||||
<author><name>F&ouml;o</name></author>
|
||||
</source>
|
||||
</entry>
|
||||
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual(0, len(data.entries))
|
||||
|
||||
def test_scrub_xmlbase(self):
|
||||
base = feedparser.parse(feed)
|
||||
self.assertEqual('http://example.com/',
|
||||
base.entries[0].title_detail.base)
|
||||
|
||||
config.parser.readfp(StringIO.StringIO(configData))
|
||||
config.parser.set('testfeed', 'xml_base', 'feed_alternate')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual('http://example.com/feed/',
|
||||
data.entries[0].title_detail.base)
|
||||
|
||||
config.parser.set('testfeed', 'xml_base', 'entry_alternate')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual('http://example.com/entry/1/',
|
||||
data.entries[0].title_detail.base)
|
||||
|
||||
config.parser.set('testfeed', 'xml_base', 'base/')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual('http://example.com/base/',
|
||||
data.entries[0].title_detail.base)
|
||||
|
||||
config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual('http://example.org/data/',
|
||||
data.entries[0].title_detail.base)
|
||||
|
@ -35,7 +35,7 @@
|
||||
<th>Name</th>
|
||||
<th>Format</th>
|
||||
<xsl:if test="//planet:ignore_in_feed | //planet:filters |
|
||||
//planet:*[contains(local-name(),'_type')]">
|
||||
//planet:xml_base | //planet:*[contains(local-name(),'_type')]">
|
||||
<th>Notes</th>
|
||||
</xsl:if>
|
||||
</tr>
|
||||
@ -128,12 +128,12 @@
|
||||
</a>
|
||||
</td>
|
||||
<td><xsl:value-of select="planet:format"/></td>
|
||||
<xsl:if test="planet:ignore_in_feed | planet:filters |
|
||||
<xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
|
||||
planet:*[contains(local-name(),'_type')]">
|
||||
<td>
|
||||
<dl>
|
||||
<xsl:for-each select="planet:ignore_in_feed | planet:filters |
|
||||
planet:*[contains(local-name(),'_type')]">
|
||||
planet:xml_base | planet:*[contains(local-name(),'_type')]">
|
||||
<xsl:sort select="local-name()"/>
|
||||
<dt><xsl:value-of select="local-name()"/></dt>
|
||||
<dd><xsl:value-of select="."/></dd>
|
||||
|
Loading…
Reference in New Issue
Block a user