xml_base overrides

This commit is contained in:
Sam Ruby 2007-01-22 13:46:45 -05:00
parent 631dd44ff0
commit 77d15d22cf
8 changed files with 99 additions and 9 deletions

View File

@ -95,6 +95,13 @@ attributes on these elements.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li> <li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul> </ul>
</li> </li>
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
<ul style="margin:0">
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
</ul>
</li>
</ul> </ul>
</body> </body>
</html> </html>

View File

@ -30,5 +30,7 @@ def getLogger(level, format):
return logger return logger
# Configure feed parser
from planet import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0

View File

@ -125,6 +125,7 @@ def __init__():
define_tmpl('summary_type', '') define_tmpl('summary_type', '')
define_tmpl('content_type', '') define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep') define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
def load(config_file): def load(config_file):
""" initialize and load a configuration""" """ initialize and load a configuration"""

View File

@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
""" """
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs" __version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification, Redistribution and use in source and binary forms, with or without modification,
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
# if TIDY_MARKUP = 1 # if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- required modules (should come with any Python distribution) ---------- # ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try: try:
@ -732,7 +740,7 @@ class _FeedParserMixin:
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup # resolve relative URIs within embedded markup
if is_htmlish: if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris: if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
@ -753,7 +761,7 @@ class _FeedParserMixin:
self._getContext()['vcard'] = vcard self._getContext()['vcard'] = vcard
# sanitize embedded markup # sanitize embedded markup
if is_htmlish: if is_htmlish and SANITIZE_HTML:
if element in self.can_contain_dangerous_markup: if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

View File

@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
import time import time
# Planet modules # Planet modules
import planet, config, shell import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html', type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'} 'xhtml': 'application/xhtml+xml'}
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
or entry['published_parsed'] <= now) and or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed'] (not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)] or entry['updated_parsed'] <= now)]
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
if key == 'content':
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)

View File

@ -3,6 +3,7 @@
import unittest, os, sys, glob, new, re, StringIO, time import unittest, os, sys, glob, new, re, StringIO, time
from planet import feedparser from planet import feedparser
from planet.reconstitute import reconstitute from planet.reconstitute import reconstitute
from planet.scrub import scrub
testfiles = 'tests/data/reconstitute/%s.xml' testfiles = 'tests/data/reconstitute/%s.xml'
@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
# parse and reconstitute to a string # parse and reconstitute to a string
work = StringIO.StringIO() work = StringIO.StringIO()
results = feedparser.parse(data) results = feedparser.parse(data)
scrub(testfiles%name, results)
reconstitute(results, results.entries[0]).writexml(work) reconstitute(results, results.entries[0]).writexml(work)
# verify the results # verify the results

View File

@ -6,7 +6,7 @@ from planet.scrub import scrub
from planet import feedparser, config from planet import feedparser, config
feed = ''' feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'> <feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
<entry xml:lang="en"> <entry xml:lang="en">
<id>ignoreme</id> <id>ignoreme</id>
@ -15,7 +15,9 @@ feed = '''
<title>F&amp;ouml;o</title> <title>F&amp;ouml;o</title>
<summary>F&amp;ouml;o</summary> <summary>F&amp;ouml;o</summary>
<content>F&amp;ouml;o</content> <content>F&amp;ouml;o</content>
<link href="http://example.com/entry/1/"/>
<source> <source>
<link href="http://example.com/feed/"/>
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
</source> </source>
</entry> </entry>
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
data = deepcopy(base) data = deepcopy(base)
scrub('testfeed', data) scrub('testfeed', data)
self.assertEqual(0, len(data.entries)) self.assertEqual(0, len(data.entries))
def test_scrub_xmlbase(self):
base = feedparser.parse(feed)
self.assertEqual('http://example.com/',
base.entries[0].title_detail.base)
config.parser.readfp(StringIO.StringIO(configData))
config.parser.set('testfeed', 'xml_base', 'feed_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/feed/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'entry_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/entry/1/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'base/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/base/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.org/data/',
data.entries[0].title_detail.base)

View File

@ -35,7 +35,7 @@
<th>Name</th> <th>Name</th>
<th>Format</th> <th>Format</th>
<xsl:if test="//planet:ignore_in_feed | //planet:filters | <xsl:if test="//planet:ignore_in_feed | //planet:filters |
//planet:*[contains(local-name(),'_type')]"> //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
<th>Notes</th> <th>Notes</th>
</xsl:if> </xsl:if>
</tr> </tr>
@ -128,12 +128,12 @@
</a> </a>
</td> </td>
<td><xsl:value-of select="planet:format"/></td> <td><xsl:value-of select="planet:format"/></td>
<xsl:if test="planet:ignore_in_feed | planet:filters | <xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
planet:*[contains(local-name(),'_type')]"> planet:*[contains(local-name(),'_type')]">
<td> <td>
<dl> <dl>
<xsl:for-each select="planet:ignore_in_feed | planet:filters | <xsl:for-each select="planet:ignore_in_feed | planet:filters |
planet:*[contains(local-name(),'_type')]"> planet:xml_base | planet:*[contains(local-name(),'_type')]">
<xsl:sort select="local-name()"/> <xsl:sort select="local-name()"/>
<dt><xsl:value-of select="local-name()"/></dt> <dt><xsl:value-of select="local-name()"/></dt>
<dd><xsl:value-of select="."/></dd> <dd><xsl:value-of select="."/></dd>