xml_base overrides

2007-01-22 13:46:45 -05:00 · 2007-01-22 13:46:45 -05:00 · 77d15d22cf
commit 77d15d22cf
parent 631dd44ff0
8 changed files with 99 additions and 9 deletions
--- a/docs/normalization.html
+++ b/docs/normalization.html
@ -95,6 +95,13 @@ attributes on these elements.</li>
 <li>Anything else (i.e.. the default) will leave the date as is, causing the entries that contain these dates sort to the top of the planet until the time passes.</li>
 </ul>
 </li>
+<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>).  Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
+<ul style="margin:0">
+<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
+<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
+<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>.  These values may be relative or absolute.  If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
+</ul>
+</li>
 </ul>
 </body>
 </html>
--- a/planet/init.py
+++ b/planet/init.py
@ -30,5 +30,7 @@ def getLogger(level, format):

    return logger

-
-
+# Configure feed parser
+from planet import feedparser
+feedparser.SANITIZE_HTML=0
+feedparser.RESOLVE_RELATIVE_URIS=0
--- a/planet/config.py
+++ b/planet/config.py
@ -125,6 +125,7 @@ def __init__():
    define_tmpl('summary_type', '')
    define_tmpl('content_type', '')
    define_tmpl('future_dates', 'keep')
+    define_tmpl('xml_base', '')

 def load(config_file):
    """ initialize and load a configuration"""
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
 # if TIDY_MARKUP = 1
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
 # ---------- required modules (should come with any Python distribution) ----------
 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
 try:
@ -732,7 +740,7 @@ class _FeedParserMixin:

        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
-        if is_htmlish:
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
                
@ -753,7 +761,7 @@ class _FeedParserMixin:
                    self._getContext()['vcard'] = vcard
        
        # sanitize embedded markup
-        if is_htmlish:
+        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

--- a/planet/scrub.py
+++ b/planet/scrub.py
@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
 import time
 # Planet modules
 import planet, config, shell
+from planet import feedparser

 type_map = {'text': 'text/plain', 'html': 'text/html',
    'xhtml': 'application/xhtml+xml'}
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
          or entry['published_parsed'] <= now) and
        (not entry.has_key('updated_parsed') or not entry['updated_parsed']
          or entry['updated_parsed'] <= now)]
+
+    scrub_xmlbase = config.xml_base(feed_uri)
+
+    # resolve relative URIs and sanitize
+    for entry in data.entries + [data.feed]:
+        for key in entry.keys():
+            if key == 'content':
+                node = entry.content[0]
+            elif key.endswith('_detail'):
+                node = entry[key]
+            else:
+                continue
+
+            if not node.has_key('type'): continue
+            if not 'html' in node['type']: continue
+            if not node.has_key('value'): continue
+
+            if node.has_key('base'):
+                if scrub_xmlbase:
+                    if scrub_xmlbase == 'feed_alternate':
+                        if entry.has_key('source') and \
+                            entry.source.has_key('link'):
+                            node['base'] = entry.source.link
+                        elif data.feed.has_key('link'):
+                            node['base'] = data.feed.link
+                    elif scrub_xmlbase == 'entry_alternate':
+                        if entry.has_key('link'):
+                            node['base'] = entry.link
+                    else:
+                        node['base'] = feedparser._urljoin(
+                            node['base'], scrub_xmlbase)
+
+                node['value'] = feedparser._resolveRelativeURIs(
+                    node.value, node.base, 'utf-8', node.type)
+
+            node['value'] = feedparser._sanitizeHTML(
+                node.value, 'utf-8', node.type)
--- a/tests/test_reconstitute.py
+++ b/tests/test_reconstitute.py
@ -3,6 +3,7 @@
 import unittest, os, sys, glob, new, re, StringIO, time
 from planet import feedparser
 from planet.reconstitute import reconstitute
+from planet.scrub import scrub

 testfiles = 'tests/data/reconstitute/%s.xml'

@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
        # parse and reconstitute to a string
        work = StringIO.StringIO()
        results = feedparser.parse(data)
+        scrub(testfiles%name, results)
        reconstitute(results, results.entries[0]).writexml(work)

        # verify the results
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@ -6,7 +6,7 @@ from planet.scrub import scrub
 from planet import feedparser, config

 feed = '''
-<feed xmlns='http://www.w3.org/2005/Atom'>
+<feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
  <author><name>F&amp;ouml;o</name></author>
  <entry xml:lang="en">
    <id>ignoreme</id>
@ -15,7 +15,9 @@ feed = '''
    <title>F&amp;ouml;o</title>
    <summary>F&amp;ouml;o</summary>
    <content>F&amp;ouml;o</content>
+    <link href="http://example.com/entry/1/"/>
    <source>
+      <link href="http://example.com/feed/"/>
      <author><name>F&amp;ouml;o</name></author>
    </source>
  </entry>
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual(0, len(data.entries))
+
+    def test_scrub_xmlbase(self):
+        base = feedparser.parse(feed)
+        self.assertEqual('http://example.com/',
+             base.entries[0].title_detail.base)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'xml_base', 'feed_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/feed/',
+             data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'entry_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/entry/1/',
+             data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'base/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/base/',
+             data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.org/data/',
+             data.entries[0].title_detail.base)
--- a/themes/common/validate.html.xslt
+++ b/themes/common/validate.html.xslt
@ -35,7 +35,7 @@
              <th>Name</th>
              <th>Format</th>
              <xsl:if test="//planet:ignore_in_feed | //planet:filters |
-                //planet:*[contains(local-name(),'_type')]">
+                //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
                <th>Notes</th>
              </xsl:if>
            </tr>
@ -128,12 +128,12 @@
        </a>
      </td>
      <td><xsl:value-of select="planet:format"/></td>
-      <xsl:if test="planet:ignore_in_feed | planet:filters |
+      <xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
        planet:*[contains(local-name(),'_type')]">
        <td>
          <dl>
            <xsl:for-each select="planet:ignore_in_feed | planet:filters |
-              planet:*[contains(local-name(),'_type')]">
+              planet:xml_base | planet:*[contains(local-name(),'_type')]">
              <xsl:sort select="local-name()"/>
              <dt><xsl:value-of select="local-name()"/></dt>
              <dd><xsl:value-of select="."/></dd>