Add xml:lang to list of scrubbable attributes

2006-10-25 12:20:28 -04:00 · 2006-10-25 12:20:28 -04:00 · 2529bdd36a
commit 2529bdd36a
parent fdaf129f9b
5 changed files with 65 additions and 5 deletions
--- a/docs/normalization.html
+++ b/docs/normalization.html
@@ -78,8 +78,9 @@ be corrected automatically, and for these, there are configuration parameters
 that can be used to help.</p>
 <ul>
 <li><code>ignore_in_feed</code> allows you to list any number of elements
-which are to be ignored in feeds.  This is often handy in the case of feeds
-where the <code>id</code> or <code>updated</code> values can't be trusted.</li>
+or attributes which are to be ignored in feeds.  This is often handy in the
+case of feeds where the <code>id</code>, <code>updated</code> or
+<code>xml:lang</code> values can't be trusted.</li>
 <li><code>title_type</code>, <code>summary_type</code>,
 <code>content_type</code> allow you to override the 
 <a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -159,7 +159,7 @@ def content(xentry, name, detail, bozo):
        xcontent.setAttribute('type', 'html')
        xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))

-    if detail.language:
+    if detail.get("language"):
        xcontent.setAttribute('xml:lang', detail.language)

    xentry.appendChild(xcontent)
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -63,10 +63,16 @@ def scrub(feed, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed).split():
+        if tag.find('lang')>=0: tag='language'
+        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+            for key in entry.keys():
+                if not key.endswith('_detail'): continue
+                for detail in entry[key].copy():
+                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed):
--- a/tests/reconstitute.py
+++ b/tests/reconstitute.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import os, sys, ConfigParser, shutil, glob
+venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # parent of the directory that contains this script
+sys.path.insert(0,venus_base)  # search venus_base first when importing (e.g. the "planet" package below)
+# Command-line driver: builds a throwaway Planet config from argv, spiders, splices, and prints output/atom.xml.
+if __name__ == "__main__":
+
+    if sys.argv[1] == '-v' or sys.argv[1] == '--verbose':  # optional leading flag; NOTE(review): IndexError if run with no arguments
+        import planet
+        planet.getLogger('DEBUG',None)  # presumably switches planet's logger to DEBUG level - confirm against planet.getLogger
+        del sys.argv[1]  # drop the flag so sys.argv[1] is the next argument
+
+    from planet import config
+    config.parser = ConfigParser.ConfigParser()  # replace planet's config parser with a fresh, empty one
+    config.parser.add_section('Planet')
+    config.parser.add_section(sys.argv[1])  # argv[1] becomes a section name (presumably the feed to fetch - confirm)
+    work = reduce(os.path.join, ['tests','work','reconsititute'], venus_base)  # venus_base/tests/work/reconsititute (spelling as in the code)
+    output = os.path.join(work, 'output')
+    config.parser.set('Planet','cache_directory',work)
+    config.parser.set('Planet','output_dir',output)
+    config.parser.set('Planet','template_files','themes/common/atom.xml.xslt')
+
+    for name, value in zip(sys.argv[2::2],sys.argv[3::2]):  # remaining arguments are consumed as name/value pairs
+        config.parser.set(sys.argv[1], name.lstrip('-'), value)  # leading dashes are stripped from the option name
+
+    from planet import spider
+    spider.spiderPlanet(only_if_new=False)  # presumably fetches the configured feed(s) into the cache directory - confirm
+
+    from planet import feedparser
+    for source in glob.glob(os.path.join(work, 'sources/*')):  # every file under <work>/sources is parsed as a feed
+        feed = feedparser.parse(source).feed
+        if feed.has_key('title'):  # copy feed-level metadata, when present, into the [Planet] section
+            config.parser.set('Planet','name',feed.title_detail.value)
+        if feed.has_key('link'):
+            config.parser.set('Planet','link',feed.link)
+        if feed.has_key('author_detail'):
+            if feed.author_detail.has_key('name'):
+                config.parser.set('Planet','owner_name',feed.author_detail.name)
+            if feed.author_detail.has_key('email'):
+                config.parser.set('Planet','owner_email',feed.author_detail.email)
+
+    from planet import splice
+    doc = splice.splice()  # presumably builds the combined feed document - confirm against planet.splice
+    splice.apply(doc.toxml('utf-8'))  # presumably runs the configured template(s) and writes into output_dir - confirm
+
+    atom = open(os.path.join(output,'atom.xml')).read()  # NOTE(review): the file handle is never closed explicitly
+
+    shutil.rmtree(work)  # delete the whole work directory tree
+    os.removedirs(os.path.dirname(work))  # then remove tests/work and any parent directories that are now empty
+
+    print atom  # write the contents read from output/atom.xml to stdout
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -7,7 +7,7 @@ from planet import feedparser, config
 feed = '''
 <feed xmlns='http://www.w3.org/2005/Atom'>
  <author><name>F&amp;ouml;o</name></author>
-  <entry>
+  <entry xml:lang="en">
    <id>ignoreme</id>
    <author><name>F&amp;ouml;o</name></author>
    <updated>2000-01-01T00:00:00Z</updated>
@@ -23,7 +23,7 @@ feed = '''

 configData = '''
 [testfeed]
-ignore_in_feed = id updated
+ignore_in_feed = id updated xml:lang
 name_type = html
 title_type = html
 summary_type = html
@@ -40,12 +40,14 @@ class ScrubTest(unittest.TestCase):
        self.assertTrue(data.entries[0].has_key('id'))
        self.assertTrue(data.entries[0].has_key('updated'))
        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+        self.assertTrue(data.entries[0].summary_detail.has_key('language'))

        scrub('testfeed', data)

        self.assertFalse(data.entries[0].has_key('id'))
        self.assertFalse(data.entries[0].has_key('updated'))
        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+        self.assertFalse(data.entries[0].summary_detail.has_key('language'))

        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)