Add xml:lang to list of scrubbable attributes

This commit is contained in:
Sam Ruby 2006-10-25 12:20:28 -04:00
parent fdaf129f9b
commit 2529bdd36a
5 changed files with 65 additions and 5 deletions

View File

@ -78,8 +78,9 @@ be corrected automatically, and for these, there are configuration parameters
that can be used to help.</p> that can be used to help.</p>
<ul> <ul>
<li><code>ignore_in_feed</code> allows you to list any number of elements <li><code>ignore_in_feed</code> allows you to list any number of elements
which are to be ignored in feeds. This is often handy in the case of feeds or attributes which are to be ignored in feeds. This is often handy in the
where the <code>id</code> or <code>updated</code> values can't be trusted.</li> case of feeds where the <code>id</code>, <code>updated</code> or
<code>xml:lang</code> values can't be trusted.</li>
<li><code>title_type</code>, <code>summary_type</code>, <li><code>title_type</code>, <code>summary_type</code>,
<code>content_type</code> allow you to override the <code>content_type</code> allow you to override the
<a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a> <a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>

View File

@ -159,7 +159,7 @@ def content(xentry, name, detail, bozo):
xcontent.setAttribute('type', 'html') xcontent.setAttribute('type', 'html')
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8'))) xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
if detail.language: if detail.get("language"):
xcontent.setAttribute('xml:lang', detail.language) xcontent.setAttribute('xml:lang', detail.language)
xentry.appendChild(xcontent) xentry.appendChild(xcontent)

View File

@ -63,10 +63,16 @@ def scrub(feed, data):
# some data is not trustworthy # some data is not trustworthy
for tag in config.ignore_in_feed(feed).split(): for tag in config.ignore_in_feed(feed).split():
if tag.find('lang')>=0: tag='language'
if data.feed.has_key(tag): del data.feed[tag]
for entry in data.entries: for entry in data.entries:
if entry.has_key(tag): del entry[tag] if entry.has_key(tag): del entry[tag]
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"] if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"] if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
for key in entry.keys():
if not key.endswith('_detail'): continue
for detail in entry[key].copy():
if detail == tag: del entry[key][detail]
# adjust title types # adjust title types
if config.title_type(feed): if config.title_type(feed):

51
tests/reconstitute.py Normal file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
import os, sys, ConfigParser, shutil, glob
# Directory two levels above this file; it is put first on sys.path so the
# `planet` package imported below resolves from this checkout (presumably
# the Venus source tree root -- confirm against the repository layout).
venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,venus_base)

# Command line driver: build a throwaway single-section planet configuration,
# spider it, splice the result, run the Atom template over it and write the
# generated atom.xml to stdout.
#
# Usage: reconstitute.py [-v|--verbose] section [--option value ...]
#
#   section          name of the config section to create (sys.argv[1]);
#                    presumably the feed URI to reconstitute -- confirm
#   --option value   pairs stored as options of that section, with leading
#                    dashes stripped from the option name
if __name__ == "__main__":

    # Optional leading flag: enable debug logging, then drop the flag so the
    # section name is always found at sys.argv[1] from here on.
    if len(sys.argv) > 1 and sys.argv[1] in ('-v', '--verbose'):
        import planet
        planet.getLogger('DEBUG',None)
        del sys.argv[1]

    # Without a section name there is nothing to do; fail with a usage
    # message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.stderr.write(
            'Usage: %s [-v|--verbose] section [--option value ...]\n'
            % sys.argv[0])
        sys.exit(2)

    # Minimal in-memory configuration: a [Planet] section pointing at a
    # scratch directory under tests/work, plus one section named after the
    # first argument.
    from planet import config
    config.parser = ConfigParser.ConfigParser()
    config.parser.add_section('Planet')
    config.parser.add_section(sys.argv[1])
    work = reduce(os.path.join, ['tests','work','reconsititute'], venus_base)
    output = os.path.join(work, 'output')
    config.parser.set('Planet','cache_directory',work)
    config.parser.set('Planet','output_dir',output)
    config.parser.set('Planet','template_files','themes/common/atom.xml.xslt')

    # Remaining arguments are consumed pairwise as option name / value; a
    # trailing name without a value is silently dropped by zip().
    for name, value in zip(sys.argv[2::2],sys.argv[3::2]):
        config.parser.set(sys.argv[1], name.lstrip('-'), value)

    # only_if_new=False is passed through to the spider unchanged.
    from planet import spider
    spider.spiderPlanet(only_if_new=False)

    # Copy title, link and author details found in the files under
    # <work>/sources into the [Planet] section; when several files are
    # present, values from later ones overwrite earlier ones.
    from planet import feedparser
    for source in glob.glob(os.path.join(work, 'sources/*')):
        feed = feedparser.parse(source).feed
        if feed.has_key('title'):
            config.parser.set('Planet','name',feed.title_detail.value)
        if feed.has_key('link'):
            config.parser.set('Planet','link',feed.link)
        if feed.has_key('author_detail'):
            if feed.author_detail.has_key('name'):
                config.parser.set('Planet','owner_name',feed.author_detail.name)
            if feed.author_detail.has_key('email'):
                config.parser.set('Planet','owner_email',feed.author_detail.email)

    # Splice the planet document and apply the configured template to its
    # UTF-8 serialization.
    from planet import splice
    doc = splice.splice()
    splice.apply(doc.toxml('utf-8'))

    # Read the generated file before the scratch directory is deleted, and
    # close the handle explicitly rather than leaving it to the collector.
    atom_file = open(os.path.join(output,'atom.xml'))
    try:
        atom = atom_file.read()
    finally:
        atom_file.close()

    # Remove the scratch directory, then prune its parent (tests/work) and
    # any further parents that are left empty.
    shutil.rmtree(work)
    os.removedirs(os.path.dirname(work))

    # Emit the document followed by a newline, as `print atom` would.
    sys.stdout.write(atom)
    sys.stdout.write('\n')

View File

@ -7,7 +7,7 @@ from planet import feedparser, config
feed = ''' feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'> <feed xmlns='http://www.w3.org/2005/Atom'>
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
<entry> <entry xml:lang="en">
<id>ignoreme</id> <id>ignoreme</id>
<author><name>F&amp;ouml;o</name></author> <author><name>F&amp;ouml;o</name></author>
<updated>2000-01-01T00:00:00Z</updated> <updated>2000-01-01T00:00:00Z</updated>
@ -23,7 +23,7 @@ feed = '''
configData = ''' configData = '''
[testfeed] [testfeed]
ignore_in_feed = id updated ignore_in_feed = id updated xml:lang
name_type = html name_type = html
title_type = html title_type = html
summary_type = html summary_type = html
@ -40,12 +40,14 @@ class ScrubTest(unittest.TestCase):
self.assertTrue(data.entries[0].has_key('id')) self.assertTrue(data.entries[0].has_key('id'))
self.assertTrue(data.entries[0].has_key('updated')) self.assertTrue(data.entries[0].has_key('updated'))
self.assertTrue(data.entries[0].has_key('updated_parsed')) self.assertTrue(data.entries[0].has_key('updated_parsed'))
self.assertTrue(data.entries[0].summary_detail.has_key('language'))
scrub('testfeed', data) scrub('testfeed', data)
self.assertFalse(data.entries[0].has_key('id')) self.assertFalse(data.entries[0].has_key('id'))
self.assertFalse(data.entries[0].has_key('updated')) self.assertFalse(data.entries[0].has_key('updated'))
self.assertFalse(data.entries[0].has_key('updated_parsed')) self.assertFalse(data.entries[0].has_key('updated_parsed'))
self.assertFalse(data.entries[0].summary_detail.has_key('language'))
self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name) self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name) self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)