Add xml:lang to list of scrubbable attributes

This commit is contained in:
Sam Ruby 2006-10-25 12:20:28 -04:00
parent fdaf129f9b
commit 2529bdd36a
5 changed files with 65 additions and 5 deletions

View File

@ -78,8 +78,9 @@ be corrected automatically, and for these, there are configuration parameters
that can be used to help.</p>
<ul>
<li><code>ignore_in_feed</code> allows you to list any number of elements
which are to be ignored in feeds. This is often handy in the case of feeds
where the <code>id</code> or <code>updated</code> values can't be trusted.</li>
or attributes which are to be ignored in feeds. This is often handy in the
case of feeds where the <code>id</code>, <code>updated</code> or
<code>xml:lang</code> values can't be trusted.</li>
<li><code>title_type</code>, <code>summary_type</code>,
<code>content_type</code> allow you to override the
<a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>

View File

@ -159,7 +159,7 @@ def content(xentry, name, detail, bozo):
xcontent.setAttribute('type', 'html')
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
if detail.language:
if detail.get("language"):
xcontent.setAttribute('xml:lang', detail.language)
xentry.appendChild(xcontent)

View File

@ -63,10 +63,16 @@ def scrub(feed, data):
# some data is not trustworthy
for tag in config.ignore_in_feed(feed).split():
if tag.find('lang')>=0: tag='language'
if data.feed.has_key(tag): del data.feed[tag]
for entry in data.entries:
if entry.has_key(tag): del entry[tag]
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
for key in entry.keys():
if not key.endswith('_detail'): continue
for detail in entry[key].copy():
if detail == tag: del entry[key][detail]
# adjust title types
if config.title_type(feed):

51
tests/reconstitute.py Normal file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
import os, sys, ConfigParser, shutil, glob
# Root of the source tree: this script lives one directory below it.  It is
# put first on sys.path so the in-tree `planet` package is the one imported.
venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,venus_base)

if __name__ == "__main__":
    # Spider a single subscription into a scratch cache, splice the result,
    # run it through the Atom template and write the generated atom.xml to
    # stdout.
    #
    # Command line: [-v|--verbose] section [--name value ...]
    #   section       name of the config section to create for the subscription
    #   --name value  options set on that section (leading dashes are stripped)

    # an optional leading -v/--verbose switch turns on DEBUG logging
    if len(sys.argv) > 1 and sys.argv[1] in ('-v', '--verbose'):
        import planet
        planet.getLogger('DEBUG',None)
        del sys.argv[1]

    # the section name is mandatory: report usage instead of an IndexError
    if len(sys.argv) < 2:
        sys.stderr.write(
            'usage: %s [-v|--verbose] section [--name value ...]\n' %
            sys.argv[0])
        sys.exit(2)
    section = sys.argv[1]

    # build the configuration in memory rather than reading an ini file
    from planet import config
    config.parser = ConfigParser.ConfigParser()
    config.parser.add_section('Planet')
    config.parser.add_section(section)

    # scratch area for the cache and the generated output
    work = os.path.join(venus_base, 'tests', 'work', 'reconsititute')
    output = os.path.join(work, 'output')

    config.parser.set('Planet','cache_directory',work)
    config.parser.set('Planet','output_dir',output)
    config.parser.set('Planet','template_files','themes/common/atom.xml.xslt')

    # remaining arguments are name/value pairs for the subscription section;
    # zip() drops a trailing name that has no value
    for name, value in zip(sys.argv[2::2],sys.argv[3::2]):
        config.parser.set(section, name.lstrip('-'), value)

    try:
        from planet import spider
        spider.spiderPlanet(only_if_new=False)

        # copy feed-level metadata from the cached sources into [Planet]
        from planet import feedparser
        for source in glob.glob(os.path.join(work, 'sources/*')):
            feed = feedparser.parse(source).feed
            if feed.has_key('title'):
                config.parser.set('Planet','name',feed.title_detail.value)
            if feed.has_key('link'):
                config.parser.set('Planet','link',feed.link)
            if feed.has_key('author_detail'):
                if feed.author_detail.has_key('name'):
                    config.parser.set('Planet','owner_name',
                        feed.author_detail.name)
                if feed.author_detail.has_key('email'):
                    config.parser.set('Planet','owner_email',
                        feed.author_detail.email)

        from planet import splice
        doc = splice.splice()
        splice.apply(doc.toxml('utf-8'))

        # read the generated document, making sure the handle is closed
        atom_file = open(os.path.join(output,'atom.xml'))
        try:
            atom = atom_file.read()
        finally:
            atom_file.close()
    finally:
        # always discard the scratch area, even when a step above failed
        if os.path.isdir(work):
            shutil.rmtree(work)
        # prune the parent directories that are now empty; a parent that
        # still has other content is simply left in place
        try:
            os.removedirs(os.path.dirname(work))
        except OSError:
            pass

    # emit the document followed by a newline
    sys.stdout.write(atom + '\n')

View File

@ -7,7 +7,7 @@ from planet import feedparser, config
feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'>
<author><name>F&amp;ouml;o</name></author>
<entry>
<entry xml:lang="en">
<id>ignoreme</id>
<author><name>F&amp;ouml;o</name></author>
<updated>2000-01-01T00:00:00Z</updated>
@ -23,7 +23,7 @@ feed = '''
configData = '''
[testfeed]
ignore_in_feed = id updated
ignore_in_feed = id updated xml:lang
name_type = html
title_type = html
summary_type = html
@ -40,12 +40,14 @@ class ScrubTest(unittest.TestCase):
self.assertTrue(data.entries[0].has_key('id'))
self.assertTrue(data.entries[0].has_key('updated'))
self.assertTrue(data.entries[0].has_key('updated_parsed'))
self.assertTrue(data.entries[0].summary_detail.has_key('language'))
scrub('testfeed', data)
self.assertFalse(data.entries[0].has_key('id'))
self.assertFalse(data.entries[0].has_key('updated'))
self.assertFalse(data.entries[0].has_key('updated_parsed'))
self.assertFalse(data.entries[0].summary_detail.has_key('language'))
self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)