Add xml:lang to list of scrubbable attributes
commit 2529bdd36a (parent fdaf129f9b)
@@ -78,8 +78,9 @@ be corrected automatically, and for these, there are configuration parameters
 that can be used to help.</p>
 <ul>
 <li><code>ignore_in_feed</code> allows you to list any number of elements
-which are to be ignored in feeds. This is often handy in the case of feeds
-where the <code>id</code> or <code>updated</code> values can't be trusted.</li>
+or attributes which are to be ignored in feeds. This is often handy in the
+case of feeds where the <code>id</code>, <code>updated</code> or
+<code>xml:lang</code> values can't be trusted.</li>
 <li><code>title_type</code>, <code>summary_type</code>,
 <code>content_type</code> allow you to override the
 <a href="http://www.feedparser.org/docs/reference-entry-title_detail.html#reference.entry.title_detail.type"><code>type</code></a>
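For readers who haven't used the option, a rough sketch of what the documentation above describes; the per-feed section name is invented here, and building it with ConfigParser simply mirrors how the new test helper later in this commit assembles its configuration:

    import ConfigParser   # Python 2, as used throughout this codebase

    parser = ConfigParser.ConfigParser()
    parser.add_section('Planet')
    parser.add_section('http://example.com/feed.atom')   # hypothetical feed section
    # elements *and* attributes to drop from this (untrusted) feed
    parser.set('http://example.com/feed.atom', 'ignore_in_feed', 'id updated xml:lang')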
@@ -159,7 +159,7 @@ def content(xentry, name, detail, bozo):
         xcontent.setAttribute('type', 'html')
         xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
 
-    if detail.language:
+    if detail.get("language"):
         xcontent.setAttribute('xml:lang', detail.language)
 
     xentry.appendChild(xcontent)
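The one-line change above replaces attribute access with a dict-style lookup. A minimal sketch of why that matters, with a plain dict standing in for the feedparser detail object (which is dict-like, and whose attribute lookup fails outright when the feed never supplied a language):

    detail = {'type': 'text/html', 'value': '<p>hi</p>'}   # no 'language' key

    if detail.get("language"):          # an absent key is simply falsy
        print 'copy the value into xml:lang'
    else:
        print 'nothing to copy'         # this branch runs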
@@ -63,10 +63,16 @@ def scrub(feed, data):
 
     # some data is not trustworthy
     for tag in config.ignore_in_feed(feed).split():
+        if tag.find('lang')>=0: tag='language'
         if data.feed.has_key(tag): del data.feed[tag]
         for entry in data.entries:
             if entry.has_key(tag): del entry[tag]
             if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
             if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+            for key in entry.keys():
+                if not key.endswith('_detail'): continue
+                for detail in entry[key].copy():
+                    if detail == tag: del entry[key][detail]
 
     # adjust title types
     if config.title_type(feed):
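feedparser reports xml:lang as a `language` member of the feed and of each `*_detail` dict, which is why the loop above maps `xml:lang` to `language` and then sweeps the detail dicts. A hand-built stand-in (not real feedparser output) showing that sweep in isolation:

    entry = {
        'title': 'Example',
        'title_detail':   {'type': 'text/plain', 'value': 'Example', 'language': 'en'},
        'summary_detail': {'type': 'text/html', 'value': '<p>...</p>', 'language': 'en'},
    }

    tag = 'xml:lang'
    if tag.find('lang') >= 0: tag = 'language'   # same mapping as the scrub loop

    for key in entry.keys():
        if not key.endswith('_detail'): continue
        for detail in entry[key].copy():
            if detail == tag: del entry[key][detail]

    print entry['summary_detail']   # the 'language' entry is gone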
tests/reconstitute.py (new file, 51 lines)
@@ -0,0 +1,51 @@
#!/usr/bin/env python
import os, sys, ConfigParser, shutil, glob
venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0,venus_base)

if __name__ == "__main__":

    if sys.argv[1] == '-v' or sys.argv[1] == '--verbose':
        import planet
        planet.getLogger('DEBUG',None)
        del sys.argv[1]

    from planet import config
    config.parser = ConfigParser.ConfigParser()
    config.parser.add_section('Planet')
    config.parser.add_section(sys.argv[1])
    work = reduce(os.path.join, ['tests','work','reconsititute'], venus_base)
    output = os.path.join(work, 'output')
    config.parser.set('Planet','cache_directory',work)
    config.parser.set('Planet','output_dir',output)
    config.parser.set('Planet','template_files','themes/common/atom.xml.xslt')

    for name, value in zip(sys.argv[2::2],sys.argv[3::2]):
        config.parser.set(sys.argv[1], name.lstrip('-'), value)

    from planet import spider
    spider.spiderPlanet(only_if_new=False)

    from planet import feedparser
    for source in glob.glob(os.path.join(work, 'sources/*')):
        feed = feedparser.parse(source).feed
        if feed.has_key('title'):
            config.parser.set('Planet','name',feed.title_detail.value)
        if feed.has_key('link'):
            config.parser.set('Planet','link',feed.link)
        if feed.has_key('author_detail'):
            if feed.author_detail.has_key('name'):
                config.parser.set('Planet','owner_name',feed.author_detail.name)
            if feed.author_detail.has_key('email'):
                config.parser.set('Planet','owner_email',feed.author_detail.email)

    from planet import splice
    doc = splice.splice()
    splice.apply(doc.toxml('utf-8'))

    atom = open(os.path.join(output,'atom.xml')).read()

    shutil.rmtree(work)
    os.removedirs(os.path.dirname(work))

    print atom
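The helper above builds a one-feed configuration from its command line, spiders it, splices the result, and prints the generated atom.xml. How the trailing name/value pairs become per-feed settings can be seen in isolation; the feed path and option below are invented for illustration:

    import sys
    sys.argv = ['reconstitute.py', 'tests/data/some-feed.xml',
                '--ignore_in_feed', 'id updated xml:lang']

    # the same slicing the helper uses: names at even offsets, values at odd
    for name, value in zip(sys.argv[2::2], sys.argv[3::2]):
        print name.lstrip('-'), '=', value   # ignore_in_feed = id updated xml:lang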
@@ -7,7 +7,7 @@ from planet import feedparser, config
 feed = '''
 <feed xmlns='http://www.w3.org/2005/Atom'>
   <author><name>F&ouml;o</name></author>
-  <entry>
+  <entry xml:lang="en">
     <id>ignoreme</id>
     <author><name>F&ouml;o</name></author>
     <updated>2000-01-01T00:00:00Z</updated>
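To see what the added attribute turns into once this test feed is parsed, something like the following should work (feed trimmed to the relevant parts; the expected value is assumed from feedparser's usual xml:lang handling, which the assertions further down rely on):

    from planet import feedparser   # the copy bundled with Venus

    data = feedparser.parse('''<feed xmlns="http://www.w3.org/2005/Atom">
      <entry xml:lang="en"><summary>not much</summary></entry>
    </feed>''')

    print data.entries[0].summary_detail.get('language')   # expected: en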
@@ -23,7 +23,7 @@ feed = '''
 
 configData = '''
 [testfeed]
-ignore_in_feed = id updated
+ignore_in_feed = id updated xml:lang
 name_type = html
 title_type = html
 summary_type = html
@@ -40,12 +40,14 @@ class ScrubTest(unittest.TestCase):
         self.assertTrue(data.entries[0].has_key('id'))
         self.assertTrue(data.entries[0].has_key('updated'))
         self.assertTrue(data.entries[0].has_key('updated_parsed'))
+        self.assertTrue(data.entries[0].summary_detail.has_key('language'))
 
         scrub('testfeed', data)
 
         self.assertFalse(data.entries[0].has_key('id'))
         self.assertFalse(data.entries[0].has_key('updated'))
         self.assertFalse(data.entries[0].has_key('updated_parsed'))
+        self.assertFalse(data.entries[0].summary_detail.has_key('language'))
 
         self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
         self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)