diff --git a/docs/normalization.html b/docs/normalization.html
index 5577899..de73812 100644
--- a/docs/normalization.html
+++ b/docs/normalization.html
@@ -78,8 +78,9 @@ be corrected automatically, and for these, there are configuration parameters that can be used to help.
ignore_in_feed allows you to list any number of elements
-which are to be ignored in feeds. This is often handy in the case of feeds
-where the id or updated values can't be trusted.
+which are to be ignored in feeds. This is often handy in the case of feeds
+where the id, updated or
+xml:lang values can't be trusted.
title_type, summary_type,
content_type allow you to override the
type
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 3707d6d..1cee1bb 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -159,7 +159,7 @@ def content(xentry, name, detail, bozo):
xcontent.setAttribute('type', 'html')
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
- if detail.language:
+ if detail.get("language"):
xcontent.setAttribute('xml:lang', detail.language)
xentry.appendChild(xcontent)
diff --git a/planet/spider.py b/planet/spider.py
index ce473ee..3722528 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -63,10 +63,16 @@ def scrub(feed, data):
# some data is not trustworthy
for tag in config.ignore_in_feed(feed).split():
+ if tag.find('lang')>=0: tag='language'
+ if data.feed.has_key(tag): del data.feed[tag]
for entry in data.entries:
if entry.has_key(tag): del entry[tag]
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+ for key in entry.keys():
+ if not key.endswith('_detail'): continue
+ for detail in entry[key].copy():
+ if detail == tag: del entry[key][detail]
# adjust title types
if config.title_type(feed):
diff --git a/tests/reconstitute.py b/tests/reconstitute.py
new file mode 100644
index 0000000..f2b71ab
--- /dev/null
+++ b/tests/reconstitute.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import os, sys, ConfigParser, shutil, glob
+venus_base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0,venus_base)
+
+if __name__ == "__main__":
+
+ if sys.argv[1] == '-v' or sys.argv[1] == '--verbose':
+ import planet
+ planet.getLogger('DEBUG',None)
+ del sys.argv[1]
+
+ from planet import config
+ config.parser = ConfigParser.ConfigParser()
+ config.parser.add_section('Planet')
+ config.parser.add_section(sys.argv[1])
+ work = reduce(os.path.join, ['tests','work','reconsititute'], venus_base)
+ output = os.path.join(work, 'output')
+ config.parser.set('Planet','cache_directory',work)
+ config.parser.set('Planet','output_dir',output)
+ config.parser.set('Planet','template_files','themes/common/atom.xml.xslt')
+
+ for name, value in zip(sys.argv[2::2],sys.argv[3::2]):
+ config.parser.set(sys.argv[1], name.lstrip('-'), value)
+
+ from planet import spider
+ spider.spiderPlanet(only_if_new=False)
+
+ from planet import feedparser
+ for source in glob.glob(os.path.join(work, 'sources/*')):
+ feed = feedparser.parse(source).feed
+ if feed.has_key('title'):
+ config.parser.set('Planet','name',feed.title_detail.value)
+ if feed.has_key('link'):
+ config.parser.set('Planet','link',feed.link)
+ if feed.has_key('author_detail'):
+ if feed.author_detail.has_key('name'):
+ config.parser.set('Planet','owner_name',feed.author_detail.name)
+ if feed.author_detail.has_key('email'):
+ config.parser.set('Planet','owner_email',feed.author_detail.email)
+
+ from planet import splice
+ doc = splice.splice()
+ splice.apply(doc.toxml('utf-8'))
+
+ atom = open(os.path.join(output,'atom.xml')).read()
+
+ shutil.rmtree(work)
+ os.removedirs(os.path.dirname(work))
+
+ print atom
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index dc94e05..7d9d1b0 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -7,7 +7,7 @@ from planet import feedparser, config
feed = '''