Resync with feedparser (and add support for RSS source element)

2010-05-11 19:57:40 -04:00 · 2010-05-11 19:57:40 -04:00 · 1bcee5cecf
commit 1bcee5cecf
parent 83447dcc23
3 changed files with 94 additions and 21 deletions
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@ -226,6 +226,14 @@ def source(xsource, source, bozo, format):
        author(xsource, 'contributor', contributor)

    links(xsource, source)
+    if not source.links and source.has_key('href'): #rss
+        xlink = xdoc.createElement('link')
+        xlink.setAttribute('href', source.get('href'))
+        xsource.appendChild(xlink)
+        if source.has_key('title'):
+            xtitle = xdoc.createElement('title')
+            xtitle.appendChild(xdoc.createTextNode(source.get('title')))
+            xsource.appendChild(xtitle)

    content(xsource, 'rights', source.get('rights_detail',None), bozo)
    content(xsource, 'subtitle', source.get('subtitle_detail',None), bozo)
--- a/planet/vendor/feedparser.py
+++ b/planet/vendor/feedparser.py
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 293 $"[11:14] + "-svn"
+__version__ = "4.2-pre-" + "$Revision: 308 $"[11:14] + "-svn"
 __license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@ -40,7 +40,8 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
-                    "Sam Ruby <http://intertwingly.net/>"]
+                    "Sam Ruby <http://intertwingly.net/>",
+                    "Ade Oshineye <http://blog.oshineye.com/>"]
 _debug = 0

 # HTTP "User-Agent" header to send to servers when downloading feeds.
@ -407,6 +408,8 @@ class _FeedParserMixin:
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                  'http://purl.org/rss/1.0/modules/link/':                'l',
                  'http://search.yahoo.com/mrss':                         'media',
+                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
+                  'http://search.yahoo.com/mrss/':                         'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
@ -547,7 +550,15 @@ class _FeedParserMixin:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
-            return self.push(prefix + suffix, 1)
+            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
+            unknown_tag = prefix + suffix
+            if len(attrsD) == 0:
+                # No attributes so merge it into the enclosing dictionary
+                return self.push(unknown_tag, 1)
+            else:
+                # Has attributes so create it in its own dictionary
+                context = self._getContext()
+                context[unknown_tag] = attrsD

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
@ -643,12 +654,19 @@ class _FeedParserMixin:
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
+            if k == -1:
+                # CDATA block began but didn't finish
+                k = len(self.rawdata)
+                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
-            return k+1
+            if k >= 0:
+                return k+1
+            else:
+                # Not a CDATA block: no closing '>' was found, so the declaration is incomplete (k is -1).
+                return k

    def mapContentType(self, contentType):
        contentType = contentType.lower()
@ -919,7 +937,10 @@ class _FeedParserMixin:
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
-        if not self.version:
+        #If we're here then this is an RSS feed.
+        #If we don't have a version or have a version that starts with something
+        #other than RSS then there's been a mistake. Correct it.
+        if not self.version or not self.version.startswith('rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
@ -1481,11 +1502,18 @@ class _FeedParserMixin:
            context['id'] = href
            
    def _start_source(self, attrsD):
+        if 'url' in attrsD:
+          # This means that we're processing a source element from an RSS 2.0 feed
+          self.sourcedata['href'] = attrsD[u'url']
+        self.push('source', 1)
        self.insource = 1
        self.hasTitle = 0

    def _end_source(self):
        self.insource = 0
+        value = self.pop('source')
+        if value:
+          self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

@ -1532,6 +1560,33 @@ class _FeedParserMixin:
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

+    def _start_media_content(self, attrsD):
+        context = self._getContext()
+        context.setdefault('media_content', [])
+        context['media_content'].append(attrsD)
+
+    def _start_media_thumbnail(self, attrsD):
+        context = self._getContext()
+        context.setdefault('media_thumbnail', [])
+        self.push('url', 1) # new
+        context['media_thumbnail'].append(attrsD)
+
+    def _end_media_thumbnail(self):
+        url = self.pop('url')
+        context = self._getContext()
+        if url != None and len(url.strip()) != 0:
+            if not context['media_thumbnail'][-1].has_key('url'):
+                context['media_thumbnail'][-1]['url'] = url
+
+    def _start_media_player(self, attrsD):
+        self.push('media_player', 0)
+        self._getContext()['media_player'] = FeedParserDict(attrsD)
+
+    def _end_media_player(self):
+        value = self.pop('media_player')
+        context = self._getContext()
+        context['media_player']['content'] = value
+
 if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
@ -1616,7 +1671,7 @@ if _XML_AVAILABLE:
        def error(self, exc):
            self.bozo = 1
            self.exc = exc
-            
+
        def fatalError(self, exc):
            self.error(exc)
            raise exc
@ -1624,15 +1679,18 @@ if _XML_AVAILABLE:
 class _BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile('''[<>'"]''')
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-    
+    elements_no_end_tag = [
+      'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 
+      'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
+      'source', 'track', 'wbr'
+    ]
+
    def __init__(self, encoding, type):
        self.encoding = encoding
        self.type = type
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)
-        
+
    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)
@ -1730,7 +1788,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
        self.pieces.append(text)
        
    def handle_comment(self, text):
@ -2257,12 +2315,16 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
        return _urljoin(self.baseuri, uri.strip())
    
    def unknown_starttag(self, tag, attrs):
+        if _debug:
+            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
-        
+
 def _resolveRelativeURIs(htmlSource, baseURI, encoding, type):
-    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
+    if _debug:
+        sys.stderr.write('entering _resolveRelativeURIs\n')
+
    p = _RelativeURIResolver(baseURI, encoding, type)
    p.feed(htmlSource)
    return p.output()
@ -2475,7 +2537,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
-        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
+        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
@ -2721,7 +2784,8 @@ _iso8601_re = [
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}(\.\d*)?))?'
+    + r'(:(?P<second>\d{2}))?'
+    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
 del tmpl
@ -3352,7 +3416,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
-        data = ''
+        data = None
        f = None

    # if feed is gzip-compressed, decompress it
@ -3410,8 +3474,9 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)
-        
-    result['version'], data, entities = _stripDoctype(data)
+
+    if data is not None:
+        result['version'], data, entities = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)
@ -3424,7 +3489,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
        return result

    # if there was a problem downloading, we're done
-    if not data:
+    if data is None:
        return result

    # determine character encoding
--- a/tests/data/reconstitute/rss_source.xml
+++ b/tests/data/reconstitute/rss_source.xml
@ -1,6 +1,6 @@
 <!--
 Description:  source element
-Expect:       source.title == 'foo' 
+Expect:       source.title == 'org' 
 -->

 <rss version="2.0">