commit 215777b9ee
Latest from Sam.
@@ -95,6 +95,13 @@ attributes on these elements.</li>
 <li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
 </ul>
 </li>
+<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
+<ul style="margin:0">
+<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
+<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
+<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
+</ul>
+</li>
 </ul>
 </body>
 </html>
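The new option reads like any other per-subscription setting. A minimal sketch of how it might be exercised (the section name, feed URL, and value are hypothetical; Planet reads its configuration with the standard library ConfigParser, as the scrub tests later in this commit do):

    import ConfigParser, StringIO

    # A hypothetical subscription entry exercising the new option; valid
    # values are feed_alternate, entry_alternate, or a URI reference.
    ini = StringIO.StringIO(
        "[http://example.com/feed.atom]\n"
        "name = Example feed\n"
        "xml_base = feed_alternate\n")
    parser = ConfigParser.ConfigParser()
    parser.readfp(ini)
    print parser.get('http://example.com/feed.atom', 'xml_base')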
@@ -30,5 +30,7 @@ def getLogger(level, format):

     return logger

+# Configure feed parser
+from planet import feedparser
+feedparser.SANITIZE_HTML=0
+feedparser.RESOLVE_RELATIVE_URIS=0
@@ -125,6 +125,7 @@ def __init__():
     define_tmpl('summary_type', '')
     define_tmpl('content_type', '')
     define_tmpl('future_dates', 'keep')
+    define_tmpl('xml_base', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@@ -65,6 +65,14 @@ TIDY_MARKUP = 0
 # if TIDY_MARKUP = 1
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
 # ---------- required modules (should come with any Python distribution) ----------
 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
 try:
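A minimal sketch of how a consumer flips these module-level switches before parsing; this mirrors what planet/__init__.py does in the hunk above (the feed URL is a placeholder):

    from planet import feedparser

    # Planet does its own relative-URI resolution and sanitization in its
    # scrub stage, so it turns both switches off before any parse.
    feedparser.RESOLVE_RELATIVE_URIS = 0
    feedparser.SANITIZE_HTML = 0
    data = feedparser.parse('http://example.com/feed.atom')
    # text constructs now arrive raw: relative URIs unresolved, markup
    # unsanitized, ready for post-processing.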
@@ -732,7 +740,7 @@ class _FeedParserMixin:

         is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
         # resolve relative URIs within embedded markup
-        if is_htmlish:
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
             if element in self.can_contain_relative_uris:
                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

@@ -753,7 +761,7 @@ class _FeedParserMixin:
         self._getContext()['vcard'] = vcard

         # sanitize embedded markup
-        if is_htmlish:
+        if is_htmlish and SANITIZE_HTML:
             if element in self.can_contain_dangerous_markup:
                 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

@@ -1,3 +1,4 @@
+
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
@@ -553,6 +554,10 @@ class InBodyPhase(Phase):
     # the crazy mode
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

+        # Keep a ref to this for special handling of whitespace in <pre>
+        self.processSpaceCharactersNonPre = self.processSpaceCharacters
+
         self.startTagHandler = utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("script", "style"), self.startTagScriptStyle),
@@ -622,6 +627,15 @@ class InBodyPhase(Phase):
                 self.tree.openElements[-1])

     # the real deal
+    def processSpaceCharactersPre(self, data):
+        # Sometimes (start of <pre> blocks) we want to drop leading newlines
+        self.processSpaceCharacters = self.processSpaceCharactersNonPre
+        if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
+            and not self.tree.openElements[-1].hasContent()):
+            data = data[1:]
+        if data:
+            self.tree.insertText(data)
+
     def processCharacters(self, data):
         # XXX The specification says to do this for every character at the
         # moment, but apparently that doesn't match the real world so we don't
@@ -651,6 +665,8 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.endTagP("p")
         self.tree.insertElement(name, attributes)
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersPre

     def startTagForm(self, name, attributes):
         if self.tree.formPointer:
@@ -849,6 +865,9 @@ class InBodyPhase(Phase):
             self.parser.phase.processEndTag(name)

     def endTagBlock(self, name):
+        # Put us back in the right whitespace handling mode
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
         inScope = self.tree.elementInScope(name)
         if inScope:
             self.tree.generateImpliedEndTags()
@@ -11,11 +11,6 @@ References:
 * http://wiki.whatwg.org/wiki/HtmlVsXhtml

 @@TODO:
-* Produce SAX events based on the produced DOM. This is intended not to
-  support streaming, but rather to support application level compatibility.
-* Optional namespace support
-* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
-  indicates CDATA processing to ensure dual HTML/XHTML compatibility.
 * Selectively lowercase only XHTML, but not foreign markup
 """

@@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
             if token["data"]:
                 self.parseError(_("End tag contains unexpected attributes."))

+        elif token["type"] == "Comment":
+            # Rescue CDATA from the comments
+            if (token["data"].startswith("[CDATA[") and
+                token["data"].endswith("]]")):
+                token["type"] = "Characters"
+                token["data"] = token["data"][7:-2]
+
         return token

 class XHTMLParser(XMLParser):
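The comment-token rewrite above can be illustrated on a literal token; the dict shape is assumed from the surrounding tokenizer code:

    # "<![CDATA[1 < 2]]>" reaches the XML parser as a Comment token whose
    # data is "[CDATA[1 < 2]]"; the [7:-2] slice strips that wrapper.
    token = {"type": "Comment", "data": "[CDATA[1 < 2]]"}
    if (token["data"].startswith("[CDATA[") and
        token["data"].endswith("]]")):
        token["type"] = "Characters"
        token["data"] = token["data"][7:-2]
    print token   # {'type': 'Characters', 'data': '1 < 2'}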
@@ -1,5 +1,6 @@
 import _base
-from xml.dom import minidom, Node
+from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
+import new

 import re
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
+        def hilite(self, encoding):
+            print 'foo'
+        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
+        setattr(self.dom, 'hilite', method)
         return self

     def doctypeClass(self,name):
@@ -129,3 +134,58 @@ def testSerializer(element):
     serializeElement(element, 0)

     return "\n".join(rv)

+def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
+    if node.nodeType == Node.ELEMENT_NODE:
+        if not nsmap:
+            handler.startElement(node.nodeName, node.attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElement(node.nodeName)
+        else:
+            attributes = dict(node.attributes.itemsNS())
+
+            # gather namespace declarations
+            prefixes = []
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if (attr.namespaceURI == XMLNS_NAMESPACE or
+                    (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
+                    prefix = (attr.localName != 'xmlns' and attr.localName or None)
+                    handler.startPrefixMapping(prefix, attr.nodeValue)
+                    prefixes.append(prefix)
+                    nsmap = nsmap.copy()
+                    nsmap[prefix] = attr.nodeValue
+                    del attributes[(attr.namespaceURI, attr.localName)]
+
+            # apply namespace declarations
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if attr.namespaceURI == None and ':' in attr.nodeName:
+                    prefix = attr.nodeName.split(':')[0]
+                    if nsmap.has_key(prefix):
+                        del attributes[(attr.namespaceURI, attr.localName)]
+                        attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+
+            # SAX events
+            ns = node.namespaceURI or nsmap.get(None,None)
+            handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElementNS((ns, node.nodeName), node.nodeName)
+            for prefix in prefixes: handler.endPrefixMapping(prefix)
+
+    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
+        handler.characters(node.nodeValue)
+
+    elif node.nodeType == Node.DOCUMENT_NODE:
+        handler.startDocument()
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+        handler.endDocument()
+
+    else:
+        # ATTRIBUTE_NODE
+        # ENTITY_NODE
+        # PROCESSING_INSTRUCTION_NODE
+        # COMMENT_NODE
+        # DOCUMENT_TYPE_NODE
+        # NOTATION_NODE
+        pass
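A short usage sketch for the new dom2sax helper: it replays a minidom tree into any SAX ContentHandler, here the standard library's XMLGenerator (the import path assumes the module above; the XML snippet is arbitrary):

    import sys
    from xml.dom import minidom
    from xml.sax.saxutils import XMLGenerator
    from planet.html5lib.treebuilders.dom import dom2sax

    doc = minidom.parseString("<doc><item>text</item></doc>")
    # dom2sax emits startDocument/startElementNS/characters/... events in
    # document order; XMLGenerator serializes them back out.
    dom2sax(doc, XMLGenerator(sys.stdout, 'utf-8'))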
@@ -1,208 +1,5 @@
-try:
-    from xml.etree import ElementTree
-except ImportError:
-    from elementtree import ElementTree
-
-[... some 200 further lines of the old ElementTree tree builder are removed
-here; the implementation moves, nearly verbatim, into the new
-planet/html5lib/treebuilders/etreefull.py shown below ...]
+import etreefull
+
+class TreeBuilder(etreefull.TreeBuilder):
     def getDocument(self):
-        return self.document._element
+        return self.document._element.find("html")
planet/html5lib/treebuilders/etreefull.py (new file, 216 lines)
try:
    from xml.etree import ElementTree
except ImportError:
    from elementtree import ElementTree

import _base

class Element(_base.Node):
    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        # Delete existing attributes first
        # XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        if not(len(self._element)):
            if not self._element.text:
                self._element.text = ""
            self._element.text += data
        elif insertBefore is None:
            # Insert the text as the tail of the last child element
            if not self._element[-1].tail:
                self._element[-1].tail = ""
            self._element[-1].tail += data
        else:
            # Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                if not self._element[index-1].tail:
                    self._element[index-1].tail = ""
                self._element[index-1].tail += data
            else:
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data

    def cloneNode(self):
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        if newParent.childNodes:
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            if not newParent._element.text:
                newParent._element.text = ""
            if self._element.text is not None:
                newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)

class Comment(Element):
    def __init__(self, data):
        # Use the superclass constructor to set all properties on the
        # wrapper element
        Element.__init__(self, None)
        self._element = ElementTree.Comment(data)

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    data = property(_getData, _setData)

class DocumentType(Element):
    def __init__(self, name):
        Element.__init__(self, DocumentType)
        self._element.text = name

class Document(Element):
    def __init__(self):
        Element.__init__(self, Document)

def testSerializer(element):
    rv = []
    finalText = None
    def serializeElement(element, indent=0):
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
            if element.tail:
                finalText = element.tail
        elif element.tag is ElementTree.Comment:
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.iteritems():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)

    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))

    return "\n".join(rv)

def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []
    finalText = None
    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText = element.tail

            for child in element.getchildren():
                serializeElement(child)

        elif element.tag is ElementTree.Comment:
            rv.append("<!--%s-->"%(element.text,))
        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element.getchildren():
                serializeElement(child)

            rv.append("</%s>"%(element.tag,))

        if element.tail:
            rv.append(element.tail)

    serializeElement(element)

    if finalText is not None:
        # append any text that trailed the root element
        rv.append(finalText)

    return "".join(rv)

class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.document._element
@@ -16,11 +16,13 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
     "Xavier Verges Farrero",
     "Jonathan Feinberg",
     "Blair Zajac",
-    "Sam Ruby"]
+    "Sam Ruby",
+    "Louis Nyffenegger"]
 __license__ = "MIT"
-__version__ = "$Rev: 217 $"
+__version__ = "$Rev: 227 $"

 import re
+import sys
 import md5
 import email
 import email.Utils
@@ -41,6 +43,12 @@ import hmac
 from gettext import gettext as _
 from socket import gaierror

+if sys.version_info >= (2,3):
+    from iri2uri import iri2uri
+else:
+    def iri2uri(uri):
+        return uri
+
 __all__ = ['Http', 'Response', 'HttpLib2Error',
   'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
   'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
@@ -51,7 +59,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error',
 debuglevel = 0

 # Python 2.3 support
-if 'sorted' not in __builtins__:
+if sys.version_info < (2,4):
     def sorted(seq):
         seq.sort()
         return seq
@@ -60,7 +68,6 @@ if 'sorted' not in __builtins__:
 def HTTPResponse__getheaders(self):
     """Return list of (header, value) tuples."""
     if self.msg is None:
-        print "================================"
         raise httplib.ResponseNotReady()
     return self.msg.items()

@@ -75,6 +82,8 @@ class RedirectLimit(HttpLib2Error): pass
 class FailedToDecompressContent(HttpLib2Error): pass
 class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
 class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
+class RelativeURIError(HttpLib2Error): pass
+class ServerNotFoundError(HttpLib2Error): pass

 # Open Items:
 # -----------
@@ -118,6 +127,8 @@ def parse_uri(uri):

 def urlnorm(uri):
     (scheme, authority, path, query, fragment) = parse_uri(uri)
+    if not scheme or not authority:
+        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
     authority = authority.lower()
     scheme = scheme.lower()
     if not path:
@@ -125,6 +136,7 @@ def urlnorm(uri):
     # Could do syntax based normalization of the URI before
     # computing the digest. See Section 6.2.2 of Std 66.
     request_uri = query and "?".join([path, query]) or path
+    scheme = scheme.lower()
     defrag_uri = scheme + "://" + authority + request_uri
     return scheme, authority, request_uri, defrag_uri

@@ -143,9 +155,10 @@ def safename(filename):
     try:
         if re_url_scheme.match(filename):
             if isinstance(filename,str):
-                filename=filename.decode('utf-8').encode('idna')
+                filename = filename.decode('utf-8')
+                filename = filename.encode('idna')
             else:
-                filename=filename.encode('idna')
+                filename = filename.encode('idna')
     except:
         pass
     if isinstance(filename,unicode):
@@ -260,16 +273,26 @@ def _entry_disposition(response_headers, request_headers):
         now = time.time()
         current_age = max(0, now - date)
         if cc_response.has_key('max-age'):
-            freshness_lifetime = int(cc_response['max-age'])
+            try:
+                freshness_lifetime = int(cc_response['max-age'])
+            except:
+                freshness_lifetime = 0
         elif response_headers.has_key('expires'):
             expires = email.Utils.parsedate_tz(response_headers['expires'])
             freshness_lifetime = max(0, calendar.timegm(expires) - date)
         else:
             freshness_lifetime = 0
         if cc.has_key('max-age'):
-            freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
+            try:
+                freshness_lifetime = int(cc['max-age'])
+            except:
+                freshness_lifetime = 0
         if cc.has_key('min-fresh'):
-            current_age += int(cc['min-fresh'])
+            try:
+                min_fresh = int(cc['min-fresh'])
+            except:
+                min_fresh = 0
+            current_age += min_fresh
         if freshness_lifetime > current_age:
             retval = "FRESH"
     return retval
@@ -418,13 +441,13 @@ class DigestAuthentication(Authentication):

     def response(self, response, content):
         if not response.has_key('authentication-info'):
-            challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
+            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
             if 'true' == challenge.get('stale'):
                 self.challenge['nonce'] = challenge['nonce']
                 self.challenge['nc'] = 1
                 return True
         else:
-            updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
+            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

             if updated_challenge.has_key('nextnonce'):
                 self.challenge['nonce'] = updated_challenge['nextnonce']
@@ -440,7 +463,6 @@ class HmacDigestAuthentication(Authentication):
         Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
         challenge = _parse_www_authenticate(response, 'www-authenticate')
         self.challenge = challenge['hmacdigest']
-        print self.challenge
         # TODO: self.challenge['domain']
         self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
         if self.challenge['reason'] not in ['unauthorized', 'integrity']:
@@ -466,9 +488,6 @@ class HmacDigestAuthentication(Authentication):
             self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
             ":", self.challenge['realm']
             ])
-        print response['www-authenticate']
-        print "".join([self.credentials[1], self.challenge['salt']])
-        print "key_str = %s" % self.key
         self.key = self.pwhashmod.new(self.key).hexdigest().lower()

     def request(self, method, request_uri, headers, content):
@@ -479,8 +498,6 @@ class HmacDigestAuthentication(Authentication):
         created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
         cnonce = _cnonce()
         request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
-        print "key = %s" % self.key
-        print "msg = %s" % request_digest
         request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
         headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
             self.credentials[0],
@@ -641,6 +658,8 @@ class Http:
         try:
             conn.request(method, request_uri, body, headers)
             response = conn.getresponse()
+        except gaierror:
+            raise ServerNotFoundError("Unable to find the server at %s" % request_uri)
         except:
             if i == 0:
                 conn.close()
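With the new exception in place, callers can tell DNS failures apart from other transport errors. A hedged sketch (the host name is deliberately unresolvable):

    import httplib2

    h = httplib2.Http()
    try:
        h.request("http://no-such-host.invalid/")
    except httplib2.ServerNotFoundError, e:
        # raised instead of a bare socket.gaierror as of this change
        print "unresolvable:", e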
@@ -752,6 +771,8 @@ a string that contains the response entity body.
         if not headers.has_key('user-agent'):
             headers['user-agent'] = "Python-httplib2/%s" % __version__

+        uri = iri2uri(uri)
+
         (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

         if not self.connections.has_key(scheme+":"+authority):
@@ -780,7 +801,7 @@ a string that contains the response entity body.
             else:
                 cachekey = None

-            if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
+            if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
                 # http://www.w3.org/1999/04/Editing/
                 headers['if-match'] = info['etag']

@@ -815,9 +836,9 @@ a string that contains the response entity body.
                     return (response, content)

                 if entry_disposition == "STALE":
-                    if info.has_key('etag') and not self.ignore_etag:
+                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                         headers['if-none-match'] = info['etag']
-                    if info.has_key('last-modified'):
+                    if info.has_key('last-modified') and not 'last-modified' in headers:
                         headers['if-modified-since'] = info['last-modified']
                 elif entry_disposition == "TRANSPARENT":
                     pass
planet/httplib2/iri2uri.py (new file, 110 lines)
"""
iri2uri

Converts an IRI to a URI.

"""
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = []
__version__ = "1.0.0"
__license__ = "MIT"
__history__ = """
"""

import urlparse


# Convert an IRI to a URI following the rules in RFC 3987
#
# The characters we need to encode and escape are defined in the spec:
#
# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
#        / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
#        / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
#        / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
#        / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
#        / %xD0000-DFFFD / %xE1000-EFFFD

escape_range = [
    (0xA0, 0xD7FF),
    (0xE000, 0xF8FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
    (0x10000, 0x1FFFD),
    (0x20000, 0x2FFFD),
    (0x30000, 0x3FFFD),
    (0x40000, 0x4FFFD),
    (0x50000, 0x5FFFD),
    (0x60000, 0x6FFFD),
    (0x70000, 0x7FFFD),
    (0x80000, 0x8FFFD),
    (0x90000, 0x9FFFD),
    (0xA0000, 0xAFFFD),
    (0xB0000, 0xBFFFD),
    (0xC0000, 0xCFFFD),
    (0xD0000, 0xDFFFD),
    (0xE1000, 0xEFFFD),
    (0xF0000, 0xFFFFD),
    (0x100000, 0x10FFFD)
]

def encode(c):
    retval = c
    i = ord(c)
    for low, high in escape_range:
        if i < low:
            break
        if i >= low and i <= high:
            retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
            break
    return retval


def iri2uri(uri):
    """Convert an IRI to a URI. Note that IRIs must be
    passed in a unicode string. That is, do not utf-8 encode
    the IRI before passing it into the function."""
    if isinstance(uri ,unicode):
        (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
        authority = authority.encode('idna')
        # For each character in 'ucschar' or 'iprivate'
        #  1. encode as utf-8
        #  2. then %-encode each octet of that utf-8
        uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
        uri = "".join([encode(c) for c in uri])
    return uri

if __name__ == "__main__":
    import unittest

    class Test(unittest.TestCase):

        def test_uris(self):
            """Test that URIs are invariant under the transformation."""
            invariant = [
                u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
                u"http://www.ietf.org/rfc/rfc2396.txt",
                u"ldap://[2001:db8::7]/c=GB?objectClass?one",
                u"mailto:John.Doe@example.com",
                u"news:comp.infosystems.www.servers.unix",
                u"tel:+1-816-555-1212",
                u"telnet://192.0.2.16:80/",
                u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ]
            for uri in invariant:
                self.assertEqual(uri, iri2uri(uri))

        def test_iri(self):
            """Test that the right type of escaping is done for each part of the URI."""
            self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
            self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
            self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
            self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
            self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))

    unittest.main()
@@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
 import time
 # Planet modules
 import planet, config, shell
+from planet import feedparser

 type_map = {'text': 'text/plain', 'html': 'text/html',
     'xhtml': 'application/xhtml+xml'}
@@ -92,3 +93,40 @@ def scrub(feed_uri, data):
             or entry['published_parsed'] <= now) and
             (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)]

+    scrub_xmlbase = config.xml_base(feed_uri)
+
+    # resolve relative URIs and sanitize
+    for entry in data.entries + [data.feed]:
+        for key in entry.keys():
+            if key == 'content':
+                node = entry.content[0]
+            elif key.endswith('_detail'):
+                node = entry[key]
+            else:
+                continue
+
+            if not node.has_key('type'): continue
+            if not 'html' in node['type']: continue
+            if not node.has_key('value'): continue
+
+            if node.has_key('base'):
+                if scrub_xmlbase:
+                    if scrub_xmlbase == 'feed_alternate':
+                        if entry.has_key('source') and \
+                            entry.source.has_key('link'):
+                            node['base'] = entry.source.link
+                        elif data.feed.has_key('link'):
+                            node['base'] = data.feed.link
+                    elif scrub_xmlbase == 'entry_alternate':
+                        if entry.has_key('link'):
+                            node['base'] = entry.link
+                    else:
+                        node['base'] = feedparser._urljoin(
+                            node['base'], scrub_xmlbase)
+
+                node['value'] = feedparser._resolveRelativeURIs(
+                    node.value, node.base, 'utf-8', node.type)
+
+            node['value'] = feedparser._sanitizeHTML(
+                node.value, 'utf-8', node.type)
@@ -254,7 +254,6 @@ def writeCache(feed_uri, feed_info, data):

 def httpThread(thread_index, input_queue, output_queue, log):
     import httplib2, md5
-    from socket import gaierror, error
     from httplib import BadStatusLine

     h = httplib2.Http(config.http_cache_directory())
@@ -304,13 +303,12 @@ def httpThread(thread_index, input_queue, output_queue, log):
             if resp.has_key('content-encoding'):
                 del resp['content-encoding']
             setattr(feed, 'headers', resp)
-        except gaierror:
-            log.error("Fail to resolve server name %s via %d",
-                uri, thread_index)
         except BadStatusLine:
             log.error("Bad Status Line received for %s via %d",
                 uri, thread_index)
-        except error, e:
+        except httplib2.HttpLib2Error, e:
+            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
+        except socket.error, e:
             if e.__class__.__name__.lower()=='timeout':
                 feed.headers['status'] = '408'
                 log.warn("Timeout in thread-%d", thread_index)
@@ -3,6 +3,7 @@
 import unittest, os, sys, glob, new, re, StringIO, time
 from planet import feedparser
 from planet.reconstitute import reconstitute
+from planet.scrub import scrub

 testfiles = 'tests/data/reconstitute/%s.xml'

@@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
         # parse and reconstitute to a string
         work = StringIO.StringIO()
         results = feedparser.parse(data)
+        scrub(testfiles%name, results)
         reconstitute(results, results.entries[0]).writexml(work)

         # verify the results
@@ -6,7 +6,7 @@ from planet.scrub import scrub
 from planet import feedparser, config

 feed = '''
-  <feed xmlns='http://www.w3.org/2005/Atom'>
+  <feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
     <author><name>F&ouml;o</name></author>
     <entry xml:lang="en">
       <id>ignoreme</id>
@@ -15,7 +15,9 @@ feed = '''
       <title>F&ouml;o</title>
       <summary>F&ouml;o</summary>
       <content>F&ouml;o</content>
+      <link href="http://example.com/entry/1/"/>
       <source>
+        <link href="http://example.com/feed/"/>
         <author><name>F&ouml;o</name></author>
       </source>
     </entry>
@@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
         data = deepcopy(base)
         scrub('testfeed', data)
         self.assertEqual(0, len(data.entries))

+    def test_scrub_xmlbase(self):
+        base = feedparser.parse(feed)
+        self.assertEqual('http://example.com/',
+            base.entries[0].title_detail.base)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'xml_base', 'feed_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/feed/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'entry_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/entry/1/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'base/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/base/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.org/data/',
+            data.entries[0].title_detail.base)
@@ -35,7 +35,7 @@
       <th>Name</th>
       <th>Format</th>
       <xsl:if test="//planet:ignore_in_feed | //planet:filters |
-                    //planet:*[contains(local-name(),'_type')]">
+                    //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
         <th>Notes</th>
       </xsl:if>
     </tr>
@@ -128,12 +128,12 @@
       </a>
     </td>
     <td><xsl:value-of select="planet:format"/></td>
-    <xsl:if test="planet:ignore_in_feed | planet:filters |
+    <xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
                   planet:*[contains(local-name(),'_type')]">
       <td>
         <dl>
           <xsl:for-each select="planet:ignore_in_feed | planet:filters |
-                                planet:*[contains(local-name(),'_type')]">
+                                planet:xml_base | planet:*[contains(local-name(),'_type')]">
             <xsl:sort select="local-name()"/>
             <dt><xsl:value-of select="local-name()"/></dt>
             <dd><xsl:value-of select="."/></dd>