Latest from Sam.

This commit is contained in:
Jacques Distler 2007-02-05 15:15:04 -06:00
commit 215777b9ee
16 changed files with 559 additions and 246 deletions

View File

@ -95,6 +95,13 @@ attributes on these elements.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
<ul style="margin:0">
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> value in effect for each text construct will be adjusted separately using the specified value.</li>
</ul>
</li>
</ul>
</body>
</html>
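
As a hedged sketch of how this option might be wired up, the following mirrors the ini-style configuration Planet reads (the [Planet] section and feed URL are hypothetical); xml_base takes feed_alternate, entry_alternate, or a URI reference, as described above.

from ConfigParser import ConfigParser
from StringIO import StringIO

config_data = """
[Planet]
name = Example Planet

[http://example.com/feed.xml]
name = Example feed
xml_base = feed_alternate
"""

parser = ConfigParser()
parser.readfp(StringIO(config_data))
print parser.get('http://example.com/feed.xml', 'xml_base')   # feed_alternate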

View File

@ -30,5 +30,7 @@ def getLogger(level, format):
return logger
# Configure feed parser
from planet import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0
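
A hedged sketch of what these two switches control (the feed text below is made up, and this assumes the bundled planet.feedparser is importable): with both set to 0, feedparser leaves embedded markup alone so that Planet's scrub stage, changed later in this commit, can adjust xml:base and then resolve and sanitize the markup itself.

from planet import feedparser

feedparser.RESOLVE_RELATIVE_URIS = 0   # leave relative href/src values alone
feedparser.SANITIZE_HTML = 0           # keep markup as-is; scrub cleans it later

doc = """<feed xmlns="http://www.w3.org/2005/Atom" xml:base="http://example.com/">
  <title type="html">&lt;a href="entry/1/"&gt;hi&lt;/a&gt;</title>
</feed>"""
result = feedparser.parse(doc)
print result.feed.title   # '<a href="entry/1/">hi</a>' -- the href stays relative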

View File

@ -125,6 +125,7 @@ def __init__():
define_tmpl('summary_type', '')
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
def load(config_file):
""" initialize and load a configuration"""

View File

@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
@ -732,7 +740,7 @@ class _FeedParserMixin:
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
if is_htmlish:
if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
@ -753,7 +761,7 @@ class _FeedParserMixin:
self._getContext()['vcard'] = vcard
# sanitize embedded markup
if is_htmlish:
if is_htmlish and SANITIZE_HTML:
if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

View File

@ -1,3 +1,4 @@
# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
@ -553,6 +554,10 @@ class InBodyPhase(Phase):
# the crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
#Keep a ref to this for special handling of whitespace in <pre>
self.processSpaceCharactersNonPre = self.processSpaceCharacters
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("script", "style"), self.startTagScriptStyle),
@ -622,6 +627,15 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
def processSpaceCharactersPre(self, data):
#Sometimes (start of <pre> blocks) we want to drop leading newlines
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
and not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.insertText(data)
def processCharacters(self, data):
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
@ -651,6 +665,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"):
self.endTagP("p")
self.tree.insertElement(name, attributes)
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersPre
def startTagForm(self, name, attributes):
if self.tree.formPointer:
@ -849,6 +865,9 @@ class InBodyPhase(Phase):
self.parser.phase.processEndTag(name)
def endTagBlock(self, name):
#Put us back in the right whitespace handling mode
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersNonPre
inScope = self.tree.elementInScope(name)
if inScope:
self.tree.generateImpliedEndTags()
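
A standalone, hedged illustration of the whitespace rule these hooks implement: a single newline immediately after an opening <pre> tag is dropped, and normal whitespace handling resumes afterwards.

def strip_leading_pre_newline(data, element_name, element_has_content):
    # Mirrors processSpaceCharactersPre above, outside the parser classes.
    if (data.startswith("\n") and element_name == "pre"
            and not element_has_content):
        data = data[1:]
    return data

assert strip_leading_pre_newline("\nhello", "pre", False) == "hello"
assert strip_leading_pre_newline("\nhello", "pre", True) == "\nhello"
assert strip_leading_pre_newline("hello", "div", False) == "hello"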

View File

@ -11,11 +11,6 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
* Selectively lowercase only XHTML, but not foreign markup
"""
@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
class XHTMLParser(XMLParser):
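
A standalone, hedged illustration of the comment-to-CDATA rescue above: a token the tokenizer reported as a comment, but whose data is really a CDATA section, is re-typed as character data.

def rescue_cdata(token):
    # Mirrors the Comment branch above, outside the parser class.
    if (token["type"] == "Comment" and token["data"].startswith("[CDATA[")
            and token["data"].endswith("]]")):
        token["type"] = "Characters"
        token["data"] = token["data"][7:-2]
    return token

print rescue_cdata({"type": "Comment", "data": "[CDATA[a < b]]"})
# {'type': 'Characters', 'data': 'a < b'}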

View File

@ -1,5 +1,6 @@
import _base
from xml.dom import minidom, Node
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
def hilite(self, encoding):
print 'foo'
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
setattr(self.dom, 'hilite', method)
return self
def doctypeClass(self,name):
@ -129,3 +134,58 @@ def testSerializer(element):
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
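
A hedged usage sketch for the new dom2sax helper (the document text is made up, and this assumes it runs in the module where dom2sax is defined): any xml.sax ContentHandler can receive the replayed events.

from xml.dom import minidom
from xml.sax.handler import ContentHandler

class EventLogger(ContentHandler):
    # Records the SAX events dom2sax emits.
    def __init__(self):
        ContentHandler.__init__(self)
        self.events = []
    def startElementNS(self, name, qname, attrs):
        self.events.append(('start', name))
    def characters(self, content):
        self.events.append(('chars', content))
    def endElementNS(self, name, qname):
        self.events.append(('end', name))

doc = minidom.parseString('<div xmlns="http://www.w3.org/1999/xhtml"><p>hi</p></div>')
logger = EventLogger()
dom2sax(doc, logger)
print logger.events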

View File

@ -1,208 +1,5 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
#Set the element text and tail to the empty string rather than None
#XXX - is this desirable or should we do it on a case by case basis?
self._element.text = ""
self._element.tail = ""
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
self._element[index-1].tail += data
else:
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
Element.__init__(self, Comment)
self._element.text = data
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
def testSerializer(self, element):
return testSerializer(element)
import etreefull
class TreeBuilder(etreefull.TreeBuilder):
def getDocument(self):
return self.document._element
return self.document._element.find("html")

View File

@ -0,0 +1,216 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
Element.__init__(self, None)
self._element = ElementTree.Comment(data)
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is ElementTree.Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element

View File

@ -16,11 +16,13 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
"Xavier Verges Farrero",
"Jonathan Feinberg",
"Blair Zajac",
"Sam Ruby"]
"Sam Ruby",
"Louis Nyffenegger"]
__license__ = "MIT"
__version__ = "$Rev: 217 $"
__version__ = "$Rev: 227 $"
import re
import sys
import md5
import email
import email.Utils
@ -41,6 +43,12 @@ import hmac
from gettext import gettext as _
from socket import gaierror
if sys.version_info >= (2,3):
from iri2uri import iri2uri
else:
def iri2uri(uri):
return uri
__all__ = ['Http', 'Response', 'HttpLib2Error',
'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
@ -51,7 +59,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error',
debuglevel = 0
# Python 2.3 support
if 'sorted' not in __builtins__:
if sys.version_info < (2,4):
def sorted(seq):
seq.sort()
return seq
@ -60,7 +68,6 @@ if 'sorted' not in __builtins__:
def HTTPResponse__getheaders(self):
"""Return list of (header, value) tuples."""
if self.msg is None:
print "================================"
raise httplib.ResponseNotReady()
return self.msg.items()
@ -75,6 +82,8 @@ class RedirectLimit(HttpLib2Error): pass
class FailedToDecompressContent(HttpLib2Error): pass
class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
class RelativeURIError(HttpLib2Error): pass
class ServerNotFoundError(HttpLib2Error): pass
# Open Items:
# -----------
@ -118,6 +127,8 @@ def parse_uri(uri):
def urlnorm(uri):
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
authority = authority.lower()
scheme = scheme.lower()
if not path:
@ -125,6 +136,7 @@ def urlnorm(uri):
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
scheme = scheme.lower()
defrag_uri = scheme + "://" + authority + request_uri
return scheme, authority, request_uri, defrag_uri
@ -143,9 +155,10 @@ def safename(filename):
try:
if re_url_scheme.match(filename):
if isinstance(filename,str):
filename=filename.decode('utf-8').encode('idna')
filename = filename.decode('utf-8')
filename = filename.encode('idna')
else:
filename=filename.encode('idna')
filename = filename.encode('idna')
except:
pass
if isinstance(filename,unicode):
@ -260,16 +273,26 @@ def _entry_disposition(response_headers, request_headers):
now = time.time()
current_age = max(0, now - date)
if cc_response.has_key('max-age'):
freshness_lifetime = int(cc_response['max-age'])
try:
freshness_lifetime = int(cc_response['max-age'])
except:
freshness_lifetime = 0
elif response_headers.has_key('expires'):
expires = email.Utils.parsedate_tz(response_headers['expires'])
freshness_lifetime = max(0, calendar.timegm(expires) - date)
else:
freshness_lifetime = 0
if cc.has_key('max-age'):
freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
try:
freshness_lifetime = int(cc['max-age'])
except:
freshness_lifetime = 0
if cc.has_key('min-fresh'):
current_age += int(cc['min-fresh'])
try:
min_fresh = int(cc['min-fresh'])
except:
min_fresh = 0
current_age += min_fresh
if freshness_lifetime > current_age:
retval = "FRESH"
return retval
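
A standalone, hedged illustration of the defensive Cache-Control parsing added above: a malformed max-age no longer raises ValueError; the freshness lifetime simply falls back to 0 and the cached entry is treated as stale.

def max_age_or_zero(cc):
    # Mirrors the try/except int() guards added above.
    try:
        return int(cc['max-age'])
    except:
        return 0

print max_age_or_zero({'max-age': '3600'})    # 3600
print max_age_or_zero({'max-age': 'borked'})  # 0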
@ -418,13 +441,13 @@ class DigestAuthentication(Authentication):
def response(self, response, content):
if not response.has_key('authentication-info'):
challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
if 'true' == challenge.get('stale'):
self.challenge['nonce'] = challenge['nonce']
self.challenge['nc'] = 1
return True
else:
updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})
if updated_challenge.has_key('nextnonce'):
self.challenge['nonce'] = updated_challenge['nextnonce']
@ -440,7 +463,6 @@ class HmacDigestAuthentication(Authentication):
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
challenge = _parse_www_authenticate(response, 'www-authenticate')
self.challenge = challenge['hmacdigest']
print self.challenge
# TODO: self.challenge['domain']
self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
if self.challenge['reason'] not in ['unauthorized', 'integrity']:
@ -466,9 +488,6 @@ class HmacDigestAuthentication(Authentication):
self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
":", self.challenge['realm']
])
print response['www-authenticate']
print "".join([self.credentials[1], self.challenge['salt']])
print "key_str = %s" % self.key
self.key = self.pwhashmod.new(self.key).hexdigest().lower()
def request(self, method, request_uri, headers, content):
@ -479,8 +498,6 @@ class HmacDigestAuthentication(Authentication):
created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
cnonce = _cnonce()
request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
print "key = %s" % self.key
print "msg = %s" % request_digest
request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
self.credentials[0],
@ -641,6 +658,8 @@ class Http:
try:
conn.request(method, request_uri, body, headers)
response = conn.getresponse()
except gaierror:
raise ServerNotFoundError("Unable to find the server at %s" % request_uri)
except:
if i == 0:
conn.close()
@ -752,6 +771,8 @@ a string that contains the response entity body.
if not headers.has_key('user-agent'):
headers['user-agent'] = "Python-httplib2/%s" % __version__
uri = iri2uri(uri)
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
if not self.connections.has_key(scheme+":"+authority):
@ -780,7 +801,7 @@ a string that contains the response entity body.
else:
cachekey = None
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
# http://www.w3.org/1999/04/Editing/
headers['if-match'] = info['etag']
@ -815,9 +836,9 @@ a string that contains the response entity body.
return (response, content)
if entry_disposition == "STALE":
if info.has_key('etag') and not self.ignore_etag:
if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
headers['if-none-match'] = info['etag']
if info.has_key('last-modified'):
if info.has_key('last-modified') and not 'last-modified' in headers:
headers['if-modified-since'] = info['last-modified']
elif entry_disposition == "TRANSPARENT":
pass

planet/httplib2/iri2uri.py (new file, 110 lines)
View File

@ -0,0 +1,110 @@
"""
iri2uri
Converts an IRI to a URI.
"""
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = []
__version__ = "1.0.0"
__license__ = "MIT"
__history__ = """
"""
import urlparse
# Convert an IRI to a URI following the rules in RFC 3987
#
# The characters we need to encode and escape are defined in the spec:
#
# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
# / %xD0000-DFFFD / %xE1000-EFFFD
escape_range = [
(0xA0, 0xD7FF ),
(0xE000, 0xF8FF ),
(0xF900, 0xFDCF ),
(0xFDF0, 0xFFEF),
(0x10000, 0x1FFFD ),
(0x20000, 0x2FFFD ),
(0x30000, 0x3FFFD),
(0x40000, 0x4FFFD ),
(0x50000, 0x5FFFD ),
(0x60000, 0x6FFFD),
(0x70000, 0x7FFFD ),
(0x80000, 0x8FFFD ),
(0x90000, 0x9FFFD),
(0xA0000, 0xAFFFD ),
(0xB0000, 0xBFFFD ),
(0xC0000, 0xCFFFD),
(0xD0000, 0xDFFFD ),
(0xE1000, 0xEFFFD),
(0xF0000, 0xFFFFD ),
(0x100000, 0x10FFFD)
]
def encode(c):
retval = c
i = ord(c)
for low, high in escape_range:
if i < low:
break
if i >= low and i <= high:
retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
break
return retval
def iri2uri(uri):
"""Convert an IRI to a URI. Note that IRIs must be
passed in as unicode strings. That is, do not utf-8 encode
the IRI before passing it into the function."""
if isinstance(uri ,unicode):
(scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
authority = authority.encode('idna')
# For each character in 'ucschar' or 'iprivate'
# 1. encode as utf-8
# 2. then %-encode each octet of that utf-8
uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
uri = "".join([encode(c) for c in uri])
return uri
if __name__ == "__main__":
import unittest
class Test(unittest.TestCase):
def test_uris(self):
"""Test that URIs are invariant under the transformation."""
invariant = [
u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
u"http://www.ietf.org/rfc/rfc2396.txt",
u"ldap://[2001:db8::7]/c=GB?objectClass?one",
u"mailto:John.Doe@example.com",
u"news:comp.infosystems.www.servers.unix",
u"tel:+1-816-555-1212",
u"telnet://192.0.2.16:80/",
u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ]
for uri in invariant:
self.assertEqual(uri, iri2uri(uri))
def test_iri(self):
""" Test that the right type of escaping is done for each part of the URI."""
self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))
unittest.main()

View File

@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
import time
# Planet modules
import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'}
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)]
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
if key == 'content':
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)

View File

@ -254,7 +254,6 @@ def writeCache(feed_uri, feed_info, data):
def httpThread(thread_index, input_queue, output_queue, log):
import httplib2, md5
from socket import gaierror, error
from httplib import BadStatusLine
h = httplib2.Http(config.http_cache_directory())
@ -304,13 +303,12 @@ def httpThread(thread_index, input_queue, output_queue, log):
if resp.has_key('content-encoding'):
del resp['content-encoding']
setattr(feed, 'headers', resp)
except gaierror:
log.error("Fail to resolve server name %s via %d",
uri, thread_index)
except BadStatusLine:
log.error("Bad Status Line received for %s via %d",
uri, thread_index)
except error, e:
except httplib2.HttpLib2Error, e:
log.error("HttpLib2Error: %s via %d", str(e), thread_index)
except socket.error, e:
if e.__class__.__name__.lower()=='timeout':
feed.headers['status'] = '408'
log.warn("Timeout in thread-%d", thread_index)

View File

@ -3,6 +3,7 @@
import unittest, os, sys, glob, new, re, StringIO, time
from planet import feedparser
from planet.reconstitute import reconstitute
from planet.scrub import scrub
testfiles = 'tests/data/reconstitute/%s.xml'
@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
# parse and reconstitute to a string
work = StringIO.StringIO()
results = feedparser.parse(data)
scrub(testfiles%name, results)
reconstitute(results, results.entries[0]).writexml(work)
# verify the results

View File

@ -6,7 +6,7 @@ from planet.scrub import scrub
from planet import feedparser, config
feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'>
<feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
<author><name>F&amp;ouml;o</name></author>
<entry xml:lang="en">
<id>ignoreme</id>
@ -15,7 +15,9 @@ feed = '''
<title>F&amp;ouml;o</title>
<summary>F&amp;ouml;o</summary>
<content>F&amp;ouml;o</content>
<link href="http://example.com/entry/1/"/>
<source>
<link href="http://example.com/feed/"/>
<author><name>F&amp;ouml;o</name></author>
</source>
</entry>
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual(0, len(data.entries))
def test_scrub_xmlbase(self):
base = feedparser.parse(feed)
self.assertEqual('http://example.com/',
base.entries[0].title_detail.base)
config.parser.readfp(StringIO.StringIO(configData))
config.parser.set('testfeed', 'xml_base', 'feed_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/feed/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'entry_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/entry/1/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'base/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/base/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.org/data/',
data.entries[0].title_detail.base)

View File

@ -35,7 +35,7 @@
<th>Name</th>
<th>Format</th>
<xsl:if test="//planet:ignore_in_feed | //planet:filters |
//planet:*[contains(local-name(),'_type')]">
//planet:xml_base | //planet:*[contains(local-name(),'_type')]">
<th>Notes</th>
</xsl:if>
</tr>
@ -128,12 +128,12 @@
</a>
</td>
<td><xsl:value-of select="planet:format"/></td>
<xsl:if test="planet:ignore_in_feed | planet:filters |
<xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
planet:*[contains(local-name(),'_type')]">
<td>
<dl>
<xsl:for-each select="planet:ignore_in_feed | planet:filters |
planet:*[contains(local-name(),'_type')]">
planet:xml_base | planet:*[contains(local-name(),'_type')]">
<xsl:sort select="local-name()"/>
<dt><xsl:value-of select="local-name()"/></dt>
<dd><xsl:value-of select="."/></dd>