Latest from Sam.

This commit is contained in:
Jacques Distler 2007-02-05 15:15:04 -06:00
commit 215777b9ee
16 changed files with 559 additions and 246 deletions

View File

@ -95,6 +95,13 @@ attributes on these elements.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
<ul style="margin:0">
<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> value in effect for each text construct will be adjusted separately using the specified value.</li>
</ul>
</li>
</ul>
</body>
</html>
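
As a hedged sketch of how this option might be wired up, the following mirrors the ini-style configuration Planet reads (the [Planet] section and feed URL are hypothetical); xml_base takes feed_alternate, entry_alternate, or a URI reference, as described above.

from ConfigParser import ConfigParser
from StringIO import StringIO

config_data = """
[Planet]
name = Example Planet

[http://example.com/feed.xml]
name = Example feed
xml_base = feed_alternate
"""

parser = ConfigParser()
parser.readfp(StringIO(config_data))
print parser.get('http://example.com/feed.xml', 'xml_base')   # feed_alternate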

View File

@ -30,5 +30,7 @@ def getLogger(level, format):
return logger
# Configure feed parser
from planet import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0
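
A hedged sketch of what these two switches control (the feed text below is made up, and this assumes the bundled planet.feedparser is importable): with both set to 0, feedparser leaves embedded markup alone so that Planet's scrub stage, changed later in this commit, can adjust xml:base and then resolve and sanitize the markup itself.

from planet import feedparser

feedparser.RESOLVE_RELATIVE_URIS = 0   # leave relative href/src values alone
feedparser.SANITIZE_HTML = 0           # keep markup as-is; scrub cleans it later

doc = """<feed xmlns="http://www.w3.org/2005/Atom" xml:base="http://example.com/">
  <title type="html">&lt;a href="entry/1/"&gt;hi&lt;/a&gt;</title>
</feed>"""
result = feedparser.parse(doc)
print result.feed.title   # '<a href="entry/1/">hi</a>' -- the href stays relative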

View File

@ -125,6 +125,7 @@ def __init__():
define_tmpl('summary_type', '')
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
def load(config_file):
""" initialize and load a configuration"""

View File

@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@ -65,6 +65,14 @@ TIDY_MARKUP = 0
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
@ -732,7 +740,7 @@ class _FeedParserMixin:
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup
if is_htmlish:
if is_htmlish and RESOLVE_RELATIVE_URIS:
if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
@ -753,7 +761,7 @@ class _FeedParserMixin:
self._getContext()['vcard'] = vcard
# sanitize embedded markup
if is_htmlish:
if is_htmlish and SANITIZE_HTML:
if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

View File

@ -1,3 +1,4 @@
# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
@ -553,6 +554,10 @@ class InBodyPhase(Phase):
# the crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
#Keep a ref to this for special handling of whitespace in <pre>
self.processSpaceCharactersNonPre = self.processSpaceCharacters
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("script", "style"), self.startTagScriptStyle),
@ -622,6 +627,15 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
def processSpaceCharactersPre(self, data):
#Sometimes (start of <pre> blocks) we want to drop leading newlines
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
and not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.insertText(data)
def processCharacters(self, data):
# XXX The specification says to do this for every character at the
# moment, but apparently that doesn't match the real world so we don't
@ -651,6 +665,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"):
self.endTagP("p")
self.tree.insertElement(name, attributes)
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersPre
def startTagForm(self, name, attributes):
if self.tree.formPointer:
@ -849,6 +865,9 @@ class InBodyPhase(Phase):
self.parser.phase.processEndTag(name)
def endTagBlock(self, name):
#Put us back in the right whitespace handling mode
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersNonPre
inScope = self.tree.elementInScope(name)
if inScope:
self.tree.generateImpliedEndTags()
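
A standalone, hedged illustration of the whitespace rule these hooks implement: a single newline immediately after an opening <pre> tag is dropped, and normal whitespace handling resumes afterwards.

def strip_leading_pre_newline(data, element_name, element_has_content):
    # Mirrors processSpaceCharactersPre above, outside the parser classes.
    if (data.startswith("\n") and element_name == "pre"
            and not element_has_content):
        data = data[1:]
    return data

assert strip_leading_pre_newline("\nhello", "pre", False) == "hello"
assert strip_leading_pre_newline("\nhello", "pre", True) == "\nhello"
assert strip_leading_pre_newline("hello", "div", False) == "hello"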

View File

@ -11,11 +11,6 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
* Selectively lowercase only XHTML, but not foreign markup
"""
@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
class XHTMLParser(XMLParser):
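
A standalone, hedged illustration of the comment-to-CDATA rescue above: a token the tokenizer reported as a comment, but whose data is really a CDATA section, is re-typed as character data.

def rescue_cdata(token):
    # Mirrors the Comment branch above, outside the parser class.
    if (token["type"] == "Comment" and token["data"].startswith("[CDATA[")
            and token["data"].endswith("]]")):
        token["type"] = "Characters"
        token["data"] = token["data"][7:-2]
    return token

print rescue_cdata({"type": "Comment", "data": "[CDATA[a < b]]"})
# {'type': 'Characters', 'data': 'a < b'}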

View File

@ -1,5 +1,6 @@
import _base
from xml.dom import minidom, Node
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
def hilite(self, encoding):
print 'foo'
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
setattr(self.dom, 'hilite', method)
return self
def doctypeClass(self,name):
@ -129,3 +134,58 @@ def testSerializer(element):
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
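
A hedged usage sketch for the new dom2sax helper (the document text is made up, and this assumes it runs in the module where dom2sax is defined): any xml.sax ContentHandler can receive the replayed events.

from xml.dom import minidom
from xml.sax.handler import ContentHandler

class EventLogger(ContentHandler):
    # Records the SAX events dom2sax emits.
    def __init__(self):
        ContentHandler.__init__(self)
        self.events = []
    def startElementNS(self, name, qname, attrs):
        self.events.append(('start', name))
    def characters(self, content):
        self.events.append(('chars', content))
    def endElementNS(self, name, qname):
        self.events.append(('end', name))

doc = minidom.parseString('<div xmlns="http://www.w3.org/1999/xhtml"><p>hi</p></div>')
logger = EventLogger()
dom2sax(doc, logger)
print logger.events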

View File

@ -1,208 +1,5 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
#Set the element text and tail to the empty string rather than None
#XXX - is this desirable or should we do it on a case by case basis?
self._element.text = ""
self._element.tail = ""
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
self._element[index-1].tail += data
else:
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
Element.__init__(self, Comment)
self._element.text = data
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
def testSerializer(self, element):
return testSerializer(element)
import etreefull
class TreeBuilder(etreefull.TreeBuilder):
def getDocument(self):
return self.document._element
return self.document._element.find("html")

View File

@ -0,0 +1,216 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
Element.__init__(self, None)
self._element = ElementTree.Comment(data)
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is ElementTree.Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element

View File

@ -16,11 +16,13 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
"Xavier Verges Farrero",
"Jonathan Feinberg",
"Blair Zajac",
"Sam Ruby"]
"Sam Ruby",
"Louis Nyffenegger"]
__license__ = "MIT"
__version__ = "$Rev: 217 $"
__version__ = "$Rev: 227 $"
import re
import sys
import md5
import email
import email.Utils
@ -41,6 +43,12 @@ import hmac
from gettext import gettext as _
from socket import gaierror
if sys.version_info >= (2,3):
from iri2uri import iri2uri
else:
def iri2uri(uri):
return uri
__all__ = ['Http', 'Response', 'HttpLib2Error',
'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
@ -51,7 +59,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error',
debuglevel = 0
# Python 2.3 support
if 'sorted' not in __builtins__:
if sys.version_info < (2,4):
def sorted(seq):
seq.sort()
return seq
@ -60,7 +68,6 @@ if 'sorted' not in __builtins__:
def HTTPResponse__getheaders(self):
"""Return list of (header, value) tuples."""
if self.msg is None:
print "================================"
raise httplib.ResponseNotReady()
return self.msg.items()
@ -75,6 +82,8 @@ class RedirectLimit(HttpLib2Error): pass
class FailedToDecompressContent(HttpLib2Error): pass
class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
class RelativeURIError(HttpLib2Error): pass
class ServerNotFoundError(HttpLib2Error): pass
# Open Items:
# -----------
@ -118,6 +127,8 @@ def parse_uri(uri):
def urlnorm(uri):
(scheme, authority, path, query, fragment) = parse_uri(uri)
if not scheme or not authority:
raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
authority = authority.lower()
scheme = scheme.lower()
if not path:
@ -125,6 +136,7 @@ def urlnorm(uri):
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
scheme = scheme.lower()
defrag_uri = scheme + "://" + authority + request_uri
return scheme, authority, request_uri, defrag_uri
@ -143,9 +155,10 @@ def safename(filename):
try:
if re_url_scheme.match(filename):
if isinstance(filename,str):
filename=filename.decode('utf-8').encode('idna')
filename = filename.decode('utf-8')
filename = filename.encode('idna')
else:
filename=filename.encode('idna')
filename = filename.encode('idna')
except:
pass
if isinstance(filename,unicode):
@ -260,16 +273,26 @@ def _entry_disposition(response_headers, request_headers):
now = time.time()
current_age = max(0, now - date)
if cc_response.has_key('max-age'):
freshness_lifetime = int(cc_response['max-age'])
try:
freshness_lifetime = int(cc_response['max-age'])
except:
freshness_lifetime = 0
elif response_headers.has_key('expires'):
expires = email.Utils.parsedate_tz(response_headers['expires'])
freshness_lifetime = max(0, calendar.timegm(expires) - date)
else:
freshness_lifetime = 0
if cc.has_key('max-age'):
freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
try:
freshness_lifetime = int(cc['max-age'])
except:
freshness_lifetime = 0
if cc.has_key('min-fresh'):
current_age += int(cc['min-fresh'])
try:
min_fresh = int(cc['min-fresh'])
except:
min_fresh = 0
current_age += min_fresh
if freshness_lifetime > current_age:
retval = "FRESH"
return retval
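
A standalone, hedged illustration of the defensive Cache-Control parsing added above: a malformed max-age no longer raises ValueError; the freshness lifetime simply falls back to 0 and the cached entry is treated as stale.

def max_age_or_zero(cc):
    # Mirrors the try/except int() guards added above.
    try:
        return int(cc['max-age'])
    except:
        return 0

print max_age_or_zero({'max-age': '3600'})    # 3600
print max_age_or_zero({'max-age': 'borked'})  # 0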
@ -418,13 +441,13 @@ class DigestAuthentication(Authentication):
def response(self, response, content):
if not response.has_key('authentication-info'):
challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
if 'true' == challenge.get('stale'):
self.challenge['nonce'] = challenge['nonce']
self.challenge['nc'] = 1
return True
else:
updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})
if updated_challenge.has_key('nextnonce'):
self.challenge['nonce'] = updated_challenge['nextnonce']
@ -440,7 +463,6 @@ class HmacDigestAuthentication(Authentication):
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
challenge = _parse_www_authenticate(response, 'www-authenticate')
self.challenge = challenge['hmacdigest']
print self.challenge
# TODO: self.challenge['domain']
self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
if self.challenge['reason'] not in ['unauthorized', 'integrity']:
@ -466,9 +488,6 @@ class HmacDigestAuthentication(Authentication):
self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
":", self.challenge['realm']
])
print response['www-authenticate']
print "".join([self.credentials[1], self.challenge['salt']])
print "key_str = %s" % self.key
self.key = self.pwhashmod.new(self.key).hexdigest().lower()
def request(self, method, request_uri, headers, content):
@ -479,8 +498,6 @@ class HmacDigestAuthentication(Authentication):
created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
cnonce = _cnonce()
request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
print "key = %s" % self.key
print "msg = %s" % request_digest
request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
self.credentials[0],
@ -641,6 +658,8 @@ class Http:
try:
conn.request(method, request_uri, body, headers)
response = conn.getresponse()
except gaierror:
raise ServerNotFoundError("Unable to find the server at %s" % request_uri)
except:
if i == 0:
conn.close()
@ -752,6 +771,8 @@ a string that contains the response entity body.
if not headers.has_key('user-agent'):
headers['user-agent'] = "Python-httplib2/%s" % __version__
uri = iri2uri(uri)
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
if not self.connections.has_key(scheme+":"+authority):
@ -780,7 +801,7 @@ a string that contains the response entity body.
else:
cachekey = None
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
# http://www.w3.org/1999/04/Editing/
headers['if-match'] = info['etag']
@ -815,9 +836,9 @@ a string that contains the response entity body.
return (response, content)
if entry_disposition == "STALE":
if info.has_key('etag') and not self.ignore_etag:
if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
headers['if-none-match'] = info['etag']
if info.has_key('last-modified'):
if info.has_key('last-modified') and not 'last-modified' in headers:
headers['if-modified-since'] = info['last-modified']
elif entry_disposition == "TRANSPARENT":
pass

planet/httplib2/iri2uri.py (new file, 110 lines)
View File

@ -0,0 +1,110 @@
"""
iri2uri
Converts an IRI to a URI.
"""
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = []
__version__ = "1.0.0"
__license__ = "MIT"
__history__ = """
"""
import urlparse
# Convert an IRI to a URI following the rules in RFC 3987
#
# The characters we need to encode and escape are defined in the spec:
#
# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
# / %xD0000-DFFFD / %xE1000-EFFFD
escape_range = [
(0xA0, 0xD7FF ),
(0xE000, 0xF8FF ),
(0xF900, 0xFDCF ),
(0xFDF0, 0xFFEF),
(0x10000, 0x1FFFD ),
(0x20000, 0x2FFFD ),
(0x30000, 0x3FFFD),
(0x40000, 0x4FFFD ),
(0x50000, 0x5FFFD ),
(0x60000, 0x6FFFD),
(0x70000, 0x7FFFD ),
(0x80000, 0x8FFFD ),
(0x90000, 0x9FFFD),
(0xA0000, 0xAFFFD ),
(0xB0000, 0xBFFFD ),
(0xC0000, 0xCFFFD),
(0xD0000, 0xDFFFD ),
(0xE1000, 0xEFFFD),
(0xF0000, 0xFFFFD ),
(0x100000, 0x10FFFD)
]
def encode(c):
retval = c
i = ord(c)
for low, high in escape_range:
if i < low:
break
if i >= low and i <= high:
retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
break
return retval
def iri2uri(uri):
"""Convert an IRI to a URI. Note that IRIs must be
passed in as unicode strings. That is, do not utf-8 encode
the IRI before passing it into the function."""
if isinstance(uri ,unicode):
(scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
authority = authority.encode('idna')
# For each character in 'ucschar' or 'iprivate'
# 1. encode as utf-8
# 2. then %-encode each octet of that utf-8
uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
uri = "".join([encode(c) for c in uri])
return uri
if __name__ == "__main__":
import unittest
class Test(unittest.TestCase):
def test_uris(self):
"""Test that URIs are invariant under the transformation."""
invariant = [
u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
u"http://www.ietf.org/rfc/rfc2396.txt",
u"ldap://[2001:db8::7]/c=GB?objectClass?one",
u"mailto:John.Doe@example.com",
u"news:comp.infosystems.www.servers.unix",
u"tel:+1-816-555-1212",
u"telnet://192.0.2.16:80/",
u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ]
for uri in invariant:
self.assertEqual(uri, iri2uri(uri))
def test_iri(self):
""" Test that the right type of escaping is done for each part of the URI."""
self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))
unittest.main()

View File

@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
import time
# Planet modules
import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'}
@ -92,3 +93,40 @@ def scrub(feed_uri, data):
or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)]
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
if key == 'content':
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)

View File

@ -254,7 +254,6 @@ def writeCache(feed_uri, feed_info, data):
def httpThread(thread_index, input_queue, output_queue, log):
import httplib2, md5
from socket import gaierror, error
from httplib import BadStatusLine
h = httplib2.Http(config.http_cache_directory())
@ -304,13 +303,12 @@ def httpThread(thread_index, input_queue, output_queue, log):
if resp.has_key('content-encoding'):
del resp['content-encoding']
setattr(feed, 'headers', resp)
except gaierror:
log.error("Fail to resolve server name %s via %d",
uri, thread_index)
except BadStatusLine:
log.error("Bad Status Line received for %s via %d",
uri, thread_index)
except error, e:
except httplib2.HttpLib2Error, e:
log.error("HttpLib2Error: %s via %d", str(e), thread_index)
except socket.error, e:
if e.__class__.__name__.lower()=='timeout':
feed.headers['status'] = '408'
log.warn("Timeout in thread-%d", thread_index)

View File

@ -3,6 +3,7 @@
import unittest, os, sys, glob, new, re, StringIO, time
from planet import feedparser
from planet.reconstitute import reconstitute
from planet.scrub import scrub
testfiles = 'tests/data/reconstitute/%s.xml'
@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
# parse and reconstitute to a string
work = StringIO.StringIO()
results = feedparser.parse(data)
scrub(testfiles%name, results)
reconstitute(results, results.entries[0]).writexml(work)
# verify the results

View File

@ -6,7 +6,7 @@ from planet.scrub import scrub
from planet import feedparser, config
feed = '''
<feed xmlns='http://www.w3.org/2005/Atom'>
<feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
<author><name>F&amp;ouml;o</name></author>
<entry xml:lang="en">
<id>ignoreme</id>
@ -15,7 +15,9 @@ feed = '''
<title>F&amp;ouml;o</title>
<summary>F&amp;ouml;o</summary>
<content>F&amp;ouml;o</content>
<link href="http://example.com/entry/1/"/>
<source>
<link href="http://example.com/feed/"/>
<author><name>F&amp;ouml;o</name></author>
</source>
</entry>
@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual(0, len(data.entries))
def test_scrub_xmlbase(self):
base = feedparser.parse(feed)
self.assertEqual('http://example.com/',
base.entries[0].title_detail.base)
config.parser.readfp(StringIO.StringIO(configData))
config.parser.set('testfeed', 'xml_base', 'feed_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/feed/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'entry_alternate')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/entry/1/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'base/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.com/base/',
data.entries[0].title_detail.base)
config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
data = deepcopy(base)
scrub('testfeed', data)
self.assertEqual('http://example.org/data/',
data.entries[0].title_detail.base)

View File

@ -35,7 +35,7 @@
<th>Name</th>
<th>Format</th>
<xsl:if test="//planet:ignore_in_feed | //planet:filters |
//planet:*[contains(local-name(),'_type')]">
//planet:xml_base | //planet:*[contains(local-name(),'_type')]">
<th>Notes</th>
</xsl:if>
</tr>
@ -128,12 +128,12 @@
</a>
</td>
<td><xsl:value-of select="planet:format"/></td>
<xsl:if test="planet:ignore_in_feed | planet:filters |
<xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
planet:*[contains(local-name(),'_type')]">
<td>
<dl>
<xsl:for-each select="planet:ignore_in_feed | planet:filters |
planet:*[contains(local-name(),'_type')]">
planet:xml_base | planet:*[contains(local-name(),'_type')]">
<xsl:sort select="local-name()"/>
<dt><xsl:value-of select="local-name()"/></dt>
<dd><xsl:value-of select="."/></dd>