commit 215777b9ee
Latest from Sam.
@@ -95,6 +95,13 @@ attributes on these elements.</li>
 <li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
 </ul>
 </li>
+<li><code>xml_base</code> will adjust the <code>xml:base</code> values in effect for each of the text constructs in the feed (things like <code>title</code>, <code>summary</code>, and <code>content</code>). Other elements in the feed (most notably, <code>link</code>) are not affected by this value.
+<ul style="margin:0">
+<li><code>feed_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found either in the enclosed <code>source</code> or enclosing <code>feed</code> element.</li>
+<li><code>entry_alternate</code> will replace the <code>xml:base</code> in effect with the value of the <code>alternate</code> <code>link</code> found in this entry.</li>
+<li>Any other value will be treated as a <a href="http://www.ietf.org/rfc/rfc3986.txt">URI reference</a>. These values may be relative or absolute. If relative, the <code>xml:base</code> values in each text construct will each be adjusted separately using the specified value.</li>
+</ul>
+</li>
 </ul>
 </body>
 </html>
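The new option reads like any other per-subscription setting. A minimal sketch of how it might be exercised (the section name, feed URL, and value are hypothetical; Planet reads its configuration with the standard library ConfigParser, as the scrub tests later in this commit do):

    import ConfigParser, StringIO

    # A hypothetical subscription entry exercising the new option; valid
    # values are feed_alternate, entry_alternate, or a URI reference.
    ini = StringIO.StringIO(
        "[http://example.com/feed.atom]\n"
        "name = Example feed\n"
        "xml_base = feed_alternate\n")
    parser = ConfigParser.ConfigParser()
    parser.readfp(ini)
    print parser.get('http://example.com/feed.atom', 'xml_base')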
@@ -30,5 +30,7 @@ def getLogger(level, format):

     return logger

+# Configure feed parser
+from planet import feedparser
+feedparser.SANITIZE_HTML=0
+feedparser.RESOLVE_RELATIVE_URIS=0
@@ -125,6 +125,7 @@ def __init__():
     define_tmpl('summary_type', '')
     define_tmpl('content_type', '')
     define_tmpl('future_dates', 'keep')
+    define_tmpl('xml_base', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.149 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@@ -65,6 +65,14 @@ TIDY_MARKUP = 0
 # if TIDY_MARKUP = 1
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
+
 # ---------- required modules (should come with any Python distribution) ----------
 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
 try:
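A minimal sketch of how a consumer flips these module-level switches before parsing; this mirrors what planet/__init__.py does in the hunk above (the feed URL is a placeholder):

    from planet import feedparser

    # Planet does its own relative-URI resolution and sanitization in its
    # scrub stage, so it turns both switches off before any parse.
    feedparser.RESOLVE_RELATIVE_URIS = 0
    feedparser.SANITIZE_HTML = 0
    data = feedparser.parse('http://example.com/feed.atom')
    # text constructs now arrive raw: relative URIs unresolved, markup
    # unsanitized, ready for post-processing.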
@@ -732,7 +740,7 @@ class _FeedParserMixin:

         is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
         # resolve relative URIs within embedded markup
-        if is_htmlish:
+        if is_htmlish and RESOLVE_RELATIVE_URIS:
             if element in self.can_contain_relative_uris:
                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

@@ -753,7 +761,7 @@ class _FeedParserMixin:
         self._getContext()['vcard'] = vcard

         # sanitize embedded markup
-        if is_htmlish:
+        if is_htmlish and SANITIZE_HTML:
             if element in self.can_contain_dangerous_markup:
                 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

@@ -1,3 +1,4 @@
+
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
@@ -553,6 +554,10 @@ class InBodyPhase(Phase):
     # the crazy mode
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)

+        # Keep a ref to this for special handling of whitespace in <pre>
+        self.processSpaceCharactersNonPre = self.processSpaceCharacters
+
         self.startTagHandler = utils.MethodDispatcher([
             ("html", self.startTagHtml),
             (("script", "style"), self.startTagScriptStyle),
@@ -622,6 +627,15 @@ class InBodyPhase(Phase):
                 self.tree.openElements[-1])

     # the real deal
+    def processSpaceCharactersPre(self, data):
+        # Sometimes (start of <pre> blocks) we want to drop leading newlines
+        self.processSpaceCharacters = self.processSpaceCharactersNonPre
+        if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
+            and not self.tree.openElements[-1].hasContent()):
+            data = data[1:]
+        if data:
+            self.tree.insertText(data)
+
     def processCharacters(self, data):
         # XXX The specification says to do this for every character at the
         # moment, but apparently that doesn't match the real world so we don't
@@ -651,6 +665,8 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.endTagP("p")
         self.tree.insertElement(name, attributes)
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersPre

     def startTagForm(self, name, attributes):
         if self.tree.formPointer:
@@ -849,6 +865,9 @@ class InBodyPhase(Phase):
             self.parser.phase.processEndTag(name)

     def endTagBlock(self, name):
+        # Put us back in the right whitespace handling mode
+        if name == "pre":
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
         inScope = self.tree.elementInScope(name)
         if inScope:
             self.tree.generateImpliedEndTags()
@@ -11,11 +11,6 @@ References:
 * http://wiki.whatwg.org/wiki/HtmlVsXhtml

 @@TODO:
-* Produce SAX events based on the produced DOM. This is intended not to
-  support streaming, but rather to support application level compatibility.
-* Optional namespace support
-* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
-  indicates CDATA processing to ensure dual HTML/XHTML compatibility.
 * Selectively lowercase only XHTML, but not foreign markup
 """

@@ -50,6 +45,13 @@ class XMLParser(html5parser.HTMLParser):
             if token["data"]:
                 self.parseError(_("End tag contains unexpected attributes."))

+        elif token["type"] == "Comment":
+            # Rescue CDATA from the comments
+            if (token["data"].startswith("[CDATA[") and
+                token["data"].endswith("]]")):
+                token["type"] = "Characters"
+                token["data"] = token["data"][7:-2]
+
         return token

 class XHTMLParser(XMLParser):
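The comment-token rewrite above can be illustrated on a literal token; the dict shape is assumed from the surrounding tokenizer code:

    # "<![CDATA[1 < 2]]>" reaches the XML parser as a Comment token whose
    # data is "[CDATA[1 < 2]]"; the [7:-2] slice strips that wrapper.
    token = {"type": "Comment", "data": "[CDATA[1 < 2]]"}
    if (token["data"].startswith("[CDATA[") and
        token["data"].endswith("]]")):
        token["type"] = "Characters"
        token["data"] = token["data"][7:-2]
    print token   # {'type': 'Characters', 'data': '1 < 2'}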
@@ -1,5 +1,6 @@
 import _base
-from xml.dom import minidom, Node
+from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
+import new

 import re
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -71,6 +72,10 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
+        def hilite(self, encoding):
+            print 'foo'
+        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
+        setattr(self.dom, 'hilite', method)
         return self

     def doctypeClass(self,name):
@@ -129,3 +134,58 @@ def testSerializer(element):
     serializeElement(element, 0)

     return "\n".join(rv)

+def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
+    if node.nodeType == Node.ELEMENT_NODE:
+        if not nsmap:
+            handler.startElement(node.nodeName, node.attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElement(node.nodeName)
+        else:
+            attributes = dict(node.attributes.itemsNS())
+
+            # gather namespace declarations
+            prefixes = []
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if (attr.namespaceURI == XMLNS_NAMESPACE or
+                    (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
+                    prefix = (attr.localName != 'xmlns' and attr.localName or None)
+                    handler.startPrefixMapping(prefix, attr.nodeValue)
+                    prefixes.append(prefix)
+                    nsmap = nsmap.copy()
+                    nsmap[prefix] = attr.nodeValue
+                    del attributes[(attr.namespaceURI, attr.localName)]
+
+            # apply namespace declarations
+            for attrname in node.attributes.keys():
+                attr = node.getAttributeNode(attrname)
+                if attr.namespaceURI == None and ':' in attr.nodeName:
+                    prefix = attr.nodeName.split(':')[0]
+                    if nsmap.has_key(prefix):
+                        del attributes[(attr.namespaceURI, attr.localName)]
+                        attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+
+            # SAX events
+            ns = node.namespaceURI or nsmap.get(None,None)
+            handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
+            for child in node.childNodes: dom2sax(child, handler, nsmap)
+            handler.endElementNS((ns, node.nodeName), node.nodeName)
+            for prefix in prefixes: handler.endPrefixMapping(prefix)
+
+    elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
+        handler.characters(node.nodeValue)
+
+    elif node.nodeType == Node.DOCUMENT_NODE:
+        handler.startDocument()
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+        handler.endDocument()
+
+    else:
+        # ATTRIBUTE_NODE
+        # ENTITY_NODE
+        # PROCESSING_INSTRUCTION_NODE
+        # COMMENT_NODE
+        # DOCUMENT_TYPE_NODE
+        # NOTATION_NODE
+        pass
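A short usage sketch for the new dom2sax helper: it replays a minidom tree into any SAX ContentHandler, here the standard library's XMLGenerator (the import path assumes the module above; the XML snippet is arbitrary):

    import sys
    from xml.dom import minidom
    from xml.sax.saxutils import XMLGenerator
    from planet.html5lib.treebuilders.dom import dom2sax

    doc = minidom.parseString("<doc><item>text</item></doc>")
    # dom2sax emits startDocument/startElementNS/characters/... events in
    # document order; XMLGenerator serializes them back out.
    dom2sax(doc, XMLGenerator(sys.stdout, 'utf-8'))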
@@ -1,208 +1,5 @@
-try:
-    from xml.etree import ElementTree
-except ImportError:
-    from elementtree import ElementTree
-
-[... some 200 further lines of the old ElementTree tree builder are removed
-here; the implementation moves, nearly verbatim, into the new
-planet/html5lib/treebuilders/etreefull.py shown below ...]
+import etreefull
+
+class TreeBuilder(etreefull.TreeBuilder):
     def getDocument(self):
-        return self.document._element
+        return self.document._element.find("html")
planet/html5lib/treebuilders/etreefull.py (new file, 216 lines)
try:
    from xml.etree import ElementTree
except ImportError:
    from elementtree import ElementTree

import _base

class Element(_base.Node):
    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        # Delete existing attributes first
        # XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        if not(len(self._element)):
            if not self._element.text:
                self._element.text = ""
            self._element.text += data
        elif insertBefore is None:
            # Insert the text as the tail of the last child element
            if not self._element[-1].tail:
                self._element[-1].tail = ""
            self._element[-1].tail += data
        else:
            # Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                if not self._element[index-1].tail:
                    self._element[index-1].tail = ""
                self._element[index-1].tail += data
            else:
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data

    def cloneNode(self):
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        if newParent.childNodes:
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            if not newParent._element.text:
                newParent._element.text = ""
            if self._element.text is not None:
                newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)

class Comment(Element):
    def __init__(self, data):
        # Use the superclass constructor to set all properties on the
        # wrapper element
        Element.__init__(self, None)
        self._element = ElementTree.Comment(data)

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    data = property(_getData, _setData)

class DocumentType(Element):
    def __init__(self, name):
        Element.__init__(self, DocumentType)
        self._element.text = name

class Document(Element):
    def __init__(self):
        Element.__init__(self, Document)

def testSerializer(element):
    rv = []
    finalText = None
    def serializeElement(element, indent=0):
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
            if element.tail:
                finalText = element.tail
        elif element.tag is ElementTree.Comment:
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.iteritems():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)

    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))

    return "\n".join(rv)

def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []
    finalText = None
    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText = element.tail

            for child in element.getchildren():
                serializeElement(child)

        elif element.tag is ElementTree.Comment:
            rv.append("<!--%s-->"%(element.text,))
        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element.getchildren():
                serializeElement(child)

            rv.append("</%s>"%(element.tag,))

        if element.tail:
            rv.append(element.tail)

    serializeElement(element)

    if finalText is not None:
        # append any text that trailed the root element
        rv.append(finalText)

    return "".join(rv)

class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.document._element
@@ -16,11 +16,13 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
     "Xavier Verges Farrero",
     "Jonathan Feinberg",
     "Blair Zajac",
-    "Sam Ruby"]
+    "Sam Ruby",
+    "Louis Nyffenegger"]
 __license__ = "MIT"
-__version__ = "$Rev: 217 $"
+__version__ = "$Rev: 227 $"

 import re
+import sys
 import md5
 import email
 import email.Utils
@@ -41,6 +43,12 @@ import hmac
 from gettext import gettext as _
 from socket import gaierror

+if sys.version_info >= (2,3):
+    from iri2uri import iri2uri
+else:
+    def iri2uri(uri):
+        return uri
+
 __all__ = ['Http', 'Response', 'HttpLib2Error',
   'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
   'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
@@ -51,7 +59,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error',
 debuglevel = 0

 # Python 2.3 support
-if 'sorted' not in __builtins__:
+if sys.version_info < (2,4):
     def sorted(seq):
         seq.sort()
         return seq
@@ -60,7 +68,6 @@ if 'sorted' not in __builtins__:
 def HTTPResponse__getheaders(self):
     """Return list of (header, value) tuples."""
     if self.msg is None:
-        print "================================"
         raise httplib.ResponseNotReady()
     return self.msg.items()

@@ -75,6 +82,8 @@ class RedirectLimit(HttpLib2Error): pass
 class FailedToDecompressContent(HttpLib2Error): pass
 class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
 class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
+class RelativeURIError(HttpLib2Error): pass
+class ServerNotFoundError(HttpLib2Error): pass

 # Open Items:
 # -----------
@@ -118,6 +127,8 @@ def parse_uri(uri):

 def urlnorm(uri):
     (scheme, authority, path, query, fragment) = parse_uri(uri)
+    if not scheme or not authority:
+        raise RelativeURIError("Only absolute URIs are allowed. uri = %s" % uri)
     authority = authority.lower()
     scheme = scheme.lower()
     if not path:
@@ -125,6 +136,7 @@ def urlnorm(uri):
     # Could do syntax based normalization of the URI before
     # computing the digest. See Section 6.2.2 of Std 66.
     request_uri = query and "?".join([path, query]) or path
+    scheme = scheme.lower()
     defrag_uri = scheme + "://" + authority + request_uri
     return scheme, authority, request_uri, defrag_uri

@@ -143,9 +155,10 @@ def safename(filename):
     try:
         if re_url_scheme.match(filename):
             if isinstance(filename,str):
-                filename=filename.decode('utf-8').encode('idna')
+                filename = filename.decode('utf-8')
+                filename = filename.encode('idna')
             else:
-                filename=filename.encode('idna')
+                filename = filename.encode('idna')
     except:
         pass
     if isinstance(filename,unicode):
@@ -260,16 +273,26 @@ def _entry_disposition(response_headers, request_headers):
         now = time.time()
         current_age = max(0, now - date)
         if cc_response.has_key('max-age'):
-            freshness_lifetime = int(cc_response['max-age'])
+            try:
+                freshness_lifetime = int(cc_response['max-age'])
+            except:
+                freshness_lifetime = 0
         elif response_headers.has_key('expires'):
             expires = email.Utils.parsedate_tz(response_headers['expires'])
             freshness_lifetime = max(0, calendar.timegm(expires) - date)
         else:
             freshness_lifetime = 0
         if cc.has_key('max-age'):
-            freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
+            try:
+                freshness_lifetime = int(cc['max-age'])
+            except:
+                freshness_lifetime = 0
         if cc.has_key('min-fresh'):
-            current_age += int(cc['min-fresh'])
+            try:
+                min_fresh = int(cc['min-fresh'])
+            except:
+                min_fresh = 0
+            current_age += min_fresh
         if freshness_lifetime > current_age:
             retval = "FRESH"
     return retval
@@ -418,13 +441,13 @@ class DigestAuthentication(Authentication):

     def response(self, response, content):
         if not response.has_key('authentication-info'):
-            challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
+            challenge = _parse_www_authenticate(response, 'www-authenticate').get('digest', {})
             if 'true' == challenge.get('stale'):
                 self.challenge['nonce'] = challenge['nonce']
                 self.challenge['nc'] = 1
                 return True
         else:
-            updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
+            updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {})

             if updated_challenge.has_key('nextnonce'):
                 self.challenge['nonce'] = updated_challenge['nextnonce']
@@ -440,7 +463,6 @@ class HmacDigestAuthentication(Authentication):
         Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
         challenge = _parse_www_authenticate(response, 'www-authenticate')
         self.challenge = challenge['hmacdigest']
-        print self.challenge
         # TODO: self.challenge['domain']
         self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
         if self.challenge['reason'] not in ['unauthorized', 'integrity']:
@@ -466,9 +488,6 @@ class HmacDigestAuthentication(Authentication):
             self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
             ":", self.challenge['realm']
             ])
-        print response['www-authenticate']
-        print "".join([self.credentials[1], self.challenge['salt']])
-        print "key_str = %s" % self.key
         self.key = self.pwhashmod.new(self.key).hexdigest().lower()

     def request(self, method, request_uri, headers, content):
@@ -479,8 +498,6 @@ class HmacDigestAuthentication(Authentication):
         created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
         cnonce = _cnonce()
         request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
-        print "key = %s" % self.key
-        print "msg = %s" % request_digest
         request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
         headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
             self.credentials[0],
@@ -641,6 +658,8 @@ class Http:
         try:
             conn.request(method, request_uri, body, headers)
             response = conn.getresponse()
+        except gaierror:
+            raise ServerNotFoundError("Unable to find the server at %s" % request_uri)
         except:
             if i == 0:
                 conn.close()
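With the new exception in place, callers can tell DNS failures apart from other transport errors. A hedged sketch (the host name is deliberately unresolvable):

    import httplib2

    h = httplib2.Http()
    try:
        h.request("http://no-such-host.invalid/")
    except httplib2.ServerNotFoundError, e:
        # raised instead of a bare socket.gaierror as of this change
        print "unresolvable:", e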
@@ -752,6 +771,8 @@ a string that contains the response entity body.
         if not headers.has_key('user-agent'):
             headers['user-agent'] = "Python-httplib2/%s" % __version__

+        uri = iri2uri(uri)
+
         (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

         if not self.connections.has_key(scheme+":"+authority):
@@ -780,7 +801,7 @@ a string that contains the response entity body.
             else:
                 cachekey = None

-            if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
+            if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
                 # http://www.w3.org/1999/04/Editing/
                 headers['if-match'] = info['etag']

@@ -815,9 +836,9 @@ a string that contains the response entity body.
                     return (response, content)

                 if entry_disposition == "STALE":
-                    if info.has_key('etag') and not self.ignore_etag:
+                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                         headers['if-none-match'] = info['etag']
-                    if info.has_key('last-modified'):
+                    if info.has_key('last-modified') and not 'last-modified' in headers:
                         headers['if-modified-since'] = info['last-modified']
                 elif entry_disposition == "TRANSPARENT":
                     pass
planet/httplib2/iri2uri.py (new file, 110 lines)
"""
iri2uri

Converts an IRI to a URI.

"""
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = []
__version__ = "1.0.0"
__license__ = "MIT"
__history__ = """
"""

import urlparse


# Convert an IRI to a URI following the rules in RFC 3987
#
# The characters we need to encode and escape are defined in the spec:
#
# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
#        / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
#        / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
#        / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
#        / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
#        / %xD0000-DFFFD / %xE1000-EFFFD

escape_range = [
    (0xA0, 0xD7FF),
    (0xE000, 0xF8FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
    (0x10000, 0x1FFFD),
    (0x20000, 0x2FFFD),
    (0x30000, 0x3FFFD),
    (0x40000, 0x4FFFD),
    (0x50000, 0x5FFFD),
    (0x60000, 0x6FFFD),
    (0x70000, 0x7FFFD),
    (0x80000, 0x8FFFD),
    (0x90000, 0x9FFFD),
    (0xA0000, 0xAFFFD),
    (0xB0000, 0xBFFFD),
    (0xC0000, 0xCFFFD),
    (0xD0000, 0xDFFFD),
    (0xE1000, 0xEFFFD),
    (0xF0000, 0xFFFFD),
    (0x100000, 0x10FFFD)
]

def encode(c):
    retval = c
    i = ord(c)
    for low, high in escape_range:
        if i < low:
            break
        if i >= low and i <= high:
            retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
            break
    return retval


def iri2uri(uri):
    """Convert an IRI to a URI. Note that IRIs must be
    passed in a unicode string. That is, do not utf-8 encode
    the IRI before passing it into the function."""
    if isinstance(uri ,unicode):
        (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
        authority = authority.encode('idna')
        # For each character in 'ucschar' or 'iprivate'
        #  1. encode as utf-8
        #  2. then %-encode each octet of that utf-8
        uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
        uri = "".join([encode(c) for c in uri])
    return uri

if __name__ == "__main__":
    import unittest

    class Test(unittest.TestCase):

        def test_uris(self):
            """Test that URIs are invariant under the transformation."""
            invariant = [
                u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
                u"http://www.ietf.org/rfc/rfc2396.txt",
                u"ldap://[2001:db8::7]/c=GB?objectClass?one",
                u"mailto:John.Doe@example.com",
                u"news:comp.infosystems.www.servers.unix",
                u"tel:+1-816-555-1212",
                u"telnet://192.0.2.16:80/",
                u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ]
            for uri in invariant:
                self.assertEqual(uri, iri2uri(uri))

        def test_iri(self):
            """Test that the right type of escaping is done for each part of the URI."""
            self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
            self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
            self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
            self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
            self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))

    unittest.main()
@@ -6,6 +6,7 @@ Process a set of configuration defined sanitations on a given feed.
 import time
 # Planet modules
 import planet, config, shell
+from planet import feedparser

 type_map = {'text': 'text/plain', 'html': 'text/html',
     'xhtml': 'application/xhtml+xml'}
@@ -92,3 +93,40 @@ def scrub(feed_uri, data):
             or entry['published_parsed'] <= now) and
             (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)]

+    scrub_xmlbase = config.xml_base(feed_uri)
+
+    # resolve relative URIs and sanitize
+    for entry in data.entries + [data.feed]:
+        for key in entry.keys():
+            if key == 'content':
+                node = entry.content[0]
+            elif key.endswith('_detail'):
+                node = entry[key]
+            else:
+                continue
+
+            if not node.has_key('type'): continue
+            if not 'html' in node['type']: continue
+            if not node.has_key('value'): continue
+
+            if node.has_key('base'):
+                if scrub_xmlbase:
+                    if scrub_xmlbase == 'feed_alternate':
+                        if entry.has_key('source') and \
+                            entry.source.has_key('link'):
+                            node['base'] = entry.source.link
+                        elif data.feed.has_key('link'):
+                            node['base'] = data.feed.link
+                    elif scrub_xmlbase == 'entry_alternate':
+                        if entry.has_key('link'):
+                            node['base'] = entry.link
+                    else:
+                        node['base'] = feedparser._urljoin(
+                            node['base'], scrub_xmlbase)
+
+                node['value'] = feedparser._resolveRelativeURIs(
+                    node.value, node.base, 'utf-8', node.type)
+
+            node['value'] = feedparser._sanitizeHTML(
+                node.value, 'utf-8', node.type)
@@ -254,7 +254,6 @@ def writeCache(feed_uri, feed_info, data):

 def httpThread(thread_index, input_queue, output_queue, log):
     import httplib2, md5
-    from socket import gaierror, error
     from httplib import BadStatusLine

     h = httplib2.Http(config.http_cache_directory())
@@ -304,13 +303,12 @@ def httpThread(thread_index, input_queue, output_queue, log):
             if resp.has_key('content-encoding'):
                 del resp['content-encoding']
             setattr(feed, 'headers', resp)
-        except gaierror:
-            log.error("Fail to resolve server name %s via %d",
-                uri, thread_index)
         except BadStatusLine:
             log.error("Bad Status Line received for %s via %d",
                 uri, thread_index)
-        except error, e:
+        except httplib2.HttpLib2Error, e:
+            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
+        except socket.error, e:
             if e.__class__.__name__.lower()=='timeout':
                 feed.headers['status'] = '408'
                 log.warn("Timeout in thread-%d", thread_index)
@@ -3,6 +3,7 @@
 import unittest, os, sys, glob, new, re, StringIO, time
 from planet import feedparser
 from planet.reconstitute import reconstitute
+from planet.scrub import scrub

 testfiles = 'tests/data/reconstitute/%s.xml'

@@ -23,6 +24,7 @@ class ReconstituteTest(unittest.TestCase):
         # parse and reconstitute to a string
         work = StringIO.StringIO()
         results = feedparser.parse(data)
+        scrub(testfiles%name, results)
         reconstitute(results, results.entries[0]).writexml(work)

         # verify the results
@@ -6,7 +6,7 @@ from planet.scrub import scrub
 from planet import feedparser, config

 feed = '''
-  <feed xmlns='http://www.w3.org/2005/Atom'>
+  <feed xmlns='http://www.w3.org/2005/Atom' xml:base="http://example.com/">
     <author><name>F&ouml;o</name></author>
     <entry xml:lang="en">
       <id>ignoreme</id>
@@ -15,7 +15,9 @@ feed = '''
       <title>F&ouml;o</title>
       <summary>F&ouml;o</summary>
       <content>F&ouml;o</content>
+      <link href="http://example.com/entry/1/"/>
       <source>
+        <link href="http://example.com/feed/"/>
         <author><name>F&ouml;o</name></author>
       </source>
     </entry>
@@ -82,3 +84,33 @@ class ScrubTest(unittest.TestCase):
         data = deepcopy(base)
         scrub('testfeed', data)
         self.assertEqual(0, len(data.entries))

+    def test_scrub_xmlbase(self):
+        base = feedparser.parse(feed)
+        self.assertEqual('http://example.com/',
+            base.entries[0].title_detail.base)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'xml_base', 'feed_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/feed/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'entry_alternate')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/entry/1/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'base/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.com/base/',
+            data.entries[0].title_detail.base)
+
+        config.parser.set('testfeed', 'xml_base', 'http://example.org/data/')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual('http://example.org/data/',
+            data.entries[0].title_detail.base)
@@ -35,7 +35,7 @@
       <th>Name</th>
       <th>Format</th>
       <xsl:if test="//planet:ignore_in_feed | //planet:filters |
-                    //planet:*[contains(local-name(),'_type')]">
+                    //planet:xml_base | //planet:*[contains(local-name(),'_type')]">
         <th>Notes</th>
       </xsl:if>
     </tr>
@@ -128,12 +128,12 @@
       </a>
     </td>
     <td><xsl:value-of select="planet:format"/></td>
-    <xsl:if test="planet:ignore_in_feed | planet:filters |
+    <xsl:if test="planet:ignore_in_feed | planet:filters | planet:xml_base |
                   planet:*[contains(local-name(),'_type')]">
       <td>
         <dl>
           <xsl:for-each select="planet:ignore_in_feed | planet:filters |
-                                planet:*[contains(local-name(),'_type')]">
+                                planet:xml_base | planet:*[contains(local-name(),'_type')]">
             <xsl:sort select="local-name()"/>
             <dt><xsl:value-of select="local-name()"/></dt>
             <dd><xsl:value-of select="."/></dd>