Boatload of changes from Sam. Most especially, switch from BeautifulSoup to html5lib.
commit c0b5c38d85

THANKS (2 lines changed)

@@ -4,7 +4,7 @@ Elias Torres - FOAF OnlineAccounts
Jacques Distler - Template patches
Michael Koziarski - HTTP Auth fix
Brian Ewins - Win32 / Portalocker
Joe Gregorio - Invoke same version of Python for filters
Joe Gregorio - python versioning for filters, verbose tests, spider_threads
Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc

@@ -98,7 +98,17 @@ use for logging output. Note: this configuration value is processed
<dd>Number of seconds to wait for any given feed</dd>
<dt><del>new_feed_items</del></dt>
<dd>Number of items to take from new feeds</dd>
<dt><ins>spider_threads</ins></dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
<dt><ins>http_cache_directory</ins></dt>
<dd>If <code>spider_threads</code> is specified, you can also specify a
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
</blockquote>

<h3 id="default"><code>[DEFAULT]</code></h3>
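For illustration, a minimal configuration fragment using the two options documented above might look like the following sketch. The [Planet] section name is the standard Venus configuration section; the specific values shown are assumptions for the example, not part of this change:

[Planet]
# spider with two worker threads instead of the traditional single-threaded spider
spider_threads = 2
# keep an additional HTTP cache in front of the Venus cache;
# a relative path is resolved relative to cache_directory
http_cache_directory = http_cache
cache_directory = cache

With spider_threads left at its default of 0, no threads are used and spidering follows the traditional algorithm, as the documentation above notes.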
@@ -23,7 +23,7 @@ window.onload=function() {

p = document.createElement('p');
var a = document.createElement('a');
a.setAttribute('href',base+'index.html');
a.setAttribute('href',base);
a.appendChild(document.createTextNode('Download'));
p.appendChild(a);
p.appendChild(document.createTextNode(" \u00b7 "));

@@ -33,8 +33,9 @@
<ul>
<li><a href="http://www.planetplanet.org/">Planet</a></li>
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
<li><a href="http://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a></li>
<li><a href="http://code.google.com/p/html5lib/">html5lib</a></li>
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
<li><a href="http://bitworking.org/projects/httplib2/">httplib2</a></li>
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
</ul>

@@ -69,7 +69,7 @@ right directory.</p></li>
<p>Build your own themes, templates, or filters! And share!</p></li>
</ol>

<h3>Mac OS X and Fink Instructions</h3>
<h3 id="macosx">Mac OS X and Fink Instructions</h3>

<p>
The <a href="http://fink.sourceforge.net/">Fink Project</a> packages
@@ -101,12 +101,20 @@ not yet ported to the newer python so Venus will be less featureful.
may want to explicitly specify <code>python2.4</code>.</p></li>
</ol>

<h3>Ubuntu Linux (Edgy Eft) instructions</h3>
<h3 id="ubuntu">Ubuntu Linux (Edgy Eft) instructions</h3>

<p>Before starting, issue the following command:</p>
<ul>
<li><code>sudo apt-get install bzr python2.4-librdf</code></li>
</ul>

<blockquote><pre>sudo apt-get install bzr python2.4-librdf</pre></blockquote>

<h3 id="python22">Python 2.2 instructions</h3>

<p>If you are running Python 2.2, you may also need to install <a href="http://pyxml.sourceforge.net/">pyxml</a>. If the
following runs without error, you do <b>not</b> have the problem.</p>
<blockquote><pre>python -c "__import__('xml.dom.minidom').dom.minidom.parseString('<entry xml:lang=\"en\"/>')"</pre></blockquote>
<p>Installation of pyxml varies by platform. For Ubuntu Linux (Dapper Drake), issue the following command:</p>

<blockquote><pre>sudo apt-get install python2.2-xml</pre></blockquote>

</body>
</html>

@@ -11,7 +11,7 @@
<h2>Normalization</h2>
<p>Venus builds on, and extends, the <a
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
href="http://www.crummy.com/software/BeautifulSoup/">BeautifulSoup</a> to
href="http://code.google.com/p/html5lib/">html5lib</a> to
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
meaning that you don't have to worry about funky feeds, tag soup, or character
encoding.</p>
@@ -48,7 +48,7 @@ other security risks are removed.</p>
links are resolved</a> within the HTML. This is also done for links
in other areas in the feed too.</p>
<p>Finally, unmatched tags are closed. This is done with a
<a href="http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20HTML">knowledge of the semantics of HTML</a>. Additionally, a
<a href="http://code.google.com/p/html5lib/">knowledge of the semantics of HTML</a>. Additionally, a
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
subset of MathML</a>, as well as a
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
@@ -69,8 +69,9 @@ are converted into
<li><a href="http://www.feedparser.org/docs/reference-entry-content.html">content</a></li>
</ul>
<p>If no <a href="http://www.feedparser.org/docs/reference-feed-
updated.html">updated</a> dates are found in an entry, or if the dates found
are in the future, the current time is substituted.</p>
updated.html">updated</a> dates are found in an entry, the updated date from
the feed is used. If no updated date is found in either the feed or
the entry, the current time is substituted.</p>
<h3 id="overrides">Overrides</h3>
<p>All of the above describes what Venus does automatically, either directly
or through its dependencies. There are a number of errors which can not
@@ -87,6 +88,13 @@ case of feeds where the <code>id</code>, <code>updated</code> or
attributes on these elements.</li>
<li><code>name_type</code> does something similar for
<a href="http://www.feedparser.org/docs/reference-entry-author_detail.html#reference.entry.author_detail.name">author names</a></li>
<li><code>future_dates</code> allows you to specify how to deal with dates which are in the future.
<ul style="margin:0">
<li><code>ignore_date</code> will cause the date to be ignored (and will therefore default to the time the entry was first seen) until the feed is updated and the time indicated is past, at which point the entry will be updated with the new date.</li>
<li><code>ignore_entry</code> will cause the entire entry containing the future date to be ignored until the date is past.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
</ul>
</body>
</html>

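As a concrete illustration of the future_dates override described above, a per-subscription configuration entry might look like the following sketch. The feed URL is hypothetical; the option name and values come from the list above:

[http://example.com/feed.xml]
# hold back entries whose dates are in the future until those dates pass
future_dates = ignore_entry

Setting future_dates = ignore_date instead would keep such entries but fall back to the time they were first seen, and any other value (the default, keep) leaves future dates untouched.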
@@ -69,7 +69,7 @@
<g font-size="32" fill="#FFF" text-anchor="middle">
<text x="350" y="380" fill="#F00">Spider</text>
<text x="350" y="460">Universal Feed Parser</text>
<text x="350" y="530">BeautifulSoup</text>
<text x="350" y="530">html5lib</text>
<text x="350" y="600">Reconstitute</text>
<text x="350" y="750">Filter(s)</text>
<text x="850" y="250" fill="#F00">Splice</text>

filters/stripAd/google_ad_map.sed (new file, 1 line)

@@ -0,0 +1 @@
s|<p><map name="google_ad_map.*</p>||
@@ -54,7 +54,10 @@ if __name__ == "__main__":

    if not offline:
        from planet import spider
        spider.spiderPlanet(only_if_new=only_if_new)
        try:
            spider.spiderPlanet(only_if_new=only_if_new)
        except Exception, e:
            print e

    from planet import splice
    doc = splice.splice()

File diff suppressed because it is too large
@@ -16,10 +16,11 @@ def getLogger(level, format):
|
||||
|
||||
try:
|
||||
import logging
|
||||
logging.basicConfig(format=format)
|
||||
except:
|
||||
import compat_logging as logging
|
||||
logging.basicConfig(format=format)
|
||||
|
||||
logging.basicConfig(format=format)
|
||||
logging.getLogger().setLevel(logging.getLevelName(level))
|
||||
logger = logging.getLogger("planet.runner")
|
||||
try:
|
||||
@@ -30,25 +31,4 @@ def getLogger(level, format):
|
||||
return logger
|
||||
|
||||
|
||||
def setTimeout(timeout):
|
||||
""" time out rather than hang forever on ultra-slow servers."""
|
||||
if timeout:
|
||||
try:
|
||||
timeout = float(timeout)
|
||||
except:
|
||||
logger.warning("Timeout set to invalid value '%s', skipping", timeout)
|
||||
timeout = None
|
||||
|
||||
if timeout:
|
||||
try:
|
||||
from planet import timeoutsocket
|
||||
timeoutsocket.setDefaultSocketTimeout(timeout)
|
||||
logger.info("Socket timeout set to %d seconds", timeout)
|
||||
except ImportError:
|
||||
import socket
|
||||
if hasattr(socket, 'setdefaulttimeout'):
|
||||
logger.debug("timeoutsocket not found, using python function")
|
||||
socket.setdefaulttimeout(timeout)
|
||||
logger.info("Socket timeout set to %d seconds", timeout)
|
||||
else:
|
||||
logger.error("Unable to set timeout to %d seconds", timeout)
|
||||
|
@@ -70,6 +70,11 @@ def __init__():
|
||||
setattr(config, name, lambda default=default: get(None,name,default))
|
||||
planet_predefined_options.append(name)
|
||||
|
||||
# define an int planet-level variable
|
||||
def define_planet_int(name, default=0):
|
||||
setattr(config, name, lambda : int(get(None,name,default)))
|
||||
planet_predefined_options.append(name)
|
||||
|
||||
# define a list planet-level variable
|
||||
def define_planet_list(name, default=''):
|
||||
setattr(config, name, lambda : expand(get(None,name,default)))
|
||||
@@ -91,7 +96,6 @@ def __init__():
|
||||
define_planet('cache_directory', "cache")
|
||||
define_planet('log_level', "WARNING")
|
||||
define_planet('log_format', "%(levelname)s:%(name)s:%(message)s")
|
||||
define_planet('feed_timeout', 20)
|
||||
define_planet('date_format', "%B %d, %Y %I:%M %p")
|
||||
define_planet('new_date_format', "%B %d, %Y")
|
||||
define_planet('generator', 'Venus')
|
||||
@@ -100,6 +104,9 @@ def __init__():
|
||||
define_planet('owner_email', '')
|
||||
define_planet('output_theme', '')
|
||||
define_planet('output_dir', 'output')
|
||||
define_planet('spider_threads', 0)
|
||||
|
||||
define_planet_int('feed_timeout', 20)
|
||||
|
||||
define_planet_list('template_files')
|
||||
define_planet_list('bill_of_materials')
|
||||
@@ -117,6 +124,7 @@ def __init__():
|
||||
define_tmpl('title_type', '')
|
||||
define_tmpl('summary_type', '')
|
||||
define_tmpl('content_type', '')
|
||||
define_tmpl('future_dates', 'keep')
|
||||
|
||||
def load(config_file):
|
||||
""" initialize and load a configuration"""
|
||||
@@ -282,10 +290,17 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
|
||||
except:
|
||||
logger.exception("Unable to read %s readinglist", list)
|
||||
|
||||
def http_cache_directory():
    if parser.has_option('Planet', 'http_cache_directory'):
        return os.path.join(cache_directory(),
            parser.get('Planet', 'http_cache_directory'))
    else:
        return os.path.join(cache_directory(), "cache")
|
||||
|
||||
def cache_sources_directory():
|
||||
if parser.has_option('Planet', 'cache_sources_directory'):
|
||||
parser.get('Planet', 'cache_sources_directory')
|
||||
return os.path.join(cache_directory(),
|
||||
parser.get('Planet', 'cache_sources_directory'))
|
||||
else:
|
||||
return os.path.join(cache_directory(), 'sources')
|
||||
|
||||
|
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
|
||||
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
|
||||
"""
|
||||
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.144 $"[11:16] + "-cvs"
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
|
||||
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
planet/html5lib/__init__.py (new file, 34 lines)

@@ -0,0 +1,34 @@
|
||||
"""
|
||||
HTML parsing library based on the WHATWG "HTML5"
|
||||
specification. The parser is designed to be compatible with existing
|
||||
HTML found in the wild and implements well-defined error recovery that
|
||||
is largely compatible with modern desktop web browsers.
|
||||
|
||||
Example usage:
|
||||
|
||||
import html5lib
|
||||
f = open("my_document.html")
|
||||
p = html5lib.HTMLParser()
|
||||
tree = p.parse(f)
|
||||
|
||||
By default the returned treeformat is a custom "simpletree", similar
|
||||
to a DOM tree; each element has attributes childNodes and parent
|
||||
holding the parents and children respectively, a name attribute
|
||||
holding the Element name, a data attribute holding the element data
|
||||
(for text and comment nodes) and an attributes dictionary holding the
|
||||
element's attributes (for Element nodes).
|
||||
|
||||
To get output in ElementTree format:
|
||||
|
||||
import html5lib
|
||||
from html5lib.treebuilders import etree
|
||||
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
|
||||
elementtree = p.parse(f)
|
||||
|
||||
Note: Because HTML documents support various features not in the
|
||||
default ElementTree (e.g. doctypes), we supply our own simple
serializer: html5lib.treebuilders.etree.tostring. At present this does not
have the encoding support offered by the elementtree serializer.
|
||||
|
||||
"""
|
||||
from html5parser import HTMLParser
|
planet/html5lib/constants.py (new file, 456 lines)

@@ -0,0 +1,456 @@
|
||||
import string
|
||||
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
EOF = None
|
||||
|
||||
contentModelFlags = {
|
||||
"PCDATA":0,
|
||||
"RCDATA":1,
|
||||
"CDATA":2,
|
||||
"PLAINTEXT":3
|
||||
}
|
||||
|
||||
scopingElements = frozenset((
|
||||
"button",
|
||||
"caption",
|
||||
"html",
|
||||
"marquee",
|
||||
"object",
|
||||
"table",
|
||||
"td",
|
||||
"th"
|
||||
))
|
||||
|
||||
formattingElements = frozenset((
|
||||
"a",
|
||||
"b",
|
||||
"big",
|
||||
"em",
|
||||
"font",
|
||||
"i",
|
||||
"nobr",
|
||||
"s",
|
||||
"small",
|
||||
"strike",
|
||||
"strong",
|
||||
"tt",
|
||||
"u"
|
||||
))
|
||||
|
||||
specialElements = frozenset((
|
||||
"address",
|
||||
"area",
|
||||
"base",
|
||||
"basefont",
|
||||
"bgsound",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"center",
|
||||
"col",
|
||||
"colgroup",
|
||||
"dd",
|
||||
"dir",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"form",
|
||||
"frame",
|
||||
"frameset",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"hr",
|
||||
"iframe",
|
||||
"image",
|
||||
"img",
|
||||
"input",
|
||||
"isindex",
|
||||
"li",
|
||||
"link",
|
||||
"listing",
|
||||
"menu",
|
||||
"meta",
|
||||
"noembed",
|
||||
"noframes",
|
||||
"noscript",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"p",
|
||||
"param",
|
||||
"plaintext",
|
||||
"pre",
|
||||
"script",
|
||||
"select",
|
||||
"spacer",
|
||||
"style",
|
||||
"tbody",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"title",
|
||||
"tr",
|
||||
"ul",
|
||||
"wbr"
|
||||
))
|
||||
|
||||
spaceCharacters = frozenset((
|
||||
u"\t",
|
||||
u"\n",
|
||||
u"\u000B",
|
||||
u"\u000C",
|
||||
u" "
|
||||
))
|
||||
|
||||
tableInsertModeElements = frozenset((
|
||||
"table",
|
||||
"tbody",
|
||||
"tfoot",
|
||||
"thead",
|
||||
"tr"
|
||||
))
|
||||
|
||||
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||
asciiLetters = frozenset(string.ascii_letters)
|
||||
digits = frozenset(string.digits)
|
||||
hexDigits = frozenset(string.hexdigits)
|
||||
|
||||
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
|
||||
for c in string.ascii_uppercase])
|
||||
|
||||
# Heading elements need to be ordered
|
||||
headingElements = (
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6"
|
||||
)
|
||||
|
||||
# XXX What about event-source and command?
|
||||
voidElements = frozenset((
|
||||
"base",
|
||||
"link",
|
||||
"meta",
|
||||
"hr",
|
||||
"br",
|
||||
"img",
|
||||
"embed",
|
||||
"param",
|
||||
"area",
|
||||
"col",
|
||||
"input"
|
||||
))
|
||||
|
||||
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||
# therefore can't be a frozenset.
|
||||
entitiesWindows1252 = (
|
||||
8364, # 0x80 0x20AC EURO SIGN
|
||||
65533, # 0x81 UNDEFINED
|
||||
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||
8224, # 0x86 0x2020 DAGGER
|
||||
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||
65533, # 0x8D UNDEFINED
|
||||
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||
65533, # 0x8F UNDEFINED
|
||||
65533, # 0x90 UNDEFINED
|
||||
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||
8226, # 0x95 0x2022 BULLET
|
||||
8211, # 0x96 0x2013 EN DASH
|
||||
8212, # 0x97 0x2014 EM DASH
|
||||
732, # 0x98 0x02DC SMALL TILDE
|
||||
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||
65533, # 0x9D UNDEFINED
|
||||
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
)
|
||||
|
||||
entities = {
|
||||
"AElig": u"\u00C6",
|
||||
"Aacute": u"\u00C1",
|
||||
"Acirc": u"\u00C2",
|
||||
"Agrave": u"\u00C0",
|
||||
"Alpha": u"\u0391",
|
||||
"Aring": u"\u00C5",
|
||||
"Atilde": u"\u00C3",
|
||||
"Auml": u"\u00C4",
|
||||
"Beta": u"\u0392",
|
||||
"Ccedil": u"\u00C7",
|
||||
"Chi": u"\u03A7",
|
||||
"Dagger": u"\u2021",
|
||||
"Delta": u"\u0394",
|
||||
"ETH": u"\u00D0",
|
||||
"Eacute": u"\u00C9",
|
||||
"Ecirc": u"\u00CA",
|
||||
"Egrave": u"\u00C8",
|
||||
"Epsilon": u"\u0395",
|
||||
"Eta": u"\u0397",
|
||||
"Euml": u"\u00CB",
|
||||
"Gamma": u"\u0393",
|
||||
"Iacute": u"\u00CD",
|
||||
"Icirc": u"\u00CE",
|
||||
"Igrave": u"\u00CC",
|
||||
"Iota": u"\u0399",
|
||||
"Iuml": u"\u00CF",
|
||||
"Kappa": u"\u039A",
|
||||
"Lambda": u"\u039B",
|
||||
"Mu": u"\u039C",
|
||||
"Ntilde": u"\u00D1",
|
||||
"Nu": u"\u039D",
|
||||
"OElig": u"\u0152",
|
||||
"Oacute": u"\u00D3",
|
||||
"Ocirc": u"\u00D4",
|
||||
"Ograve": u"\u00D2",
|
||||
"Omega": u"\u03A9",
|
||||
"Omicron": u"\u039F",
|
||||
"Oslash": u"\u00D8",
|
||||
"Otilde": u"\u00D5",
|
||||
"Ouml": u"\u00D6",
|
||||
"Phi": u"\u03A6",
|
||||
"Pi": u"\u03A0",
|
||||
"Prime": u"\u2033",
|
||||
"Psi": u"\u03A8",
|
||||
"Rho": u"\u03A1",
|
||||
"Scaron": u"\u0160",
|
||||
"Sigma": u"\u03A3",
|
||||
"THORN": u"\u00DE",
|
||||
"Tau": u"\u03A4",
|
||||
"Theta": u"\u0398",
|
||||
"Uacute": u"\u00DA",
|
||||
"Ucirc": u"\u00DB",
|
||||
"Ugrave": u"\u00D9",
|
||||
"Upsilon": u"\u03A5",
|
||||
"Uuml": u"\u00DC",
|
||||
"Xi": u"\u039E",
|
||||
"Yacute": u"\u00DD",
|
||||
"Yuml": u"\u0178",
|
||||
"Zeta": u"\u0396",
|
||||
"aacute": u"\u00E1",
|
||||
"acirc": u"\u00E2",
|
||||
"acute": u"\u00B4",
|
||||
"aelig": u"\u00E6",
|
||||
"agrave": u"\u00E0",
|
||||
"alefsym": u"\u2135",
|
||||
"alpha": u"\u03B1",
|
||||
"amp": u"\u0026",
|
||||
"AMP": u"\u0026",
|
||||
"and": u"\u2227",
|
||||
"ang": u"\u2220",
|
||||
"apos": u"\u0027",
|
||||
"aring": u"\u00E5",
|
||||
"asymp": u"\u2248",
|
||||
"atilde": u"\u00E3",
|
||||
"auml": u"\u00E4",
|
||||
"bdquo": u"\u201E",
|
||||
"beta": u"\u03B2",
|
||||
"brvbar": u"\u00A6",
|
||||
"bull": u"\u2022",
|
||||
"cap": u"\u2229",
|
||||
"ccedil": u"\u00E7",
|
||||
"cedil": u"\u00B8",
|
||||
"cent": u"\u00A2",
|
||||
"chi": u"\u03C7",
|
||||
"circ": u"\u02C6",
|
||||
"clubs": u"\u2663",
|
||||
"cong": u"\u2245",
|
||||
"copy": u"\u00A9",
|
||||
"COPY": u"\u00A9",
|
||||
"crarr": u"\u21B5",
|
||||
"cup": u"\u222A",
|
||||
"curren": u"\u00A4",
|
||||
"dArr": u"\u21D3",
|
||||
"dagger": u"\u2020",
|
||||
"darr": u"\u2193",
|
||||
"deg": u"\u00B0",
|
||||
"delta": u"\u03B4",
|
||||
"diams": u"\u2666",
|
||||
"divide": u"\u00F7",
|
||||
"eacute": u"\u00E9",
|
||||
"ecirc": u"\u00EA",
|
||||
"egrave": u"\u00E8",
|
||||
"empty": u"\u2205",
|
||||
"emsp": u"\u2003",
|
||||
"ensp": u"\u2002",
|
||||
"epsilon": u"\u03B5",
|
||||
"equiv": u"\u2261",
|
||||
"eta": u"\u03B7",
|
||||
"eth": u"\u00F0",
|
||||
"euml": u"\u00EB",
|
||||
"euro": u"\u20AC",
|
||||
"exist": u"\u2203",
|
||||
"fnof": u"\u0192",
|
||||
"forall": u"\u2200",
|
||||
"frac12": u"\u00BD",
|
||||
"frac14": u"\u00BC",
|
||||
"frac34": u"\u00BE",
|
||||
"frasl": u"\u2044",
|
||||
"gamma": u"\u03B3",
|
||||
"ge": u"\u2265",
|
||||
"gt": u"\u003E",
|
||||
"GT": u"\u003E",
|
||||
"hArr": u"\u21D4",
|
||||
"harr": u"\u2194",
|
||||
"hearts": u"\u2665",
|
||||
"hellip": u"\u2026",
|
||||
"iacute": u"\u00ED",
|
||||
"icirc": u"\u00EE",
|
||||
"iexcl": u"\u00A1",
|
||||
"igrave": u"\u00EC",
|
||||
"image": u"\u2111",
|
||||
"infin": u"\u221E",
|
||||
"int": u"\u222B",
|
||||
"iota": u"\u03B9",
|
||||
"iquest": u"\u00BF",
|
||||
"isin": u"\u2208",
|
||||
"iuml": u"\u00EF",
|
||||
"kappa": u"\u03BA",
|
||||
"lArr": u"\u21D0",
|
||||
"lambda": u"\u03BB",
|
||||
"lang": u"\u2329",
|
||||
"laquo": u"\u00AB",
|
||||
"larr": u"\u2190",
|
||||
"lceil": u"\u2308",
|
||||
"ldquo": u"\u201C",
|
||||
"le": u"\u2264",
|
||||
"lfloor": u"\u230A",
|
||||
"lowast": u"\u2217",
|
||||
"loz": u"\u25CA",
|
||||
"lrm": u"\u200E",
|
||||
"lsaquo": u"\u2039",
|
||||
"lsquo": u"\u2018",
|
||||
"lt": u"\u003C",
|
||||
"LT": u"\u003C",
|
||||
"macr": u"\u00AF",
|
||||
"mdash": u"\u2014",
|
||||
"micro": u"\u00B5",
|
||||
"middot": u"\u00B7",
|
||||
"minus": u"\u2212",
|
||||
"mu": u"\u03BC",
|
||||
"nabla": u"\u2207",
|
||||
"nbsp": u"\u00A0",
|
||||
"ndash": u"\u2013",
|
||||
"ne": u"\u2260",
|
||||
"ni": u"\u220B",
|
||||
"not": u"\u00AC",
|
||||
"notin": u"\u2209",
|
||||
"nsub": u"\u2284",
|
||||
"ntilde": u"\u00F1",
|
||||
"nu": u"\u03BD",
|
||||
"oacute": u"\u00F3",
|
||||
"ocirc": u"\u00F4",
|
||||
"oelig": u"\u0153",
|
||||
"ograve": u"\u00F2",
|
||||
"oline": u"\u203E",
|
||||
"omega": u"\u03C9",
|
||||
"omicron": u"\u03BF",
|
||||
"oplus": u"\u2295",
|
||||
"or": u"\u2228",
|
||||
"ordf": u"\u00AA",
|
||||
"ordm": u"\u00BA",
|
||||
"oslash": u"\u00F8",
|
||||
"otilde": u"\u00F5",
|
||||
"otimes": u"\u2297",
|
||||
"ouml": u"\u00F6",
|
||||
"para": u"\u00B6",
|
||||
"part": u"\u2202",
|
||||
"permil": u"\u2030",
|
||||
"perp": u"\u22A5",
|
||||
"phi": u"\u03C6",
|
||||
"pi": u"\u03C0",
|
||||
"piv": u"\u03D6",
|
||||
"plusmn": u"\u00B1",
|
||||
"pound": u"\u00A3",
|
||||
"prime": u"\u2032",
|
||||
"prod": u"\u220F",
|
||||
"prop": u"\u221D",
|
||||
"psi": u"\u03C8",
|
||||
"quot": u"\u0022",
|
||||
"QUOT": u"\u0022",
|
||||
"rArr": u"\u21D2",
|
||||
"radic": u"\u221A",
|
||||
"rang": u"\u232A",
|
||||
"raquo": u"\u00BB",
|
||||
"rarr": u"\u2192",
|
||||
"rceil": u"\u2309",
|
||||
"rdquo": u"\u201D",
|
||||
"real": u"\u211C",
|
||||
"reg": u"\u00AE",
|
||||
"REG": u"\u00AE",
|
||||
"rfloor": u"\u230B",
|
||||
"rho": u"\u03C1",
|
||||
"rlm": u"\u200F",
|
||||
"rsaquo": u"\u203A",
|
||||
"rsquo": u"\u2019",
|
||||
"sbquo": u"\u201A",
|
||||
"scaron": u"\u0161",
|
||||
"sdot": u"\u22C5",
|
||||
"sect": u"\u00A7",
|
||||
"shy": u"\u00AD",
|
||||
"sigma": u"\u03C3",
|
||||
"sigmaf": u"\u03C2",
|
||||
"sim": u"\u223C",
|
||||
"spades": u"\u2660",
|
||||
"sub": u"\u2282",
|
||||
"sube": u"\u2286",
|
||||
"sum": u"\u2211",
|
||||
"sup": u"\u2283",
|
||||
"sup1": u"\u00B9",
|
||||
"sup2": u"\u00B2",
|
||||
"sup3": u"\u00B3",
|
||||
"supe": u"\u2287",
|
||||
"szlig": u"\u00DF",
|
||||
"tau": u"\u03C4",
|
||||
"there4": u"\u2234",
|
||||
"theta": u"\u03B8",
|
||||
"thetasym": u"\u03D1",
|
||||
"thinsp": u"\u2009",
|
||||
"thorn": u"\u00FE",
|
||||
"tilde": u"\u02DC",
|
||||
"times": u"\u00D7",
|
||||
"trade": u"\u2122",
|
||||
"uArr": u"\u21D1",
|
||||
"uacute": u"\u00FA",
|
||||
"uarr": u"\u2191",
|
||||
"ucirc": u"\u00FB",
|
||||
"ugrave": u"\u00F9",
|
||||
"uml": u"\u00A8",
|
||||
"upsih": u"\u03D2",
|
||||
"upsilon": u"\u03C5",
|
||||
"uuml": u"\u00FC",
|
||||
"weierp": u"\u2118",
|
||||
"xi": u"\u03BE",
|
||||
"yacute": u"\u00FD",
|
||||
"yen": u"\u00A5",
|
||||
"yuml": u"\u00FF",
|
||||
"zeta": u"\u03B6",
|
||||
"zwj": u"\u200D",
|
||||
"zwnj": u"\u200C"
|
||||
}
|
planet/html5lib/html5parser.py (new file, 1719 lines)

File diff suppressed because it is too large
planet/html5lib/inputstream.py (new file, 202 lines)

@@ -0,0 +1,202 @@
|
||||
import codecs
|
||||
import re
|
||||
|
||||
from constants import EOF
|
||||
|
||||
class HTMLInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
|
||||
This class takes care of character encoding and removing or replacing
|
||||
incorrect byte-sequences and also provides column and line tracking.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, encoding=None):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
for use by the HTML5Lib.
|
||||
|
||||
source can be either a file-object, local filename or a string.
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
"""
|
||||
# List of where new lines occur
|
||||
self.newLines = []
|
||||
|
||||
# Encoding Information
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Raw Stream
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Try to detect the encoding of the stream by looking for a BOM
|
||||
detectedEncoding = self.detectEncoding()
|
||||
|
||||
# If an encoding was specified or detected from the BOM don't allow
|
||||
# the encoding to be changed further into the stream
|
||||
if self.charEncoding or detectedEncoding:
|
||||
self.allowEncodingOverride = False
|
||||
else:
|
||||
self.allowEncodingOverride = True
|
||||
|
||||
# If an encoding wasn't specified, use the encoding detected from the
|
||||
# BOM, if present, otherwise use the default encoding
|
||||
if not self.charEncoding:
|
||||
self.charEncoding = detectedEncoding or "cp1252"
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||
|
||||
# Normalize new lines and null characters
|
||||
uString = re.sub('\r\n?', '\n', uString)
|
||||
uString = re.sub('\x00', '\xFFFD', uString)
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
self.dataStream = uString
|
||||
|
||||
self.queue = []
|
||||
|
||||
# Reset position in the list to read from
|
||||
self.reset()
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
|
||||
source can be either a file object, local filename or a string.
|
||||
|
||||
"""
|
||||
# Already a file object
|
||||
if hasattr(source, 'read'):
|
||||
stream = source
|
||||
else:
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
import cStringIO
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
return stream
|
||||
|
||||
def detectEncoding(self):
|
||||
# Attempts to detect the character encoding of the stream. If
|
||||
# an encoding can be determined from the BOM return the name of the
|
||||
# encoding otherwise return None
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
self.rawStream.seek(0)
|
||||
string = self.rawStream.read(4)
|
||||
|
||||
# Try detecting the BOM using bytes from the string
|
||||
encoding = bomDict.get(string[:3]) # UTF-8
|
||||
seek = 3
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string[:2]) # UTF-16
|
||||
seek = 2
|
||||
if not encoding:
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def declareEncoding(self, encoding):
|
||||
"""Report the encoding declared by the meta element
|
||||
|
||||
If the encoding is currently only guessed, then this
|
||||
will read subsequent characters in that encoding.
|
||||
|
||||
If the encoding is not compatible with the guessed encoding
|
||||
and non-US-ASCII characters have been seen, return True indicating
|
||||
parsing will have to begin again.
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
def determineNewLines(self):
|
||||
# Looks through the stream to find where new lines occur so
|
||||
# the position method can tell where it is.
|
||||
self.newLines.append(0)
|
||||
for i in xrange(len(self.dataStream)):
|
||||
if self.dataStream[i] == u"\n":
|
||||
self.newLines.append(i)
|
||||
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
# Generate list of new lines first time around
|
||||
if not self.newLines:
|
||||
self.determineNewLines()
|
||||
|
||||
line = 0
|
||||
tell = self.tell
|
||||
for pos in self.newLines:
|
||||
if pos < tell:
|
||||
line += 1
|
||||
else:
|
||||
break
|
||||
col = tell - self.newLines[line-1] - 1
|
||||
return (line, col)
|
||||
|
||||
def reset(self):
|
||||
"""Resets the position in the stream back to the start."""
|
||||
self.tell = 0
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
if self.queue:
|
||||
return self.queue.pop(0)
|
||||
else:
|
||||
try:
|
||||
self.tell += 1
|
||||
return self.dataStream[self.tell - 1]
|
||||
except:
|
||||
return EOF
|
||||
|
||||
def charsUntil(self, characters, opposite = False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in characters or EOF. characters can be
|
||||
any container that supports the in method being called on it.
|
||||
"""
|
||||
charStack = [self.char()]
|
||||
|
||||
# First from the queue
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
||||
and self.queue:
|
||||
charStack.append(self.queue.pop(0))
|
||||
|
||||
# Then the rest
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||
try:
|
||||
self.tell += 1
|
||||
charStack.append(self.dataStream[self.tell - 1])
|
||||
except:
|
||||
charStack.append(EOF)
|
||||
|
||||
# Put the character stopped on back to the front of the queue
|
||||
# from where it came.
|
||||
self.queue.insert(0, charStack.pop())
|
||||
return "".join(charStack)
|
||||
|
||||
if __name__ == "__main__":
|
||||
stream = HTMLInputStream("../tests/utf-8-bom.html")
|
||||
|
||||
c = stream.char()
|
||||
while c:
|
||||
line, col = stream.position()
|
||||
if c == u"\n":
|
||||
print "Line %s, Column %s: Line Feed" % (line, col)
|
||||
else:
|
||||
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
|
||||
c = stream.char()
|
||||
print "EOF"
|
planet/html5lib/liberalxmlparser.py (new file, 106 lines)

@@ -0,0 +1,106 @@
|
||||
"""
|
||||
Warning: this module is experimental and subject to change and even removal
|
||||
at any time.
|
||||
|
||||
For background/rationale, see:
|
||||
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||
* http://tinyurl.com/ylfj8k (and follow-ups)
|
||||
|
||||
References:
|
||||
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
|
||||
@@TODO:
|
||||
* Build a Treebuilder that produces Python DOM objects:
|
||||
http://docs.python.org/lib/module-xml.dom.html
|
||||
* Produce SAX events based on the produced DOM. This is intended not to
|
||||
support streaming, but rather to support application level compatibility.
|
||||
* Optional namespace support
|
||||
* Special case the output of XHTML <script> elements so that the empty
|
||||
element syntax is never used, even when the src attribute is provided.
|
||||
Also investigate the use of <![CDATA[...]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
|
||||
* Map illegal XML characters to U+FFFD, possibly with additional markup in
|
||||
the case of XHTML
|
||||
* Selectively lowercase only XHTML, but not foreign markup
|
||||
"""
|
||||
|
||||
import html5parser
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
class XHTMLParser(html5parser.HTMLParser):
|
||||
""" liberal XMTHML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
# AT When Python 2.4 is widespread we should use
|
||||
# dict(reversed(token.data))
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
return token
|
||||
|
||||
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
def insertHtmlElement(self):
|
||||
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
||||
self.tree.openElements.append(element)
|
||||
self.tree.document.appendChild(element)
|
||||
self.parser.phase = self.parser.phases["beforeHead"]
|
||||
|
||||
class XMLParser(XHTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
XHTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
class XmlRootPhase(html5parser.Phase):
|
||||
""" Prime the Xml parser """
|
||||
def __getattr__(self, name):
|
||||
self.tree.openElements.append(self.tree.document)
|
||||
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
||||
return getattr(self.parser.phase, name)
|
||||
|
||||
class XmlElementPhase(html5parser.Phase):
|
||||
""" Generic handling for all XML elements """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.Phase.__init__(self, *args, **kwargs)
|
||||
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
|
||||
def endTagOther(self, name):
|
||||
for node in self.tree.openElements[::-1]:
|
||||
if node.name == name:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
"."))
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
self.parser.parseError()
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.tree.insertText(data)
|
planet/html5lib/tokenizer.py (new file, 745 lines)

@@ -0,0 +1,745 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from constants import contentModelFlags, spaceCharacters
|
||||
from constants import entitiesWindows1252, entities
|
||||
from constants import asciiLowercase, asciiLetters
|
||||
from constants import digits, hexDigits, EOF
|
||||
|
||||
from inputstream import HTMLInputStream
|
||||
|
||||
class HTMLTokenizer(object):
|
||||
""" This class takes care of tokenizing HTML.
|
||||
|
||||
* self.currentToken
|
||||
Holds the token that is currently being processed.
|
||||
|
||||
* self.state
|
||||
Holds a reference to the method to be invoked... XXX
|
||||
|
||||
* self.states
|
||||
Holds a mapping between states and methods that implement the state.
|
||||
|
||||
* self.stream
|
||||
Points to HTMLInputStream object.
|
||||
"""
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def __init__(self, stream, encoding=None):
|
||||
self.stream = HTMLInputStream(stream, encoding)
|
||||
|
||||
self.states = {
|
||||
"data":self.dataState,
|
||||
"entityData":self.entityDataState,
|
||||
"tagOpen":self.tagOpenState,
|
||||
"closeTagOpen":self.closeTagOpenState,
|
||||
"tagName":self.tagNameState,
|
||||
"beforeAttributeName":self.beforeAttributeNameState,
|
||||
"attributeName":self.attributeNameState,
|
||||
"afterAttributeName":self.afterAttributeNameState,
|
||||
"beforeAttributeValue":self.beforeAttributeValueState,
|
||||
"attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
|
||||
"attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
|
||||
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
|
||||
"bogusComment":self.bogusCommentState,
|
||||
"markupDeclarationOpen":self.markupDeclarationOpenState,
|
||||
"comment":self.commentState,
|
||||
"commentDash":self.commentDashState,
|
||||
"commentEnd":self.commentEndState,
|
||||
"doctype":self.doctypeState,
|
||||
"beforeDoctypeName":self.beforeDoctypeNameState,
|
||||
"doctypeName":self.doctypeNameState,
|
||||
"afterDoctypeName":self.afterDoctypeNameState,
|
||||
"bogusDoctype":self.bogusDoctypeState
|
||||
}
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
self.state = self.states["data"]
|
||||
|
||||
# The current token being created
|
||||
self.currentToken = None
|
||||
|
||||
# Tokens to be processed.
|
||||
self.tokenQueue = []
|
||||
|
||||
def __iter__(self):
|
||||
""" This is where the magic happens.
|
||||
|
||||
We do our usual processing through the states and when we have a token
|
||||
to return we yield the token which pauses processing until the next token
|
||||
is requested.
|
||||
"""
|
||||
self.stream.reset()
|
||||
self.tokenQueue = []
|
||||
# Start processing. When EOF is reached self.state will return False
|
||||
# instead of True and the loop will terminate.
|
||||
while self.state():
|
||||
while self.tokenQueue:
|
||||
yield self.tokenQueue.pop(0)
|
||||
|
||||
# Below are various helper functions the tokenizer states use worked out.
|
||||
def processSolidusInTag(self):
|
||||
"""If the next character is a '>', convert the currentToken into
|
||||
an EmptyTag
|
||||
"""
|
||||
|
||||
# We need to consume another character to make sure it's a ">"
|
||||
data = self.stream.char()
|
||||
|
||||
if self.currentToken["type"] == "StartTag" and data == u">":
|
||||
self.currentToken["type"] = "EmptyTag"
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Solidus (/) incorrectly placed in tag.")})
|
||||
|
||||
# The character we just consumed need to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
self.stream.queue.append(data)
|
||||
|
||||
def consumeNumberEntity(self, isHex):
|
||||
"""This function returns either U+FFFD or the character based on the
|
||||
decimal or hexadecimal representation. It also discards ";" if present.
|
||||
If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
|
||||
"""
|
||||
|
||||
allowed = digits
|
||||
radix = 10
|
||||
if isHex:
|
||||
allowed = hexDigits
|
||||
radix = 16
|
||||
|
||||
char = u"\uFFFD"
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
# don't hit an EOF.
|
||||
c = self.stream.char()
|
||||
while c in allowed and c is not EOF:
|
||||
charStack.append(c)
|
||||
c = self.stream.char()
|
||||
|
||||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = int("".join(charStack), radix)
|
||||
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
|
||||
# smaller) we need to do the "windows trick".
|
||||
if 127 < charAsInt < 160:
|
||||
#XXX - removed parse error from windows 1252 entity for now
|
||||
#we may want to reenable this later
|
||||
#self.tokenQueue.append({"type": "ParseError", "data":
|
||||
# _("Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||
|
||||
# 0 is not a good number.
|
||||
if charAsInt == 0:
|
||||
charAsInt = 65533
|
||||
|
||||
try:
|
||||
# XXX We should have a separate function that does "int" to
|
||||
# "unicodestring" conversion since this doesn't always work
|
||||
# according to hsivonen. Also, unichr has a limitation of 65535
|
||||
char = unichr(charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
if c != u";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
self.stream.queue.append(c)
|
||||
|
||||
return char
|
||||
|
||||
def consumeEntity(self):
|
||||
char = None
|
||||
charStack = [self.stream.char()]
|
||||
if charStack[0] == u"#":
|
||||
# We might have a number entity here.
|
||||
charStack.extend([self.stream.char(), self.stream.char()])
|
||||
if EOF in charStack:
|
||||
# If we reach the end of the file put everything up to EOF
|
||||
# back in the queue
|
||||
charStack = charStack[:charStack.index(EOF)]
|
||||
self.stream.queue.extend(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
else:
|
||||
if charStack[1].lower() == u"x" \
|
||||
and charStack[2] in hexDigits:
|
||||
# Hexadecimal entity detected.
|
||||
self.stream.queue.append(charStack[2])
|
||||
char = self.consumeNumberEntity(True)
|
||||
elif charStack[1] in digits:
|
||||
# Decimal entity detected.
|
||||
self.stream.queue.extend(charStack[1:])
|
||||
char = self.consumeNumberEntity(False)
|
||||
else:
|
||||
# No number entity detected.
|
||||
self.stream.queue.extend(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected but none found.")})
|
||||
# Break out if we reach the end of the file
|
||||
elif charStack[0] == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Entity expected. Got end of file instead.")})
|
||||
else:
|
||||
# At this point in the process might have named entity. Entities
|
||||
# are stored in the global variable "entities".
|
||||
#
|
||||
# Consume characters and compare to these to a substring of the
|
||||
# entity names in the list until the substring no longer matches.
|
||||
filteredEntityList = [e for e in entities if \
|
||||
e.startswith(charStack[0])]
|
||||
|
||||
def entitiesStartingWith(name):
|
||||
return [e for e in filteredEntityList if e.startswith(name)]
|
||||
|
||||
while charStack[-1] != EOF and\
|
||||
entitiesStartingWith("".join(charStack)):
|
||||
charStack.append(self.stream.char())
|
||||
|
||||
# At this point we have a string that starts with some characters
|
||||
# that may match an entity
|
||||
entityName = None
|
||||
|
||||
# Try to find the longest entity the string will match
|
||||
for entityLength in xrange(len(charStack)-1,1,-1):
|
||||
possibleEntityName = "".join(charStack[:entityLength])
|
||||
if possibleEntityName in entities:
|
||||
entityName = possibleEntityName
|
||||
break
|
||||
|
||||
if entityName is not None:
|
||||
char = entities[entityName]
|
||||
|
||||
# Check whether or not the last character returned can be
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity did not ';'.")})
|
||||
self.stream.queue.extend(charStack[entityLength:])
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity expected. Got none.")})
|
||||
self.stream.queue.extend(charStack)
|
||||
return char
|
||||
|
||||
def processEntityInAttribute(self):
|
||||
"""This method replaces the need for "entityInAttributeValueState".
|
||||
"""
|
||||
entity = self.consumeEntity()
|
||||
if entity:
|
||||
self.currentToken["data"][-1][1] += entity
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += u"&"
|
||||
|
||||
def emitCurrentToken(self):
|
||||
"""This method is a generic handler for emitting the StartTag,
|
||||
EndTag, Comment and Doctype. It also sets the state to
|
||||
"data" because that's what's needed after a token has been emitted.
|
||||
"""
|
||||
|
||||
# Although isinstance() is http://www.canonical.org/~kragen/isinstance/
|
||||
# considered harmful it should be ok here given that the classes are for
|
||||
# internal usage.
|
||||
|
||||
token = self.currentToken
|
||||
|
||||
# If an end tag has attributes it's a parse error and they should
|
||||
# be removed
|
||||
if token["type"] == "EndTag" and token["data"]:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("End tag contains unexpected attributes.")})
|
||||
token["data"] = {}
|
||||
|
||||
# Add token to the queue to be yielded
|
||||
self.tokenQueue.append(token)
|
||||
self.state = self.states["data"]
|
||||
|
||||
def emitCurrentTokenWithParseError(self, data=None):
|
||||
# XXX if we want useful error messages we need to inline this method
|
||||
"""This method is equivalent to emitCurrentToken (well, it invokes it)
|
||||
except that it also puts "data" back on the characters queue if a data
|
||||
argument is provided and it throws a parse error."""
|
||||
if data:
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("XXX Something is wrong with the emitted token.")})
|
||||
self.emitCurrentToken()
|
||||
|
||||
def attributeValueQuotedStateHandler(self, quoteType):
|
||||
data = self.stream.char()
|
||||
if data == quoteType:
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
|
||||
(quoteType, u"&"))
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||
# documents to figure out what the order of the various if and elif
|
||||
# statements should be.
|
||||
|
||||
def dataState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"&" and self.contentModelFlag in\
|
||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
|
||||
self.state = self.states["entityData"]
|
||||
elif data == u"<" and self.contentModelFlag !=\
|
||||
contentModelFlags["PLAINTEXT"]:
|
||||
self.state = self.states["tagOpen"]
|
||||
elif data == EOF:
|
||||
# Tokenization ends.
|
||||
return False
|
||||
elif data in spaceCharacters:
|
||||
# Directly after emitting a token you switch back to the "data
|
||||
# state". At that point spaceCharacters are important so they are
|
||||
# emitted separately.
|
||||
# XXX need to check if we don't need a special "spaces" flag on
|
||||
# characters.
|
||||
self.tokenQueue.append({"type": "SpaceCharacters", "data":
|
||||
data + self.stream.charsUntil(spaceCharacters, True)})
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data":
|
||||
data + self.stream.charsUntil((u"&", u"<"))})
|
||||
return True
|
||||
|
||||
def entityDataState(self):
|
||||
entity = self.consumeEntity()
|
||||
if entity:
|
||||
self.tokenQueue.append({"type": "Characters", "data": entity})
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"&"})
|
||||
self.state = self.states["data"]
|
||||
return True
|
||||
|
||||
def tagOpenState(self):
|
||||
data = self.stream.char()
|
||||
if self.contentModelFlag == contentModelFlags["PCDATA"]:
|
||||
if data == u"!":
|
||||
self.state = self.states["markupDeclarationOpen"]
|
||||
elif data == u"/":
|
||||
self.state = self.states["closeTagOpen"]
|
||||
elif data in asciiLetters:
|
||||
self.currentToken =\
|
||||
{"type": "StartTag", "name": data, "data": []}
|
||||
self.state = self.states["tagName"]
|
||||
elif data == u">":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '>' instead.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<>"})
|
||||
self.state = self.states["data"]
|
||||
elif data == u"?":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got something else instead")})
|
||||
# XXX can't we do "<" + data here?
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# We know the content model flag is set to either RCDATA or CDATA
|
||||
# now because this state can never be entered with the PLAINTEXT
|
||||
# flag.
|
||||
if data == u"/":
|
||||
self.state = self.states["closeTagOpen"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["data"]
|
||||
return True
|
||||
|
||||
def closeTagOpenState(self):
|
||||
if self.contentModelFlag in (contentModelFlags["RCDATA"],\
|
||||
contentModelFlags["CDATA"]):
|
||||
charStack = []
|
||||
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken. We also need
|
||||
# to have the character directly after the characters that could
|
||||
# match the start tag name.
|
||||
for x in xrange(len(self.currentToken["name"]) + 1):
|
||||
charStack.append(self.stream.char())
|
||||
# Make sure we don't get hit by EOF
|
||||
if charStack[-1] == EOF:
|
||||
break
|
||||
|
||||
# Since this is just for checking. We put the characters back on
|
||||
# the stack.
|
||||
self.stream.queue.extend(charStack)
|
||||
|
||||
if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
|
||||
and charStack[-1] in (spaceCharacters |
|
||||
frozenset((u">", u"/", u"<", EOF))):
|
||||
# Because the characters are correct we can safely switch to
|
||||
# PCDATA mode now. This also means we don't have to do it when
|
||||
# emitting the end tag token.
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag after seeing '</'. None found.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
|
||||
# Need to return here since we don't want the rest of the
|
||||
# method to be walked through.
|
||||
return True
|
||||
|
||||
if self.contentModelFlag == contentModelFlags["PCDATA"]:
|
||||
data = self.stream.char()
|
||||
if data in asciiLetters:
|
||||
self.currentToken =\
|
||||
{"type": "EndTag", "name": data, "data": []}
|
||||
self.state = self.states["tagName"]
|
||||
elif data == u">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX data can be '...
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
|
||||
def tagNameState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data in asciiLetters:
|
||||
self.currentToken["name"] += data +\
|
||||
self.stream.charsUntil(asciiLetters, True)
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
else:
|
||||
self.currentToken["name"] += data
|
||||
return True
|
||||
|
||||
def beforeAttributeNameState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.stream.charsUntil(spaceCharacters, True)
|
||||
elif data in asciiLetters:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
return True
|
||||
|
||||
def attributeNameState(self):
|
||||
data = self.stream.char()
|
||||
leavingThisState = True
|
||||
if data == u"=":
|
||||
self.state = self.states["beforeAttributeValue"]
|
||||
elif data in asciiLetters:
|
||||
self.currentToken["data"][-1][0] += data +\
|
||||
self.stream.charsUntil(asciiLetters, True)
|
||||
leavingThisState = False
|
||||
elif data == u">":
|
||||
# XXX If we emit here the attributes are converted to a dict
|
||||
# without being checked and when the code below runs we error
|
||||
# because data is a dict not a list
|
||||
pass
|
||||
elif data in spaceCharacters:
|
||||
self.state = self.states["afterAttributeName"]
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
leavingThisState = False
|
||||
else:
|
||||
self.currentToken["data"][-1][0] += data
|
||||
leavingThisState = False
|
||||
|
||||
if leavingThisState:
|
||||
# Attributes are not dropped at this stage. That happens when the
|
||||
# start tag token is emitted so values can still be safely appended
|
||||
# to attributes, but we do want to report the parse error in time.
|
||||
for name, value in self.currentToken["data"][:-1]:
|
||||
if self.currentToken["data"][-1][0] == name:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Dropped duplicate attribute on tag.")})
|
||||
# XXX Fix for above XXX
|
||||
if data == u">":
|
||||
self.emitCurrentToken()
|
||||
return True
|
||||
|
||||
def afterAttributeNameState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.stream.charsUntil(spaceCharacters, True)
|
||||
elif data == u"=":
|
||||
self.state = self.states["beforeAttributeValue"]
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data in asciiLetters:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
return True
|
||||
|
||||
def beforeAttributeValueState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.stream.charsUntil(spaceCharacters, True)
|
||||
elif data == u"\"":
|
||||
self.state = self.states["attributeValueDoubleQuoted"]
|
||||
elif data == u"&":
|
||||
self.state = self.states["attributeValueUnQuoted"]
|
||||
self.stream.queue.append(data);
|
||||
elif data == u"'":
|
||||
self.state = self.states["attributeValueSingleQuoted"]
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data
|
||||
self.state = self.states["attributeValueUnQuoted"]
|
||||
return True
|
||||
|
||||
def attributeValueDoubleQuotedState(self):
|
||||
# AT We could also let self.attributeValueQuotedStateHandler always
|
||||
# return true and then return that directly here. Not sure what is
|
||||
# faster or better...
|
||||
self.attributeValueQuotedStateHandler(u"\"")
|
||||
return True
|
||||
|
||||
def attributeValueSingleQuotedState(self):
|
||||
self.attributeValueQuotedStateHandler(u"'")
|
||||
return True
|
||||
|
||||
def attributeValueUnQuotedState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute()
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
|
||||
frozenset(("&", ">","<")) | spaceCharacters)
|
||||
return True
|
||||
|
||||
def bogusCommentState(self):
|
||||
# Make a new comment token and give it as value all the characters
|
||||
# until the first > or EOF (charsUntil checks for EOF automatically)
|
||||
# and emit it.
|
||||
self.tokenQueue.append(
|
||||
{"type": "Comment", "data": self.stream.charsUntil((u">"))})
|
||||
|
||||
# Eat the character directly after the bogus comment which is either a
|
||||
# ">" or an EOF.
|
||||
self.stream.char()
|
||||
self.state = self.states["data"]
|
||||
return True
|
||||
|
||||
def markupDeclarationOpenState(self):
|
||||
charStack = [self.stream.char(), self.stream.char()]
|
||||
if charStack == [u"-", u"-"]:
|
||||
self.currentToken = {"type": "Comment", "data": ""}
|
||||
self.state = self.states["comment"]
|
||||
else:
|
||||
for x in xrange(5):
|
||||
charStack.append(self.stream.char())
|
||||
# Put in explicit EOF check
|
||||
if (not EOF in charStack and
|
||||
"".join(charStack).upper() == u"DOCTYPE"):
|
||||
self.currentToken =\
|
||||
{"type": "Doctype", "name": "", "data": True}
|
||||
self.state = self.states["doctype"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
self.stream.queue.extend(charStack)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
|
||||
def commentState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"-":
|
||||
self.state = self.states["commentDash"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
else:
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
return True
|
||||
|
||||
def commentDashState(self):
|
||||
data = self.stream.char()
|
||||
if data == u"-":
|
||||
self.state = self.states["commentEnd"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
else:
|
||||
self.currentToken["data"] += u"-" + data +\
|
||||
self.stream.charsUntil(u"-")
|
||||
# Consume the next character which is either a "-" or an EOF as
|
||||
# well so if there's a "-" directly after the "-" we go nicely to
|
||||
# the "comment end state" without emitting a ParseError() there.
|
||||
self.stream.char()
|
||||
return True
|
||||
|
||||
def commentEndState(self):
|
||||
data = self.stream.char()
|
||||
if data == u">":
|
||||
# XXX EMIT
|
||||
self.emitCurrentToken()
|
||||
elif data == u"-":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected '-' after '--' found in comment.")})
|
||||
self.currentToken["data"] += data
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in comment found.")})
|
||||
self.currentToken["data"] += u"--" + data
|
||||
self.state = self.states["comment"]
|
||||
return True
|
||||
|
||||
def doctypeState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
self.state = self.states["beforeDoctypeName"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["beforeDoctypeName"]
|
||||
return True
|
||||
|
||||
def beforeDoctypeNameState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data in asciiLowercase:
|
||||
self.currentToken["name"] = data.upper()
|
||||
self.state = self.states["doctypeName"]
|
||||
elif data == u">":
|
||||
# Character needs to be consumed per the specification so don't
|
||||
# invoke emitCurrentTokenWithParseError with "data" as argument.
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
else:
|
||||
self.currentToken["name"] = data
|
||||
self.state = self.states["doctypeName"]
|
||||
return True
|
||||
|
||||
def doctypeNameState(self):
|
||||
data = self.stream.char()
|
||||
needsDoctypeCheck = False
|
||||
if data in spaceCharacters:
|
||||
self.state = self.states["afterDoctypeName"]
|
||||
needsDoctypeCheck = True
|
||||
elif data == u">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
else:
|
||||
# We can't just uppercase everything that arrives here. For
|
||||
# instance, non-ASCII characters.
|
||||
if data in asciiLowercase:
|
||||
data = data.upper()
|
||||
self.currentToken["name"] += data
|
||||
needsDoctypeCheck = True
|
||||
|
||||
# After some iterations through this state it should eventually say
|
||||
# "HTML". Otherwise there's an error.
|
||||
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
|
||||
self.currentToken["data"] = False
|
||||
return True
|
||||
|
||||
def afterDoctypeNameState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == u">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.currentToken["data"] = True
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
self.currentToken["data"] = True
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
def bogusDoctypeState(self):
|
||||
data = self.stream.char()
|
||||
if data == u">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
pass
|
||||
return True
|
36
planet/html5lib/treebuilders/__init__.py
Executable file
@ -0,0 +1,36 @@
|
||||
"""A collection of modules for building different kinds of tree from
|
||||
HTML documents.
|
||||
|
||||
To create a treebuilder for a new type of tree, you need to
|
||||
implement several things:
|
||||
|
||||
1) A set of classes for various types of elements: Document, Doctype,
|
||||
Comment, Element. These must implement the interface of
|
||||
treebuilders._base.Node (although comment nodes have a different
|
||||
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||
Textual content may also be implemented as another node type, or not, as
|
||||
your tree implementation requires.
|
||||
|
||||
2) A treebuilder object (called TreeBuilder by convention) that
|
||||
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
It also has one required method:
|
||||
getDocument - Returns the root node of the complete document tree
|
||||
|
||||
3) If you wish to run the unit tests, you must also create a
|
||||
testSerializer method on your treebuilder which accepts a node and
|
||||
returns a string containing the node and its children serialized according
|
||||
to the format used in the unittests
|
||||
|
||||
The supplied simpletree module provides a python-only implementation
|
||||
of a full treebuilder and is a useful reference for the semantics of
|
||||
the various methods.
|
||||
"""
|
||||
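# A minimal sketch of what a new treebuilder might look like, following the
# interface described above. It is illustrative only: class names other than
# the four required attributes are assumptions, not part of this package.
#
#   import _base
#
#   class Element(_base.Node):
#       def __init__(self, name):
#           _base.Node.__init__(self, name)
#       def appendChild(self, node):
#           self.childNodes.append(node)
#           node.parent = self
#       # ... insertText, insertBefore, removeChild, cloneNode, hasContent ...
#
#   class TreeBuilder(_base.TreeBuilder):
#       documentClass = Document      # bottommost node of a document
#       elementClass  = Element       # HTML elements
#       commentClass  = Comment       # comments
#       doctypeClass  = DocumentType  # doctypes
#
#       def getDocument(self):
#           return self.document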
|
||||
import os.path
|
||||
__path__.append(os.path.dirname(__path__[0]))
|
||||
|
||||
import dom, etree, simpletree
|
312
planet/html5lib/treebuilders/_base.py
Executable file
@ -0,0 +1,312 @@
|
||||
from constants import scopingElements, tableInsertModeElements
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||
Marker = None
|
||||
|
||||
#XXX - TODO; make the default interface more ElementTree-like
|
||||
# rather than DOM-like
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, name):
|
||||
"""Node representing an item in the tree.
|
||||
name - The tag name associated with the node
|
||||
parent - The parent of the current node (or None for the document node)
|
||||
value - The value of the current node (applies to text nodes and
|
||||
comments)
|
||||
attributes - a dict holding name, value pairs for attributes of the node
|
||||
childNodes - a list of child nodes of the current node. This must
|
||||
include all elements but not necessarily other node types
|
||||
_flags - A list of miscellaneous flags that can be set on the node
|
||||
"""
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self.value = None
|
||||
self.attributes = {}
|
||||
self.childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def __unicode__(self):
|
||||
attributesStr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in
|
||||
self.attributes.iteritems()])
|
||||
if attributesStr:
|
||||
return "<%s %s>"%(self.name,attributesStr)
|
||||
else:
|
||||
return "<%s>"%(self.name)
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s %s>" % (self.__class__, self.name)
|
||||
|
||||
def appendChild(self, node):
|
||||
"""Insert node as a child of the current node
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
"""Insert data as text in the current node, positioned before the
|
||||
start of node insertBefore or to the end of the node's text.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
"""Insert node as a child of the current node, before refNode in the
|
||||
list of child nodes. Raises ValueError if refNode is not a child of
|
||||
the current node"""
|
||||
raise NotImplementedError
|
||||
|
||||
def removeChild(self, node):
|
||||
"""Remove node from the children of the current node
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
"""Move all the children of the current node to newParent.
|
||||
This is needed so that trees that don't store text as nodes move the
|
||||
text in the correct way
|
||||
"""
|
||||
#XXX - should this method be made more general?
|
||||
for child in self.childNodes:
|
||||
newParent.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def cloneNode(self):
|
||||
"""Return a shallow copy of the current node i.e. a node with the same
|
||||
name and attributes but with no parent or child nodes
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text, false otherwise
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Base treebuilder implementation
|
||||
documentClass - the class to use for the bottommost node of a document
|
||||
elementClass - the class to use for HTML Elements
|
||||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
"""
|
||||
|
||||
#Document class
|
||||
documentClass = None
|
||||
|
||||
#The class to use for creating a node
|
||||
elementClass = None
|
||||
|
||||
#The class to use for creating comments
|
||||
commentClass = None
|
||||
|
||||
#The class to use for creating doctypes
|
||||
doctypeClass = None
|
||||
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.openElements = []
|
||||
self.activeFormattingElements = []
|
||||
|
||||
#XXX - rename these to headElement, formElement
|
||||
self.headPointer = None
|
||||
self.formPointer = None
|
||||
|
||||
self.insertFromTable = False
|
||||
|
||||
self.document = self.documentClass()
|
||||
|
||||
def elementInScope(self, target, tableVariant=False):
|
||||
# Exit early when possible.
|
||||
if self.openElements[-1].name == target:
|
||||
return True
|
||||
|
||||
# AT Use reverse instead of [::-1] when we can rely on Python 2.4
|
||||
# AT How about while True and simply set node to [-1] and set it to
|
||||
# [-2] at the end...
|
||||
for node in self.openElements[::-1]:
|
||||
if node.name == target:
|
||||
return True
|
||||
elif node.name == "table":
|
||||
return False
|
||||
elif not tableVariant and node.name in scopingElements:
|
||||
return False
|
||||
elif node.name == "html":
|
||||
return False
|
||||
assert False # We should never reach this point
|
||||
|
||||
def reconstructActiveFormattingElements(self):
|
||||
# Within this algorithm the order of steps described in the
|
||||
# specification is not quite the same as the order of steps in the
|
||||
# code. It should still do the same though.
|
||||
|
||||
# Step 1: stop the algorithm when there's nothing to do.
|
||||
if not self.activeFormattingElements:
|
||||
return
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = -1
|
||||
entry = self.activeFormattingElements[i]
|
||||
if entry == Marker or entry in self.openElements:
|
||||
return
|
||||
|
||||
# Step 6
|
||||
while entry != Marker and entry not in self.openElements:
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
i -= 1
|
||||
try:
|
||||
entry = self.activeFormattingElements[i]
|
||||
except:
|
||||
# Step 4: at this point we need to jump to step 8. By not doing
|
||||
# i += 1 which is also done in step 7 we achieve that.
|
||||
break
|
||||
while True:
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
clone = self.activeFormattingElements[i].cloneNode()
|
||||
|
||||
# Step 9
|
||||
element = self.insertElement(clone.name, clone.attributes)
|
||||
|
||||
# Step 10
|
||||
self.activeFormattingElements[i] = element
|
||||
|
||||
# Step 11
|
||||
if element == self.activeFormattingElements[-1]:
|
||||
break
|
||||
|
||||
def clearActiveFormattingElements(self):
|
||||
entry = self.activeFormattingElements.pop()
|
||||
while self.activeFormattingElements and entry != Marker:
|
||||
entry = self.activeFormattingElements.pop()
|
||||
|
||||
def elementInActiveFormattingElements(self, name):
|
||||
"""Check if an element exists between the end of the active
|
||||
formatting elements and the last marker. If it does, return it, else
|
||||
return false"""
|
||||
|
||||
for item in self.activeFormattingElements[::-1]:
|
||||
# Check for Marker first because if it's a Marker it doesn't have a
|
||||
# name attribute.
|
||||
if item == Marker:
|
||||
break
|
||||
elif item.name == name:
|
||||
return item
|
||||
return False
|
||||
|
||||
def insertDoctype(self, name):
|
||||
self.document.appendChild(self.doctypeClass(name))
|
||||
|
||||
def insertComment(self, data, parent=None):
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
parent.appendChild(self.commentClass(data))
|
||||
|
||||
def createElement(self, name, attributes):
|
||||
"""Create an element but don't insert it anywhere"""
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
return element
|
||||
|
||||
def _getInsertFromTable(self):
|
||||
return self._insertFromTable
|
||||
|
||||
def _setInsertFromTable(self, value):
|
||||
"""Switch the function used to insert an element from the
|
||||
normal one to the misnested table one and back again"""
|
||||
self._insertFromTable = value
|
||||
if value:
|
||||
self.insertElement = self.insertElementTable
|
||||
else:
|
||||
self.insertElement = self.insertElementNormal
|
||||
|
||||
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||
|
||||
def insertElementNormal(self, name, attributes):
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
self.openElements[-1].appendChild(element)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertElementTable(self, name, attributes):
|
||||
"""Create an element and insert it into the tree"""
|
||||
element = self.elementClass(name)
|
||||
element.attributes = attributes
|
||||
if self.openElements[-1].name not in tableInsertModeElements:
|
||||
return self.insertElementNormal(name, attributes)
|
||||
else:
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
if insertBefore is None:
|
||||
parent.appendChild(element)
|
||||
else:
|
||||
parent.insertBefore(element, insertBefore)
|
||||
self.openElements.append(element)
|
||||
return element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
"""Insert text data."""
|
||||
if parent is None:
|
||||
parent = self.openElements[-1]
|
||||
|
||||
if (not(self.insertFromTable) or (self.insertFromTable and
|
||||
self.openElements[-1].name not in
|
||||
tableInsertModeElements)):
|
||||
parent.insertText(data)
|
||||
else:
|
||||
#We should be in the InTable mode. This means we want to do
|
||||
#special magic element rearranging
|
||||
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||
parent.insertText(data, insertBefore)
|
||||
|
||||
def getTableMisnestedNodePosition(self):
|
||||
"""Get the foster parent element, and sibling to insert before
|
||||
(or None) when inserting a misnested table node"""
|
||||
#The foster parent element is the one which comes before the most
|
||||
#recently opened table element
|
||||
#XXX - this is really inelegant
|
||||
lastTable=None
|
||||
fosterParent = None
|
||||
insertBefore = None
|
||||
for elm in self.openElements[::-1]:
|
||||
if elm.name == u"table":
|
||||
lastTable = elm
|
||||
break
|
||||
if lastTable:
|
||||
#XXX - we should really check that this parent is actually a
|
||||
#node here
|
||||
if lastTable.parent:
|
||||
fosterParent = lastTable.parent
|
||||
insertBefore = lastTable
|
||||
else:
|
||||
fosterParent = self.openElements[
|
||||
self.openElements.index(lastTable) - 1]
|
||||
else:
|
||||
assert self.innerHTML
|
||||
fosterParent = self.openElements[0]
|
||||
return fosterParent, insertBefore
|
||||
|
||||
def generateImpliedEndTags(self, exclude=None):
|
||||
name = self.openElements[-1].name
|
||||
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
||||
and name != exclude):
|
||||
self.openElements.pop()
|
||||
# XXX Until someone has proven that the above breaks stuff I think
|
||||
# we should keep it in.
|
||||
# self.processEndTag(name)
|
||||
self.generateImpliedEndTags(exclude)
|
||||
|
||||
def getDocument(self):
|
||||
"Return the final tree"
|
||||
return self.document
|
||||
|
||||
def testSerializer(self, node):
|
||||
"""Serialize the subtree of node in the format required by unit tests
|
||||
node - the node from which to start serializing"""
|
||||
raise NotImplementedError
|
127
planet/html5lib/treebuilders/dom.py
Executable file
@ -0,0 +1,127 @@
|
||||
import _base
|
||||
from xml.dom import minidom, Node
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
|
||||
class AttrList:
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
def __iter__(self):
|
||||
return self.element.attributes.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||
self.element.setAttribute(name, value)
|
||||
def items(self):
|
||||
return self.element.attributes.items()
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __init__(self, element):
|
||||
_base.Node.__init__(self, element.nodeName)
|
||||
self.element = element
|
||||
|
||||
def appendChild(self, node):
|
||||
node.parent = self
|
||||
self.element.appendChild(node.element)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||
text = self.element.ownerDocument.createTextNode(data)
|
||||
if insertBefore:
|
||||
self.element.insertBefore(text, insertBefore.element)
|
||||
else:
|
||||
self.element.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
self.element.insertBefore(node.element, refNode.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self.element.removeChild(node.element)
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
while self.element.hasChildNodes():
|
||||
child = self.element.firstChild
|
||||
self.element.removeChild(child)
|
||||
newParent.element.appendChild(child)
|
||||
self.childNodes = []
|
||||
|
||||
def getAttributes(self):
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
if attributes:
|
||||
for name, value in attributes.items():
|
||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||
self.element.setAttribute(name, value)
|
||||
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def cloneNode(self):
|
||||
return NodeBuilder(self.element.cloneNode(False))
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.hasChildNodes()
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||
return self
|
||||
|
||||
def doctypeClass(self,name):
|
||||
domimpl = minidom.getDOMImplementation()
|
||||
return NodeBuilder(domimpl.createDocumentType(name,None,None))
|
||||
|
||||
def elementClass(self, name):
|
||||
return NodeBuilder(self.dom.createElement(name))
|
||||
|
||||
def commentClass(self, data):
|
||||
return NodeBuilder(self.dom.createComment(data))
|
||||
|
||||
def appendChild(self, node):
|
||||
self.dom.appendChild(node.element)
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.dom
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||
if parent <> self:
|
||||
_base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
# HACK: allow text nodes as children of the document node
|
||||
if hasattr(self.dom, '_child_node_types'):
|
||||
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||
self.dom._child_node_types=list(self.dom._child_node_types)
|
||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||
self.dom.appendChild(self.dom.createTextNode(data))
|
||||
|
||||
name = None
|
||||
|
||||
def testSerializer(element):
|
||||
element.normalize()
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.COMMENT_NODE:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||
elif element.nodeType == Node.TEXT_NODE:
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
||||
if element.hasAttributes():
|
||||
for name, value in element.attributes.items():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
indent += 2
|
||||
for child in element.childNodes:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
208
planet/html5lib/treebuilders/etree.py
Executable file
@ -0,0 +1,208 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
from elementtree import ElementTree
|
||||
|
||||
import _base
|
||||
|
||||
class Element(_base.Node):
|
||||
def __init__(self, name):
|
||||
self._element = ElementTree.Element(name)
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self._childNodes = []
|
||||
self._flags = []
|
||||
|
||||
#Set the element text and tail to the empty string rather than None
|
||||
#XXX - is this desirable or should we do it on a case by case basis?
|
||||
self._element.text = ""
|
||||
self._element.tail = ""
|
||||
|
||||
def _setName(self, name):
|
||||
self._element.tag = name
|
||||
|
||||
def _getName(self):
|
||||
return self._element.tag
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
def _getAttributes(self):
|
||||
return self._element.attrib
|
||||
|
||||
def _setAttributes(self, attributes):
|
||||
#Delete existing attributes first
|
||||
#XXX - there may be a better way to do this...
|
||||
for key in self._element.attrib.keys():
|
||||
del self._element.attrib[key]
|
||||
for key, value in attributes.iteritems():
|
||||
self._element.set(key, value)
|
||||
|
||||
attributes = property(_getAttributes, _setAttributes)
|
||||
|
||||
def _getChildNodes(self):
|
||||
return self._childNodes
|
||||
|
||||
def _setChildNodes(self, value):
|
||||
del self._element[:]
|
||||
self._childNodes = []
|
||||
for element in value:
|
||||
self.insertChild(element)
|
||||
|
||||
childNodes = property(_getChildNodes, _setChildNodes)
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self._element.text or self._element.getchildren())
|
||||
|
||||
def appendChild(self, node):
|
||||
self._childNodes.append(node)
|
||||
self._element.append(node._element)
|
||||
node.parent = self
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self._element.getchildren().index(refNode._element)
|
||||
self._element.insert(index, node._element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._element.remove(node._element)
|
||||
node.parent=None
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if not(len(self._element)):
|
||||
self._element.text += data
|
||||
elif insertBefore is None:
|
||||
#Insert the text as the tail of the last child element
|
||||
self._element[-1].tail += data
|
||||
else:
|
||||
#Insert the text before the specified node
|
||||
children = self._element.getchildren()
|
||||
index = children.index(insertBefore._element)
|
||||
if index > 0:
|
||||
self._element[index-1].tail += data
|
||||
else:
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = Element(self.name)
|
||||
element.attributes = self.attributes
|
||||
return element
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
if newParent.childNodes:
|
||||
newParent.childNodes[-1]._element.tail += self._element.text
|
||||
else:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
Element.__init__(self, Comment)
|
||||
self._element.text = data
|
||||
|
||||
def _getData(self):
|
||||
return self._element.text
|
||||
|
||||
def _setData(self, value):
|
||||
self._element.text = value
|
||||
|
||||
data = property(_getData, _setData)
|
||||
|
||||
class DocumentType(Element):
|
||||
def __init__(self, name):
|
||||
Element.__init__(self, DocumentType)
|
||||
self._element.text = name
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
Element.__init__(self, Document)
|
||||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element, indent=0):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||
elif element.tag is Document:
|
||||
rv.append("#document")
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
elif element.tag is Comment:
|
||||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||
if hasattr(element, "attrib"):
|
||||
for name, value in element.attrib.iteritems():
|
||||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||
if element.text:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||
indent += 2
|
||||
for child in element.getchildren():
|
||||
serializeElement(child, indent)
|
||||
if element.tail:
|
||||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
def serializeElement(element):
|
||||
if element.tag is DocumentType:
|
||||
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||
elif element.tag is Document:
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
if element.tail:
|
||||
finalText = element.tail
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
elif element.tag is Comment:
|
||||
rv.append("<!--%s-->"%(element.text,))
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
if not element.attrib:
|
||||
rv.append("<%s>"%(element.tag,))
|
||||
else:
|
||||
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||
for name, value in element.attrib.iteritems()])
|
||||
rv.append("<%s %s>"%(element.tag, attr))
|
||||
if element.text:
|
||||
rv.append(element.text)
|
||||
|
||||
for child in element.getchildren():
|
||||
serializeElement(child)
|
||||
|
||||
rv.append("</%s>"%(element.tag,))
|
||||
|
||||
if element.tail:
|
||||
rv.append(element.tail)
|
||||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\""%(' '*2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = Comment
|
||||
|
||||
def testSerializer(self, element):
|
||||
return testSerializer(element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.document._element
|
153
planet/html5lib/treebuilders/simpletree.py
Executable file
@ -0,0 +1,153 @@
|
||||
import _base
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
class Node(_base.Node):
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.parent = None
|
||||
self.value = None
|
||||
self.childNodes = []
|
||||
self._flags = []
|
||||
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s %s>" % (self.__class__, self.name)
|
||||
|
||||
def printTree(self, indent=0):
|
||||
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent + 2)
|
||||
return tree
|
||||
|
||||
def appendChild(self, node, index=None):
|
||||
if (isinstance(node, TextNode) and self.childNodes and
|
||||
isinstance(self.childNodes[-1], TextNode)):
|
||||
self.childNodes[-1].value += node.value
|
||||
else:
|
||||
self.childNodes.append(node)
|
||||
node.parent = self
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
if insertBefore is None:
|
||||
self.appendChild(TextNode(data))
|
||||
else:
|
||||
self.insertBefore(TextNode(data), insertBefore)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.childNodes.index(refNode)
|
||||
if (isinstance(node, TextNode) and index > 0 and
|
||||
isinstance(self.childNodes[index - 1], TextNode)):
|
||||
self.childNodes[index - 1].value += node.value
|
||||
else:
|
||||
self.childNodes.insert(index, node)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
try:
|
||||
self.childNodes.remove(node)
|
||||
except:
|
||||
# XXX
|
||||
raise
|
||||
node.parent = None
|
||||
|
||||
def cloneNode(self):
|
||||
newNode = type(self)(self.name)
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
newNode.value = self.value
|
||||
return newNode
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
return bool(self.childNodes)
|
||||
|
||||
class Document(Node):
|
||||
def __init__(self):
|
||||
Node.__init__(self, None)
|
||||
|
||||
def __unicode__(self):
|
||||
return "#document"
|
||||
|
||||
def printTree(self):
|
||||
tree = unicode(self)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(2)
|
||||
return tree
|
||||
|
||||
def toxml(self, encoding="utf-8"):
|
||||
result = ''
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
return result.encode(encoding)
|
||||
|
||||
class DocumentType(Node):
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
|
||||
def __unicode__(self):
|
||||
return "<!DOCTYPE %s>" % self.name
|
||||
|
||||
class TextNode(Node):
|
||||
def __init__(self, value):
|
||||
Node.__init__(self, None)
|
||||
self.value = value
|
||||
|
||||
def __unicode__(self):
|
||||
return "\"%s\"" % self.value
|
||||
|
||||
def toxml(self):
|
||||
return escape(self.value)
|
||||
|
||||
class Element(Node):
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
self.attributes = {}
|
||||
|
||||
def __unicode__(self):
|
||||
return "<%s>" % self.name
|
||||
|
||||
def printTree(self, indent):
|
||||
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||
indent += 2
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent)
|
||||
return tree
|
||||
|
||||
def toxml(self):
|
||||
result = '<' + self.name
|
||||
if self.attributes:
|
||||
for name,value in self.attributes.iteritems():
|
||||
result += ' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
|
||||
if self.childNodes:
|
||||
result += '>'
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
result += '</%s>' % self.name
|
||||
else:
|
||||
result += '/>'
|
||||
return result
|
||||
|
||||
class CommentNode(Node):
|
||||
def __init__(self, data):
|
||||
Node.__init__(self, None)
|
||||
self.data = data
|
||||
|
||||
def __unicode__(self):
|
||||
return "<!-- %s -->" % self.data
|
||||
|
||||
toxml = __unicode__
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
commentClass = CommentNode
|
||||
|
||||
def testSerializer(self, node):
|
||||
return node.printTree()
|
36
planet/html5lib/utils.py
Normal file
@ -0,0 +1,36 @@
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
#Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
class MethodDispatcher(dict):
|
||||
"""Dict with 2 special properties:
|
||||
|
||||
On initiation, keys that are lists, sets or tuples are converted to
|
||||
multiple keys so accessing any one of the items in the original
|
||||
list-like object returns the matching value
|
||||
|
||||
md = MethodDispatcher([(("foo", "bar"), "baz")])
|
||||
md["foo"] == "baz"
|
||||
|
||||
A default value which can be set through the default attribute.
|
||||
"""
|
||||
|
||||
def __init__(self, items=()):
|
||||
# Using _dictEntries instead of directly assigning to self is about
|
||||
# twice as fast. Please do careful performance testing before changing
|
||||
# anything here.
|
||||
_dictEntries = []
|
||||
for name,value in items:
|
||||
if type(name) in (list, tuple, frozenset, set):
|
||||
for item in name:
|
||||
_dictEntries.append((item, value))
|
||||
else:
|
||||
_dictEntries.append((name, value))
|
||||
dict.__init__(self, _dictEntries)
|
||||
self.default = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
return dict.get(self, key, self.default)
|
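# Illustrative usage of MethodDispatcher (the handler values here are
# arbitrary strings chosen for the example, not names from this package):
#
#   handlers = MethodDispatcher([
#       (("b", "i", "em", "strong"), "formatting"),
#       ("table", "table"),
#   ])
#   handlers.default = "other"
#   assert handlers["em"] == "formatting"   # any key from the tuple matches
#   assert handlers["div"] == "other"       # unknown keys fall back to default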
896
planet/httplib2/__init__.py
Normal file
@ -0,0 +1,896 @@
|
||||
from __future__ import generators
|
||||
"""
|
||||
httplib2
|
||||
|
||||
A caching http interface that supports ETags and gzip
|
||||
to conserve bandwidth.
|
||||
|
||||
Requires Python 2.3 or later
|
||||
|
||||
"""
|
||||
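# A minimal usage sketch (the cache directory name is arbitrary; the Http
# class and its request method are defined later in this module):
#
#   import httplib2
#   h = httplib2.Http(".cache")
#   response, content = h.request("http://example.org/", "GET")
#   # Repeating the request lets the cache answer directly, or revalidate
#   # with ETag/Last-Modified when the server provided them.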
|
||||
__author__ = "Joe Gregorio (joe@bitworking.org)"
|
||||
__copyright__ = "Copyright 2006, Joe Gregorio"
|
||||
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
|
||||
"James Antill",
|
||||
"Xavier Verges Farrero",
|
||||
"Jonathan Feinberg",
|
||||
"Blair Zajac",
|
||||
"Sam Ruby"]
|
||||
__license__ = "MIT"
|
||||
__version__ = "$Rev: 217 $"
|
||||
|
||||
import re
|
||||
import md5
|
||||
import email
|
||||
import email.Utils
|
||||
import email.Message
|
||||
import StringIO
|
||||
import gzip
|
||||
import zlib
|
||||
import httplib
|
||||
import urlparse
|
||||
import base64
|
||||
import os
|
||||
import copy
|
||||
import calendar
|
||||
import time
|
||||
import random
|
||||
import sha
|
||||
import hmac
|
||||
from gettext import gettext as _
|
||||
from socket import gaierror
|
||||
|
||||
__all__ = ['Http', 'Response', 'HttpLib2Error',
|
||||
'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
|
||||
'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
|
||||
'debuglevel']
|
||||
|
||||
|
||||
# The httplib debug level, set to a non-zero value to get debug output
|
||||
debuglevel = 0
|
||||
|
||||
# Python 2.3 support
|
||||
if 'sorted' not in __builtins__:
|
||||
def sorted(seq):
|
||||
seq.sort()
|
||||
return seq
|
||||
|
||||
# Python 2.3 support
|
||||
def HTTPResponse__getheaders(self):
|
||||
"""Return list of (header, value) tuples."""
|
||||
if self.msg is None:
|
||||
print "================================"
|
||||
raise httplib.ResponseNotReady()
|
||||
return self.msg.items()
|
||||
|
||||
if not hasattr(httplib.HTTPResponse, 'getheaders'):
|
||||
httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
|
||||
|
||||
# All exceptions raised here derive from HttpLib2Error
|
||||
class HttpLib2Error(Exception): pass
|
||||
|
||||
class RedirectMissingLocation(HttpLib2Error): pass
|
||||
class RedirectLimit(HttpLib2Error): pass
|
||||
class FailedToDecompressContent(HttpLib2Error): pass
|
||||
class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
|
||||
class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
|
||||
|
||||
# Open Items:
|
||||
# -----------
|
||||
# Proxy support
|
||||
|
||||
# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
|
||||
|
||||
# Pluggable cache storage (supports storing the cache in
|
||||
# flat files by default. We need a plug-in architecture
|
||||
# that can support Berkeley DB and Squid)
|
||||
|
||||
# == Known Issues ==
|
||||
# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
|
||||
# Does not handle Cache-Control: max-stale
|
||||
# Does not use Age: headers when calculating cache freshness.
|
||||
|
||||
|
||||
# The number of redirections to follow before giving up.
|
||||
# Note that only GET redirects are automatically followed.
|
||||
# Will also honor 301 requests by saving that info and never
|
||||
# requesting that URI again.
|
||||
DEFAULT_MAX_REDIRECTS = 5
|
||||
|
||||
# Which headers are hop-by-hop headers by default
|
||||
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
|
||||
|
||||
def _get_end2end_headers(response):
|
||||
hopbyhop = list(HOP_BY_HOP)
|
||||
hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
|
||||
return [header for header in response.keys() if header not in hopbyhop]
|
||||
|
||||
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
|
||||
|
||||
def parse_uri(uri):
|
||||
"""Parses a URI using the regex given in Appendix B of RFC 3986.
|
||||
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
"""
|
||||
groups = URI.match(uri).groups()
|
||||
return (groups[1], groups[3], groups[4], groups[6], groups[8])
|
||||
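# For example (an illustrative URI, not taken from the code above):
#   parse_uri("http://example.org/feed?alt=rss#top")
#   -> ('http', 'example.org', '/feed', 'alt=rss', 'top')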
|
||||
def urlnorm(uri):
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
authority = authority.lower()
|
||||
scheme = scheme.lower()
|
||||
if not path:
|
||||
path = "/"
|
||||
# Could do syntax based normalization of the URI before
|
||||
# computing the digest. See Section 6.2.2 of Std 66.
|
||||
request_uri = query and "?".join([path, query]) or path
|
||||
defrag_uri = scheme + "://" + authority + request_uri
|
||||
return scheme, authority, request_uri, defrag_uri
|
||||
|
||||
|
||||
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
|
||||
re_url_scheme = re.compile(r'^\w+://')
|
||||
re_slash = re.compile(r'[?/:|]+')
|
||||
|
||||
def safename(filename):
|
||||
"""Return a filename suitable for the cache.
|
||||
|
||||
Strips dangerous and common characters to create a filename we
|
||||
can use to store the cache in.
|
||||
"""
|
||||
|
||||
try:
|
||||
if re_url_scheme.match(filename):
|
||||
if isinstance(filename,str):
|
||||
filename=filename.decode('utf-8').encode('idna')
|
||||
else:
|
||||
filename=filename.encode('idna')
|
||||
except:
|
||||
pass
|
||||
if isinstance(filename,unicode):
|
||||
filename=filename.encode('utf-8')
|
||||
filemd5 = md5.new(filename).hexdigest()
|
||||
filename = re_url_scheme.sub("", filename)
|
||||
filename = re_slash.sub(",", filename)
|
||||
|
||||
# limit length of filename
|
||||
if len(filename)>200:
|
||||
filename=filename[:200]
|
||||
return ",".join((filename, filemd5))
|
||||
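# For example (illustrative input; the final component is the hex md5 digest
# of the original URI, abbreviated here):
#   safename("http://example.org/feed?alt=rss")
#   -> "example.org,feed,alt=rss,<md5 hex digest>"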
|
||||
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
|
||||
def _normalize_headers(headers):
|
||||
return dict([ (key.lower(), NORMALIZE_SPACE.sub(' ', value).strip()) for (key, value) in headers.iteritems()])
|
||||
|
||||
def _parse_cache_control(headers):
|
||||
retval = {}
|
||||
if headers.has_key('cache-control'):
|
||||
parts = headers['cache-control'].split(',')
|
||||
parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")]
|
||||
parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")]
|
||||
retval = dict(parts_with_args + parts_wo_args)
|
||||
return retval
|
||||
|
||||
# Whether to use a strict mode to parse WWW-Authenticate headers
|
||||
# Might lead to bad results in case of ill-formed header value,
|
||||
# so disabled by default, falling back to relaxed parsing.
|
||||
# Set to true to turn on, useful for testing servers.
|
||||
USE_WWW_AUTH_STRICT_PARSING = 0
|
||||
|
||||
# In regex below:
|
||||
# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
|
||||
# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
|
||||
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
|
||||
# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
|
||||
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
|
||||
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
|
||||
UNQUOTE_PAIRS = re.compile(r'\\(.)')
|
||||
def _parse_www_authenticate(headers, headername='www-authenticate'):
|
||||
"""Returns a dictionary of dictionaries, one dict
|
||||
per auth_scheme."""
|
||||
retval = {}
|
||||
if headers.has_key(headername):
|
||||
authenticate = headers[headername].strip()
|
||||
www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
|
||||
while authenticate:
|
||||
# Break off the scheme at the beginning of the line
|
||||
if headername == 'authentication-info':
|
||||
(auth_scheme, the_rest) = ('digest', authenticate)
|
||||
else:
|
||||
(auth_scheme, the_rest) = authenticate.split(" ", 1)
|
||||
# Now loop over all the key value pairs that come after the scheme,
|
||||
# being careful not to roll into the next scheme
|
||||
match = www_auth.search(the_rest)
|
||||
auth_params = {}
|
||||
while match:
|
||||
if match and len(match.groups()) == 3:
|
||||
(key, value, the_rest) = match.groups()
|
||||
auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
|
||||
match = www_auth.search(the_rest)
|
||||
retval[auth_scheme.lower()] = auth_params
|
||||
authenticate = the_rest.strip()
|
||||
return retval
|
||||
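# For example (illustrative header value):
#   _parse_www_authenticate({'www-authenticate': 'Basic realm="me@example.org"'})
#   -> {'basic': {'realm': 'me@example.org'}}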
|
||||
|
||||
def _entry_disposition(response_headers, request_headers):
|
||||
"""Determine freshness from the Date, Expires and Cache-Control headers.
|
||||
|
||||
We don't handle the following:
|
||||
|
||||
1. Cache-Control: max-stale
|
||||
2. Age: headers are not used in the calculations.
|
||||
|
||||
Note that this algorithm is simpler than you might think
|
||||
because we are operating as a private (non-shared) cache.
|
||||
This lets us ignore 's-maxage'. We can also ignore
|
||||
'proxy-invalidate' since we aren't a proxy.
|
||||
We will never return a stale document as
|
||||
fresh as a design decision, and thus the non-implementation
|
||||
of 'max-stale'. This also lets us safely ignore 'must-revalidate'
|
||||
since we operate as if every server has sent 'must-revalidate'.
|
||||
Since we are private we get to ignore both 'public' and
|
||||
'private' parameters. We also ignore 'no-transform' since
|
||||
we don't do any transformations.
|
||||
The 'no-store' parameter is handled at a higher level.
|
||||
So the only Cache-Control parameters we look at are:
|
||||
|
||||
no-cache
|
||||
only-if-cached
|
||||
max-age
|
||||
min-fresh
|
||||
"""
|
||||
|
||||
retval = "STALE"
|
||||
cc = _parse_cache_control(request_headers)
|
||||
cc_response = _parse_cache_control(response_headers)
|
||||
|
||||
if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
|
||||
retval = "TRANSPARENT"
|
||||
if 'cache-control' not in request_headers:
|
||||
request_headers['cache-control'] = 'no-cache'
|
||||
elif cc.has_key('no-cache'):
|
||||
retval = "TRANSPARENT"
|
||||
elif cc_response.has_key('no-cache'):
|
||||
retval = "STALE"
|
||||
elif cc.has_key('only-if-cached'):
|
||||
retval = "FRESH"
|
||||
elif response_headers.has_key('date'):
|
||||
date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
|
||||
now = time.time()
|
||||
current_age = max(0, now - date)
|
||||
if cc_response.has_key('max-age'):
|
||||
freshness_lifetime = int(cc_response['max-age'])
|
||||
elif response_headers.has_key('expires'):
|
||||
expires = email.Utils.parsedate_tz(response_headers['expires'])
|
||||
freshness_lifetime = max(0, calendar.timegm(expires) - date)
|
||||
else:
|
||||
freshness_lifetime = 0
|
||||
if cc.has_key('max-age'):
|
||||
freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
|
||||
if cc.has_key('min-fresh'):
|
||||
current_age += int(cc['min-fresh'])
|
||||
if freshness_lifetime > current_age:
|
||||
retval = "FRESH"
|
||||
return retval
|
||||
|
||||
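# A worked example (assumed header values): for a response whose Date header
# is 60 seconds old and which carries Cache-Control: max-age=300, and a
# request with no cache-control directives, current_age is about 60 and
# freshness_lifetime is 300, so the entry is reported as "FRESH"; with
# max-age=30 it would be "STALE".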
def _decompressContent(response, new_content):
|
||||
content = new_content
|
||||
try:
|
||||
encoding = response.get('content-encoding', None)
|
||||
if encoding in ['gzip', 'deflate']:
|
||||
if encoding == 'gzip':
|
||||
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
|
||||
if encoding == 'deflate':
|
||||
content = zlib.decompress(content)
|
||||
response['content-length'] = str(len(content))
|
||||
del response['content-encoding']
|
||||
except:
|
||||
content = ""
|
||||
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
|
||||
return content
|
||||
|
||||
def _updateCache(request_headers, response_headers, content, cache, cachekey):
|
||||
if cachekey:
|
||||
cc = _parse_cache_control(request_headers)
|
||||
cc_response = _parse_cache_control(response_headers)
|
||||
if cc.has_key('no-store') or cc_response.has_key('no-store'):
|
||||
cache.delete(cachekey)
|
||||
else:
|
||||
info = email.Message.Message()
|
||||
for key, value in response_headers.iteritems():
|
||||
if key not in ['status','content-encoding','transfer-encoding']:
|
||||
info[key] = value
|
||||
|
||||
status = response_headers.status
|
||||
if status == 304:
|
||||
status = 200
|
||||
|
||||
status_header = 'status: %d\r\n' % response_headers.status
|
||||
|
||||
header_str = info.as_string()
|
||||
|
||||
header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
|
||||
text = "".join([status_header, header_str, content])
|
||||
|
||||
cache.set(cachekey, text)
|
||||
|
||||
def _cnonce():
|
||||
dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
|
||||
return dig[:16]
|
||||
|
||||
def _wsse_username_token(cnonce, iso_now, password):
|
||||
return base64.encodestring(sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()
|
||||
|
||||
|
||||
# For credentials we need two things, first
|
||||
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
|
||||
# Then we also need a list of URIs that have already demanded authentication
|
||||
# That list is tricky since sub-URIs can take the same auth, or the
|
||||
# auth scheme may change as you descend the tree.
|
||||
# So we also need each Auth instance to be able to tell us
|
||||
# how close to the 'top' it is.
|
||||
|
||||
class Authentication:
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
self.path = path
|
||||
self.host = host
|
||||
self.credentials = credentials
|
||||
self.http = http
|
||||
|
||||
def depth(self, request_uri):
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
return request_uri[len(self.path):].count("/")
|
||||
|
||||
def inscope(self, host, request_uri):
|
||||
# XXX Should we normalize the request_uri?
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
return (host == self.host) and path.startswith(self.path)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header. Override this in sub-classes."""
|
||||
pass
|
||||
|
||||
def response(self, response, content):
|
||||
"""Gives us a chance to update with new nonces
|
||||
or such returned from the last authorized response.
|
||||
Override this in sub-classes if necessary.
|
||||
|
||||
Return TRUE if the request is to be retried, for
|
||||
example Digest may return stale=true.
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
|
||||
class BasicAuthentication(Authentication):
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['authorization'] = 'Basic ' + base64.encodestring("%s:%s" % self.credentials).strip()
|
||||
|
||||
|
||||
class DigestAuthentication(Authentication):
|
||||
"""Only do qop='auth' and MD5, since that
|
||||
is all Apache currently implements"""
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')
|
||||
self.challenge = challenge['digest']
|
||||
qop = self.challenge.get('qop')
|
||||
self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
|
||||
if self.challenge['qop'] is None:
|
||||
raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
|
||||
self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5')
|
||||
if self.challenge['algorithm'] != 'MD5':
|
||||
raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
|
||||
self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
|
||||
self.challenge['nc'] = 1
|
||||
|
||||
def request(self, method, request_uri, headers, content, cnonce = None):
|
||||
"""Modify the request headers"""
|
||||
H = lambda x: md5.new(x).hexdigest()
|
||||
KD = lambda s, d: H("%s:%s" % (s, d))
|
||||
A2 = "".join([method, ":", request_uri])
|
||||
self.challenge['cnonce'] = cnonce or _cnonce()
|
||||
request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
|
||||
'%08x' % self.challenge['nc'],
|
||||
self.challenge['cnonce'],
|
||||
self.challenge['qop'], H(A2)
|
||||
))
|
||||
headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
|
||||
self.credentials[0],
|
||||
self.challenge['realm'],
|
||||
self.challenge['nonce'],
|
||||
request_uri,
|
||||
self.challenge['algorithm'],
|
||||
request_digest,
|
||||
self.challenge['qop'],
|
||||
self.challenge['nc'],
|
||||
self.challenge['cnonce'],
|
||||
)
|
||||
self.challenge['nc'] += 1
|
||||
|
||||
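For reference, a minimal standalone sketch of the qop=auth response calculation performed above (RFC 2617); the nonce, cnonce and credentials are placeholders, not values from this commit:
import md5

def digest_response(user, password, realm, nonce, nc, cnonce, method, uri):
    H  = lambda x: md5.new(x).hexdigest()
    KD = lambda s, d: H("%s:%s" % (s, d))
    A1 = "%s:%s:%s" % (user, realm, password)
    A2 = "%s:%s" % (method, uri)
    return KD(H(A1), "%s:%08x:%s:%s:%s" % (nonce, nc, cnonce, 'auth', H(A2)))

print digest_response('joe', 'secret', 'venus', 'abc123', 1, 'deadbeef', 'GET', '/feed')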
def response(self, response, content):
|
||||
if not response.has_key('authentication-info'):
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
|
||||
if 'true' == challenge.get('stale'):
|
||||
self.challenge['nonce'] = challenge['nonce']
|
||||
self.challenge['nc'] = 1
|
||||
return True
|
||||
else:
|
||||
updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
|
||||
|
||||
if updated_challenge.has_key('nextnonce'):
|
||||
self.challenge['nonce'] = updated_challenge['nextnonce']
|
||||
self.challenge['nc'] = 1
|
||||
return False
|
||||
|
||||
|
||||
class HmacDigestAuthentication(Authentication):
|
||||
"""Adapted from Robert Sayre's code and DigestAuthentication above."""
|
||||
__author__ = "Thomas Broyer (t.broyer@ltgt.net)"
|
||||
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')
|
||||
self.challenge = challenge['hmacdigest']
|
||||
print self.challenge
|
||||
# TODO: self.challenge['domain']
|
||||
self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
|
||||
if self.challenge['reason'] not in ['unauthorized', 'integrity']:
|
||||
self.challenge['reason'] = 'unauthorized'
|
||||
self.challenge['salt'] = self.challenge.get('salt', '')
|
||||
if not self.challenge.get('snonce'):
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
|
||||
self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
|
||||
if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
|
||||
self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
|
||||
if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
|
||||
if self.challenge['algorithm'] == 'HMAC-MD5':
|
||||
self.hashmod = md5
|
||||
else:
|
||||
self.hashmod = sha
|
||||
if self.challenge['pw-algorithm'] == 'MD5':
|
||||
self.pwhashmod = md5
|
||||
else:
|
||||
self.pwhashmod = sha
|
||||
self.key = "".join([self.credentials[0], ":",
|
||||
self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
|
||||
":", self.challenge['realm']
|
||||
])
|
||||
print response['www-authenticate']
|
||||
print "".join([self.credentials[1], self.challenge['salt']])
|
||||
print "key_str = %s" % self.key
|
||||
self.key = self.pwhashmod.new(self.key).hexdigest().lower()
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers"""
|
||||
keys = _get_end2end_headers(headers)
|
||||
keylist = "".join(["%s " % k for k in keys])
|
||||
headers_val = "".join([headers[k] for k in keys])
|
||||
created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
|
||||
cnonce = _cnonce()
|
||||
request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
|
||||
print "key = %s" % self.key
|
||||
print "msg = %s" % request_digest
|
||||
request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
|
||||
headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
|
||||
self.credentials[0],
|
||||
self.challenge['realm'],
|
||||
self.challenge['snonce'],
|
||||
cnonce,
|
||||
request_uri,
|
||||
created,
|
||||
request_digest,
|
||||
keylist,
|
||||
)
|
||||
|
||||
def response(self, response, content):
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
|
||||
if challenge.get('reason') in ['integrity', 'stale']:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class WsseAuthentication(Authentication):
|
||||
"""This is thinly tested and should not be relied upon.
|
||||
At this time there isn't any third party server to test against.
|
||||
Blogger and TypePad implemented this algorithm at one point
|
||||
but Blogger has since switched to Basic over HTTPS and
|
||||
TypePad has implemented it wrong, by never issuing a 401
|
||||
challenge but instead requiring your client to telepathically know that
|
||||
their endpoint is expecting WSSE profile="UsernameToken"."""
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['Authorization'] = 'WSSE profile="UsernameToken"'
|
||||
iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
|
||||
cnonce = _cnonce()
|
||||
password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
|
||||
headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
|
||||
self.credentials[0],
|
||||
password_digest,
|
||||
cnonce,
|
||||
iso_now)
|
||||
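The _wsse_username_token helper used above is defined earlier in the module and is not shown in this hunk; assuming it follows the usual WSSE UsernameToken recipe, the digest is roughly:
# sketch under that assumption: PasswordDigest = base64(sha1(nonce + created + password))
import sha, base64, time

def wsse_password_digest(cnonce, created, password):
    return base64.encodestring(sha.new(cnonce + created + password).digest()).strip()

created = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
print wsse_password_digest('my-nonce', created, 'secret')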
|
||||
class GoogleLoginAuthentication(Authentication):
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
from urllib import urlencode
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
auth = dict(Email=credentials[0], Passwd=credentials[1], service='cl', source=headers['user-agent'])
|
||||
resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
|
||||
lines = content.split('\n')
|
||||
d = dict([tuple(line.split("=", 1)) for line in lines if line])
|
||||
if resp.status == 403:
|
||||
self.Auth = ""
|
||||
else:
|
||||
self.Auth = d['Auth']
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
|
||||
|
||||
|
||||
AUTH_SCHEME_CLASSES = {
|
||||
"basic": BasicAuthentication,
|
||||
"wsse": WsseAuthentication,
|
||||
"digest": DigestAuthentication,
|
||||
"hmacdigest": HmacDigestAuthentication,
|
||||
"googlelogin": GoogleLoginAuthentication
|
||||
}
|
||||
|
||||
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
|
||||
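A small illustration of how these two tables are used (it assumes it runs in this module's scope, after the definitions above): for each stored credential, the first scheme in AUTH_SCHEME_ORDER that the server offered in its 401 challenge wins.
challenges = {'basic': {'realm': 'x'}, 'digest': {'realm': 'x'}}  # parsed WWW-Authenticate
for scheme in AUTH_SCHEME_ORDER:
    if challenges.has_key(scheme):
        print "selected:", AUTH_SCHEME_CLASSES[scheme].__name__   # -> DigestAuthentication
        break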
|
||||
def _md5(s):
|
||||
return md5.new(s).hexdigest()
|
||||
|
||||
class FileCache:
|
||||
"""Uses a local directory as a store for cached files.
|
||||
Not really safe to use if multiple threads or processes are going to
|
||||
be running on the same cache.
|
||||
"""
|
||||
def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
|
||||
self.cache = cache
|
||||
self.safe = safe
|
||||
if not os.path.exists(cache):
|
||||
os.makedirs(self.cache)
|
||||
|
||||
def get(self, key):
|
||||
retval = None
|
||||
cacheFullPath = os.path.join(self.cache, self.safe(key))
|
||||
try:
|
||||
f = file(cacheFullPath, "r")
|
||||
retval = f.read()
|
||||
f.close()
|
||||
except:
|
||||
pass
|
||||
return retval
|
||||
|
||||
def set(self, key, value):
|
||||
cacheFullPath = os.path.join(self.cache, self.safe(key))
|
||||
f = file(cacheFullPath, "w")
|
||||
f.write(value)
|
||||
f.close()
|
||||
|
||||
def delete(self, key):
|
||||
cacheFullPath = os.path.join(self.cache, self.safe(key))
|
||||
if os.path.exists(cacheFullPath):
|
||||
os.remove(cacheFullPath)
|
||||
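A brief usage sketch (not part of the diff): FileCache stores one file per key under the given directory, and Http.request later reads each entry back as an email-style header block, a blank line, and the body.
cache = FileCache("/tmp/venus-cache-demo")            # hypothetical directory
cache.set("http://example.com/feed", "status: 200\r\n\r\n<feed/>")
print cache.get("http://example.com/feed")            # header block + body
cache.delete("http://example.com/feed")
print cache.get("http://example.com/feed")            # -> None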
|
||||
class Http:
|
||||
"""An HTTP client that handles all
|
||||
methods, caching, ETags, compression,
|
||||
HTTPS, Basic, Digest, WSSE, etc.
|
||||
"""
|
||||
def __init__(self, cache=None):
|
||||
# Map domain name to an httplib connection
|
||||
self.connections = {}
|
||||
# The location of the cache, for now a directory
|
||||
# where cached responses are held.
|
||||
if cache and isinstance(cache, str):
|
||||
self.cache = FileCache(cache)
|
||||
else:
|
||||
self.cache = cache
|
||||
|
||||
# tuples of name, password
|
||||
self.credentials = []
|
||||
|
||||
# authorization objects
|
||||
self.authorizations = []
|
||||
|
||||
self.follow_all_redirects = False
|
||||
|
||||
self.ignore_etag = False
|
||||
|
||||
def _auth_from_challenge(self, host, request_uri, headers, response, content):
|
||||
"""A generator that creates Authorization objects
|
||||
that can be applied to requests.
|
||||
"""
|
||||
challenges = _parse_www_authenticate(response, 'www-authenticate')
|
||||
for cred in self.credentials:
|
||||
for scheme in AUTH_SCHEME_ORDER:
|
||||
if challenges.has_key(scheme):
|
||||
yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
|
||||
|
||||
def add_credentials(self, name, password):
|
||||
"""Add a name and password that will be used
|
||||
any time a request requires authentication."""
|
||||
self.credentials.append((name, password))
|
||||
|
||||
def clear_credentials(self):
|
||||
"""Remove all the names and passwords
|
||||
that are used for authentication"""
|
||||
self.credentials = []
|
||||
self.authorizations = []
|
||||
|
||||
def _conn_request(self, conn, request_uri, method, body, headers):
|
||||
for i in range(2):
|
||||
try:
|
||||
conn.request(method, request_uri, body, headers)
|
||||
response = conn.getresponse()
|
||||
except:
|
||||
if i == 0:
|
||||
conn.close()
|
||||
conn.connect()
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
content = response.read()
|
||||
response = Response(response)
|
||||
content = _decompressContent(response, content)
|
||||
|
||||
break;
|
||||
return (response, content)
|
||||
|
||||
|
||||
def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
|
||||
"""Do the actual request using the connection object
|
||||
and also follow one level of redirects if necessary"""
|
||||
|
||||
auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
|
||||
auth = auths and sorted(auths)[0][1] or None
|
||||
if auth:
|
||||
auth.request(method, request_uri, headers, body)
|
||||
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
|
||||
|
||||
if auth:
|
||||
if auth.response(response, body):
|
||||
auth.request(method, request_uri, headers, body)
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers )
|
||||
response._stale_digest = 1
|
||||
|
||||
if response.status == 401:
|
||||
for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
|
||||
authorization.request(method, request_uri, headers, body)
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers, )
|
||||
if response.status != 401:
|
||||
self.authorizations.append(authorization)
|
||||
authorization.response(response, body)
|
||||
break
|
||||
|
||||
if (self.follow_all_redirects or method in ["GET", "HEAD"]) or response.status == 303:
|
||||
if response.status in [300, 301, 302, 303, 307]:
|
||||
# Pick out the location header and basically start from the beginning
|
||||
# remembering first to strip the ETag header and decrement our 'depth'
|
||||
if redirections:
|
||||
if not response.has_key('location') and response.status != 300:
|
||||
raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."))
|
||||
# Fix-up relative redirects (which violate an RFC 2616 MUST)
|
||||
if response.has_key('location'):
|
||||
location = response['location']
|
||||
(scheme, authority, path, query, fragment) = parse_uri(location)
|
||||
if authority == None:
|
||||
response['location'] = urlparse.urljoin(absolute_uri, location)
|
||||
if response.status == 301 and method in ["GET", "HEAD"]:
|
||||
response['-x-permanent-redirect-url'] = response['location']
|
||||
if not response.has_key('content-location'):
|
||||
response['content-location'] = absolute_uri
|
||||
_updateCache(headers, response, content, self.cache, cachekey)
|
||||
if headers.has_key('if-none-match'):
|
||||
del headers['if-none-match']
|
||||
if headers.has_key('if-modified-since'):
|
||||
del headers['if-modified-since']
|
||||
if response.has_key('location'):
|
||||
location = response['location']
|
||||
old_response = copy.deepcopy(response)
|
||||
if not old_response.has_key('content-location'):
|
||||
old_response['content-location'] = absolute_uri
|
||||
redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
|
||||
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
|
||||
response.previous = old_response
|
||||
else:
|
||||
raise RedirectLimit( _("Redirected more times than redirection_limit allows."))
|
||||
elif response.status in [200, 203] and method == "GET":
|
||||
# Don't cache 206's since we aren't going to handle byte range requests
|
||||
if not response.has_key('content-location'):
|
||||
response['content-location'] = absolute_uri
|
||||
_updateCache(headers, response, content, self.cache, cachekey)
|
||||
|
||||
return (response, content)
|
||||
|
||||
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
|
||||
""" Performs a single HTTP request.
|
||||
The 'uri' is the URI of the HTTP resource and can begin
|
||||
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
|
||||
|
||||
The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
|
||||
There is no restriction on the methods allowed.
|
||||
|
||||
The 'body' is the entity body to be sent with the request. It is a string
|
||||
object.
|
||||
|
||||
Any extra headers that are to be sent with the request should be provided in the
|
||||
'headers' dictionary.
|
||||
|
||||
The maximum number of redirects to follow before raising an
|
||||
exception is 'redirections'. The default is 5.
|
||||
|
||||
The return value is a tuple of (response, content), the first
|
||||
being an instance of the 'Response' class, the second being
|
||||
a string that contains the response entity body.
|
||||
"""
|
||||
if headers is None:
|
||||
headers = {}
|
||||
else:
|
||||
headers = _normalize_headers(headers)
|
||||
|
||||
if not headers.has_key('user-agent'):
|
||||
headers['user-agent'] = "Python-httplib2/%s" % __version__
|
||||
|
||||
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
|
||||
|
||||
if not self.connections.has_key(scheme+":"+authority):
|
||||
connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
|
||||
conn = self.connections[scheme+":"+authority] = connection_type(authority)
|
||||
conn.set_debuglevel(debuglevel)
|
||||
else:
|
||||
conn = self.connections[scheme+":"+authority]
|
||||
|
||||
if method in ["GET", "HEAD"] and 'range' not in headers:
|
||||
headers['accept-encoding'] = 'compress, gzip'
|
||||
|
||||
info = email.Message.Message()
|
||||
cached_value = None
|
||||
if self.cache:
|
||||
cachekey = defrag_uri
|
||||
cached_value = self.cache.get(cachekey)
|
||||
if cached_value:
|
||||
try:
|
||||
info = email.message_from_string(cached_value)
|
||||
content = cached_value.split('\r\n\r\n', 1)[1]
|
||||
except Exception, e:
|
||||
self.cache.delete(cachekey)
|
||||
cachekey = None
|
||||
cached_value = None
|
||||
else:
|
||||
cachekey = None
|
||||
|
||||
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
|
||||
# http://www.w3.org/1999/04/Editing/
|
||||
headers['if-match'] = info['etag']
|
||||
|
||||
if method not in ["GET", "HEAD"] and self.cache and cachekey:
|
||||
# RFC 2616 Section 13.10
|
||||
self.cache.delete(cachekey)
|
||||
|
||||
if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
|
||||
if info.has_key('-x-permanent-redirect-url'):
|
||||
# Should cached permanent redirects be counted in our redirection count? For now, yes.
|
||||
(response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
|
||||
response.previous = Response(info)
|
||||
response.previous.fromcache = True
|
||||
else:
|
||||
# Determine our course of action:
|
||||
# Is the cached entry fresh or stale?
|
||||
# Has the client requested a non-cached response?
|
||||
#
|
||||
# There seems to be three possible answers:
|
||||
# 1. [FRESH] Return the cache entry w/o doing a GET
|
||||
# 2. [STALE] Do the GET (but add in cache validators if available)
|
||||
# 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
|
||||
entry_disposition = _entry_disposition(info, headers)
|
||||
|
||||
if entry_disposition == "FRESH":
|
||||
if not cached_value:
|
||||
info['status'] = '504'
|
||||
content = ""
|
||||
response = Response(info)
|
||||
if cached_value:
|
||||
response.fromcache = True
|
||||
return (response, content)
|
||||
|
||||
if entry_disposition == "STALE":
|
||||
if info.has_key('etag') and not self.ignore_etag:
|
||||
headers['if-none-match'] = info['etag']
|
||||
if info.has_key('last-modified'):
|
||||
headers['if-modified-since'] = info['last-modified']
|
||||
elif entry_disposition == "TRANSPARENT":
|
||||
pass
|
||||
|
||||
(response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
|
||||
|
||||
if response.status == 304 and method == "GET":
|
||||
# Rewrite the cache entry with the new end-to-end headers
|
||||
# Take all headers that are in response
|
||||
# and overwrite their values in info.
|
||||
# unless they are hop-by-hop, or are listed in the connection header.
|
||||
|
||||
for key in _get_end2end_headers(response):
|
||||
info[key] = response[key]
|
||||
merged_response = Response(info)
|
||||
if hasattr(response, "_stale_digest"):
|
||||
merged_response._stale_digest = response._stale_digest
|
||||
try:
|
||||
_updateCache(headers, merged_response, content, self.cache, cachekey)
|
||||
except:
|
||||
print locals()
|
||||
raise
|
||||
response = merged_response
|
||||
response.status = 200
|
||||
response.fromcache = True
|
||||
|
||||
elif response.status == 200:
|
||||
content = new_content
|
||||
else:
|
||||
self.cache.delete(cachekey)
|
||||
content = new_content
|
||||
else:
|
||||
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
|
||||
return (response, content)
|
||||
|
||||
|
||||
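A short usage sketch of the client above (not part of the diff; the import name and URL are illustrative assumptions, since within Venus this file is vendored under the planet package):
import httplib2                              # assumed import name
h = httplib2.Http(".cache")                  # directory-backed FileCache
h.add_credentials('joe', 'secret')           # only used if a 401 challenge arrives
resp, content = h.request("http://example.com/feed.atom", "GET")
print resp.status, resp.fromcache
resp, content = h.request("http://example.com/feed.atom", "GET")
print resp.status, resp.fromcache            # a repeat GET may be served from cache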
|
||||
class Response(dict):
|
||||
"""An object more like email.Message than httplib.HTTPResponse."""
|
||||
|
||||
"""Is this response from our local cache"""
|
||||
fromcache = False
|
||||
|
||||
"""HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
|
||||
version = 11
|
||||
|
||||
"Status code returned by server. "
|
||||
status = 200
|
||||
|
||||
"""Reason phrase returned by server."""
|
||||
reason = "Ok"
|
||||
|
||||
previous = None
|
||||
|
||||
def __init__(self, info):
|
||||
# info is either an email.Message or
|
||||
# an httplib.HTTPResponse object.
|
||||
if isinstance(info, httplib.HTTPResponse):
|
||||
for key, value in info.getheaders():
|
||||
self[key] = value
|
||||
self.status = info.status
|
||||
self['status'] = str(self.status)
|
||||
self.reason = info.reason
|
||||
self.version = info.version
|
||||
elif isinstance(info, email.Message.Message):
|
||||
for key, value in info.items():
|
||||
self[key] = value
|
||||
self.status = int(self['status'])
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name == 'dict':
|
||||
return self
|
||||
else:
|
||||
raise AttributeError, name
|
||||
|
||||
|
@ -15,9 +15,8 @@ Todo:
|
||||
"""
|
||||
import re, time, md5, sgmllib
|
||||
from xml.sax.saxutils import escape
|
||||
from xml.dom import minidom
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
from xml.parsers.expat import ExpatError
|
||||
from xml.dom import minidom, Node
|
||||
from planet.html5lib import liberalxmlparser, treebuilders
|
||||
import planet, config
|
||||
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -50,21 +49,14 @@ def ncr2c(value):
|
||||
value=unichr(int(value))
|
||||
return value
|
||||
|
||||
def normalize(text, bozo):
|
||||
""" convert everything to well formed XML """
|
||||
if text.has_key('type'):
|
||||
if text.type.lower().find('html')<0:
|
||||
text['value'] = escape(text.value)
|
||||
text['type'] = 'text/html'
|
||||
if text.type.lower() == 'text/html' or bozo:
|
||||
dom=BeautifulSoup(text.value,convertEntities="html")
|
||||
for tag in dom.findAll(True):
|
||||
for attr,value in tag.attrs:
|
||||
value=sgmllib.charref.sub(ncr2c,value)
|
||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||
tag[attr]=value
|
||||
text['value'] = illegal_xml_chars.sub(invalidate, str(dom))
|
||||
return text
|
||||
nonalpha=re.compile('\W+',re.UNICODE)
|
||||
def cssid(name):
|
||||
""" generate a css id from a name """
|
||||
try:
|
||||
name = nonalpha.sub('-',name.decode('utf-8')).lower().encode('utf-8')
|
||||
except:
|
||||
name = nonalpha.sub('-',name).lower()
|
||||
return name.strip('-')
|
||||
|
||||
def id(xentry, entry):
|
||||
""" copy or compute an id for the entry """
|
||||
@ -96,7 +88,7 @@ def links(xentry, entry):
|
||||
if entry.has_key('link'):
|
||||
entry['links'].append({'rel':'alternate', 'href':entry.link})
|
||||
xdoc = xentry.ownerDocument
|
||||
for link in entry.links:
|
||||
for link in entry['links']:
|
||||
if not 'href' in link.keys(): continue
|
||||
xlink = xdoc.createElement('link')
|
||||
xlink.setAttribute('href', link.get('href'))
|
||||
@ -141,27 +133,41 @@ def author(xentry, name, detail):
|
||||
def content(xentry, name, detail, bozo):
|
||||
""" insert a content-like element into the entry """
|
||||
if not detail or not detail.value: return
|
||||
normalize(detail, bozo)
|
||||
|
||||
data = None
|
||||
xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||
xdoc = xentry.ownerDocument
|
||||
xcontent = xdoc.createElement(name)
|
||||
|
||||
try:
|
||||
# see if the resulting text is a well-formed XML fragment
|
||||
div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||
if isinstance(detail.value,unicode):
|
||||
detail.value=detail.value.encode('utf-8')
|
||||
data = minidom.parseString(div % detail.value).documentElement
|
||||
if isinstance(detail.value,unicode):
|
||||
detail.value=detail.value.encode('utf-8')
|
||||
|
||||
if detail.value.find('<') < 0:
|
||||
xcontent.appendChild(data.firstChild)
|
||||
else:
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
xcontent.appendChild(data)
|
||||
if not detail.has_key('type') or detail.type.lower().find('html')<0:
|
||||
detail['value'] = escape(detail.value)
|
||||
detail['type'] = 'text/html'
|
||||
|
||||
except ExpatError:
|
||||
# leave as html
|
||||
xcontent.setAttribute('type', 'html')
|
||||
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
|
||||
if detail.type.find('xhtml')>=0 and not bozo:
|
||||
data = minidom.parseString(xdiv % detail.value).documentElement
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
else:
|
||||
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
||||
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||
for body in html.documentElement.childNodes:
|
||||
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||
if body.nodeName != 'body': continue
|
||||
for div in body.childNodes:
|
||||
if div.nodeType != Node.ELEMENT_NODE: continue
|
||||
if div.nodeName != 'div': continue
|
||||
div.normalize()
|
||||
if len(div.childNodes) == 1 and \
|
||||
div.firstChild.nodeType == Node.TEXT_NODE:
|
||||
data = div.firstChild
|
||||
else:
|
||||
data = div
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
break
|
||||
|
||||
if data: xcontent.appendChild(data)
|
||||
|
||||
if detail.get("language"):
|
||||
xcontent.setAttribute('xml:lang', detail.language)
|
||||
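A standalone sketch of the html5lib path above (not part of the diff), showing how a sloppy HTML fragment comes back as a well-formed XHTML div; it assumes planet.html5lib is importable exactly as in this module:
from xml.dom import Node
from planet.html5lib import liberalxmlparser, treebuilders

xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
html = parser.parse(xdiv % 'one<br>two', encoding="utf-8")
for body in html.documentElement.childNodes:
    if body.nodeType != Node.ELEMENT_NODE or body.nodeName != 'body': continue
    for div in body.childNodes:
        if div.nodeType == Node.ELEMENT_NODE and div.nodeName == 'div':
            print div.toxml()                # the recovered <div>...</div>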
@ -198,6 +204,8 @@ def source(xsource, source, bozo, format):
|
||||
if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
|
||||
|
||||
# propagate planet inserted information
|
||||
if source.has_key('planet_name') and not source.has_key('planet_css-id'):
|
||||
source['planet_css-id'] = cssid(source['planet_name'])
|
||||
for key, value in source.items():
|
||||
if key.startswith('planet_'):
|
||||
createTextElement(xsource, key.replace('_',':',1), value)
|
||||
@ -239,6 +247,7 @@ def reconstitute(feed, entry):
|
||||
entry['%s_%s' % (ns,name)])
|
||||
xoriglink.setAttribute('xmlns:%s' % ns, feed.namespaces[ns])
|
||||
|
||||
# author / contributor
|
||||
author_detail = entry.get('author_detail',{})
|
||||
if author_detail and not author_detail.has_key('name') and \
|
||||
feed.feed.has_key('planet_name'):
|
||||
@ -247,14 +256,26 @@ def reconstitute(feed, entry):
|
||||
for contributor in entry.get('contributors',[]):
|
||||
author(xentry, 'contributor', contributor)
|
||||
|
||||
xsource = xdoc.createElement('source')
|
||||
src = entry.get('source') or feed.feed
|
||||
# merge in planet:* from feed (or simply use the feed if no source)
|
||||
src = entry.get('source')
|
||||
if src:
|
||||
for name,value in feed.feed.items():
|
||||
if name.startswith('planet_'): src[name]=value
|
||||
if feed.feed.has_key('id'):
|
||||
src['planet_id'] = feed.feed.id
|
||||
else:
|
||||
src = feed.feed
|
||||
|
||||
# source:author
|
||||
src_author = src.get('author_detail',{})
|
||||
if (not author_detail or not author_detail.has_key('name')) and \
|
||||
not src_author.has_key('name') and feed.feed.has_key('planet_name'):
|
||||
if src_author: src_author = src_author.__class__(src_author.copy())
|
||||
src['author_detail'] = src_author
|
||||
src_author['name'] = feed.feed['planet_name']
|
||||
|
||||
# source
|
||||
xsource = xdoc.createElement('source')
|
||||
source(xsource, src, bozo, feed.version)
|
||||
xentry.appendChild(xsource)
|
||||
|
||||
|
94
planet/scrub.py
Normal file
@ -0,0 +1,94 @@
|
||||
"""
|
||||
Process a set of configuration defined sanitations on a given feed.
|
||||
"""
|
||||
|
||||
# Standard library modules
|
||||
import time
|
||||
# Planet modules
|
||||
import planet, config, shell
|
||||
|
||||
type_map = {'text': 'text/plain', 'html': 'text/html',
|
||||
'xhtml': 'application/xhtml+xml'}
|
||||
|
||||
def scrub(feed_uri, data):
|
||||
|
||||
# some data is not trustworthy
|
||||
for tag in config.ignore_in_feed(feed_uri).split():
|
||||
if tag.find('lang')>=0: tag='language'
|
||||
if data.feed.has_key(tag): del data.feed[tag]
|
||||
for entry in data.entries:
|
||||
if entry.has_key(tag): del entry[tag]
|
||||
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
|
||||
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
|
||||
for key in entry.keys():
|
||||
if not key.endswith('_detail'): continue
|
||||
for detail in entry[key].copy():
|
||||
if detail == tag: del entry[key][detail]
|
||||
|
||||
# adjust title types
|
||||
if config.title_type(feed_uri):
|
||||
title_type = config.title_type(feed_uri)
|
||||
title_type = type_map.get(title_type, title_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('title_detail'):
|
||||
entry.title_detail['type'] = title_type
|
||||
|
||||
# adjust summary types
|
||||
if config.summary_type(feed_uri):
|
||||
summary_type = config.summary_type(feed_uri)
|
||||
summary_type = type_map.get(summary_type, summary_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('summary_detail'):
|
||||
entry.summary_detail['type'] = summary_type
|
||||
|
||||
# adjust content types
|
||||
if config.content_type(feed_uri):
|
||||
content_type = config.content_type(feed_uri)
|
||||
content_type = type_map.get(content_type, content_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('content'):
|
||||
entry.content[0]['type'] = content_type
|
||||
|
||||
# some people put html in author names
|
||||
if config.name_type(feed_uri).find('html')>=0:
|
||||
from shell.tmpl import stripHtml
|
||||
if data.feed.has_key('author_detail') and \
|
||||
data.feed.author_detail.has_key('name'):
|
||||
data.feed.author_detail['name'] = \
|
||||
str(stripHtml(data.feed.author_detail.name))
|
||||
for entry in data.entries:
|
||||
if entry.has_key('author_detail') and \
|
||||
entry.author_detail.has_key('name'):
|
||||
entry.author_detail['name'] = \
|
||||
str(stripHtml(entry.author_detail.name))
|
||||
if entry.has_key('source'):
|
||||
source = entry.source
|
||||
if source.has_key('author_detail') and \
|
||||
source.author_detail.has_key('name'):
|
||||
source.author_detail['name'] = \
|
||||
str(stripHtml(source.author_detail.name))
|
||||
|
||||
# handle dates in the future
|
||||
future_dates = config.future_dates(feed_uri).lower()
|
||||
if future_dates == 'ignore_date':
|
||||
now = time.gmtime()
|
||||
if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
|
||||
if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
|
||||
for entry in data.entries:
|
||||
if entry.has_key('published_parsed') and entry['published_parsed']:
|
||||
if entry['published_parsed'] > now:
|
||||
del entry['published_parsed']
|
||||
del entry['published']
|
||||
if entry.has_key('updated_parsed') and entry['updated_parsed']:
|
||||
if entry['updated_parsed'] > now:
|
||||
del entry['updated_parsed']
|
||||
del entry['updated']
|
||||
elif future_dates == 'ignore_entry':
|
||||
now = time.time()
|
||||
if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
|
||||
if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
|
||||
data.entries = [entry for entry in data.entries if
|
||||
(not entry.has_key('published_parsed') or not entry['published_parsed']
|
||||
or entry['published_parsed'] <= now) and
|
||||
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
|
||||
or entry['updated_parsed'] <= now)]
|
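A minimal sketch of driving scrub() from configuration (not part of the diff; it follows the same pattern as the updated tests later in this commit):
import StringIO
from planet import config, feedparser
from planet.scrub import scrub

config.parser.readfp(StringIO.StringIO("[feed1]\nignore_in_feed = updated\n"))
data = feedparser.parse("<feed xmlns='http://www.w3.org/2005/Atom'>"
    "<entry><updated>2000-01-01T00:00:00Z</updated></entry></feed>")
scrub('feed1', data)
print data.entries[0].has_key('updated')     # -> False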
343
planet/spider.py
@ -4,10 +4,11 @@ and write each as a set of entries in a cache directory.
|
||||
"""
|
||||
|
||||
# Standard library modules
|
||||
import time, calendar, re, os
|
||||
import time, calendar, re, os, urlparse
|
||||
from xml.dom import minidom
|
||||
# Planet modules
|
||||
import planet, config, feedparser, reconstitute, shell
|
||||
import planet, config, feedparser, reconstitute, shell, socket, scrub
|
||||
from StringIO import StringIO
|
||||
|
||||
# Regular expressions to sanitise cache filenames
|
||||
re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
|
||||
@ -56,118 +57,39 @@ def write(xdoc, out):
|
||||
file.write(xdoc)
|
||||
file.close()
|
||||
|
||||
type_map = {'text': 'text/plain', 'html': 'text/html',
|
||||
'xhtml': 'application/xhtml+xml'}
|
||||
def _is_http_uri(uri):
|
||||
parsed = urlparse.urlparse(uri)
|
||||
return parsed[0] in ['http', 'https']
|
||||
|
||||
def scrub(feed, data):
|
||||
|
||||
# some data is not trustworthy
|
||||
for tag in config.ignore_in_feed(feed).split():
|
||||
if tag.find('lang')>=0: tag='language'
|
||||
if data.feed.has_key(tag): del data.feed[tag]
|
||||
for entry in data.entries:
|
||||
if entry.has_key(tag): del entry[tag]
|
||||
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
|
||||
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
|
||||
for key in entry.keys():
|
||||
if not key.endswith('_detail'): continue
|
||||
for detail in entry[key].copy():
|
||||
if detail == tag: del entry[key][detail]
|
||||
|
||||
# adjust title types
|
||||
if config.title_type(feed):
|
||||
title_type = config.title_type(feed)
|
||||
title_type = type_map.get(title_type, title_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('title_detail'):
|
||||
entry.title_detail['type'] = title_type
|
||||
|
||||
# adjust summary types
|
||||
if config.summary_type(feed):
|
||||
summary_type = config.summary_type(feed)
|
||||
summary_type = type_map.get(summary_type, summary_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('summary_detail'):
|
||||
entry.summary_detail['type'] = summary_type
|
||||
|
||||
# adjust content types
|
||||
if config.content_type(feed):
|
||||
content_type = config.content_type(feed)
|
||||
content_type = type_map.get(content_type, content_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('content'):
|
||||
entry.content[0]['type'] = content_type
|
||||
|
||||
# some people put html in author names
|
||||
if config.name_type(feed).find('html')>=0:
|
||||
from planet.shell.tmpl import stripHtml
|
||||
if data.feed.has_key('author_detail') and \
|
||||
data.feed.author_detail.has_key('name'):
|
||||
data.feed.author_detail['name'] = \
|
||||
str(stripHtml(data.feed.author_detail.name))
|
||||
for entry in data.entries:
|
||||
if entry.has_key('author_detail') and \
|
||||
entry.author_detail.has_key('name'):
|
||||
entry.author_detail['name'] = \
|
||||
str(stripHtml(entry.author_detail.name))
|
||||
if entry.has_key('source'):
|
||||
source = entry.source
|
||||
if source.has_key('author_detail') and \
|
||||
source.author_detail.has_key('name'):
|
||||
source.author_detail['name'] = \
|
||||
str(stripHtml(source.author_detail.name))
|
||||
|
||||
def spiderFeed(feed, only_if_new=0):
|
||||
""" Spider (fetch) a single feed """
|
||||
def writeCache(feed_uri, feed_info, data):
|
||||
log = planet.logger
|
||||
|
||||
# read cached feed info
|
||||
sources = config.cache_sources_directory()
|
||||
if not os.path.exists(sources):
|
||||
os.makedirs(sources, 0700)
|
||||
feed_source = filename(sources, feed)
|
||||
feed_info = feedparser.parse(feed_source)
|
||||
if feed_info.feed and only_if_new:
|
||||
log.info("Feed %s already in cache", feed)
|
||||
return
|
||||
if feed_info.feed.get('planet_http_status',None) == '410':
|
||||
log.info("Feed %s gone", feed)
|
||||
return
|
||||
|
||||
# read feed itself
|
||||
modified = None
|
||||
try:
|
||||
modified=time.strptime(
|
||||
feed_info.feed.get('planet_http_last_modified', None))
|
||||
except:
|
||||
pass
|
||||
data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
|
||||
etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
|
||||
|
||||
# capture http status
|
||||
if not data.has_key("status"):
|
||||
if data.has_key("entries") and len(data.entries)>0:
|
||||
data.status = 200
|
||||
elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout':
|
||||
elif data.bozo and \
|
||||
data.bozo_exception.__class__.__name__.lower()=='timeout':
|
||||
data.status = 408
|
||||
else:
|
||||
data.status = 500
|
||||
|
||||
activity_horizon = \
|
||||
time.gmtime(time.time()-86400*config.activity_threshold(feed))
|
||||
time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))
|
||||
|
||||
# process based on the HTTP status code
|
||||
if data.status == 200 and data.has_key("url"):
|
||||
data.feed['planet_http_location'] = data.url
|
||||
if feed == data.url:
|
||||
log.info("Updating feed %s", feed)
|
||||
if feed_uri == data.url:
|
||||
log.info("Updating feed %s", feed_uri)
|
||||
else:
|
||||
log.info("Updating feed %s @ %s", feed, data.url)
|
||||
log.info("Updating feed %s @ %s", feed_uri, data.url)
|
||||
elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
|
||||
log.warning("Feed has moved from <%s> to <%s>", feed, data.url)
|
||||
log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
|
||||
data.feed['planet_http_location'] = data.url
|
||||
elif data.status == 304:
|
||||
log.info("Feed %s unchanged", feed)
|
||||
log.info("Feed %s unchanged", feed_uri)
|
||||
|
||||
if not feed_info.feed.has_key('planet_message'):
|
||||
if feed_info.feed.has_key('planet_updated'):
|
||||
@ -180,13 +102,13 @@ def spiderFeed(feed, only_if_new=0):
|
||||
del feed_info.feed['planet_message']
|
||||
|
||||
elif data.status == 410:
|
||||
log.info("Feed %s gone", feed)
|
||||
log.info("Feed %s gone", feed_uri)
|
||||
elif data.status == 408:
|
||||
log.warning("Feed %s timed out", feed)
|
||||
log.warning("Feed %s timed out", feed_uri)
|
||||
elif data.status >= 400:
|
||||
log.error("Error %d while updating feed %s", data.status, feed)
|
||||
log.error("Error %d while updating feed %s", data.status, feed_uri)
|
||||
else:
|
||||
log.info("Updating feed %s", feed)
|
||||
log.info("Updating feed %s", feed_uri)
|
||||
|
||||
# if read failed, retain cached information
|
||||
if not data.version and feed_info.version:
|
||||
@ -199,11 +121,16 @@ def spiderFeed(feed, only_if_new=0):
|
||||
if data.has_key('headers'):
|
||||
if data.has_key('etag') and data.etag:
|
||||
data.feed['planet_http_etag'] = data.etag
|
||||
log.debug("E-Tag: %s", data.etag)
|
||||
if data.has_key('modified') and data.modified:
|
||||
elif data.headers.has_key('etag') and data.headers['etag']:
|
||||
data.feed['planet_http_etag'] = data.headers['etag']
|
||||
|
||||
if data.headers.has_key('last-modified'):
|
||||
data.feed['planet_http_last_modified']=data.headers['last-modified']
|
||||
elif data.has_key('modified') and data.modified:
|
||||
data.feed['planet_http_last_modified'] = time.asctime(data.modified)
|
||||
log.debug("Last Modified: %s",
|
||||
data.feed['planet_http_last_modified'])
|
||||
|
||||
if data.headers.has_key('-content-hash'):
|
||||
data.feed['planet_content_hash'] = data.headers['-content-hash']
|
||||
|
||||
# capture feed and data from the planet configuration file
|
||||
if data.version:
|
||||
@ -217,12 +144,12 @@ def spiderFeed(feed, only_if_new=0):
|
||||
break
|
||||
else:
|
||||
data.feed.links.append(feedparser.FeedParserDict(
|
||||
{'rel':'self', 'type':feedtype, 'href':feed}))
|
||||
for name, value in config.feed_options(feed).items():
|
||||
{'rel':'self', 'type':feedtype, 'href':feed_uri}))
|
||||
for name, value in config.feed_options(feed_uri).items():
|
||||
data.feed['planet_'+name] = value
|
||||
|
||||
# perform user configured scrub operations on the data
|
||||
scrub(feed, data)
|
||||
scrub.scrub(feed_uri, data)
|
||||
|
||||
from planet import idindex
|
||||
global index
|
||||
@ -241,10 +168,9 @@ def spiderFeed(feed, only_if_new=0):
|
||||
|
||||
# get updated-date either from the entry or the cache (default to now)
|
||||
mtime = None
|
||||
if not entry.has_key('updated_parsed'):
|
||||
if entry.has_key('published_parsed'):
|
||||
entry['updated_parsed'] = entry['published_parsed']
|
||||
if not entry.has_key('updated_parsed'):
|
||||
if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
|
||||
entry['updated_parsed'] = entry.get('published_parsed',None)
|
||||
if entry.has_key('updated_parsed'):
|
||||
try:
|
||||
mtime = calendar.timegm(entry.updated_parsed)
|
||||
except:
|
||||
@ -254,15 +180,18 @@ def spiderFeed(feed, only_if_new=0):
|
||||
mtime = os.stat(cache_file).st_mtime
|
||||
except:
|
||||
if data.feed.has_key('updated_parsed'):
|
||||
mtime = calendar.timegm(data.feed.updated_parsed)
|
||||
if not mtime or mtime > time.time(): mtime = time.time()
|
||||
try:
|
||||
mtime = calendar.timegm(data.feed.updated_parsed)
|
||||
except:
|
||||
pass
|
||||
if not mtime: mtime = time.time()
|
||||
entry['updated_parsed'] = time.gmtime(mtime)
|
||||
|
||||
# apply any filters
|
||||
xdoc = reconstitute.reconstitute(data, entry)
|
||||
output = xdoc.toxml('utf-8')
|
||||
output = xdoc.toxml().encode('utf-8')
|
||||
xdoc.unlink()
|
||||
for filter in config.filters(feed):
|
||||
for filter in config.filters(feed_uri):
|
||||
output = shell.run(filter, output, mode="filter")
|
||||
if not output: break
|
||||
if not output: continue
|
||||
@ -281,7 +210,7 @@ def spiderFeed(feed, only_if_new=0):
|
||||
if index: index.close()
|
||||
|
||||
# identify inactive feeds
|
||||
if config.activity_threshold(feed):
|
||||
if config.activity_threshold(feed_uri):
|
||||
updated = [entry.updated_parsed for entry in data.entries
|
||||
if entry.has_key('updated_parsed')]
|
||||
updated.sort()
|
||||
@ -293,7 +222,7 @@ def spiderFeed(feed, only_if_new=0):
|
||||
updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]
|
||||
|
||||
if not updated or updated[-1] < activity_horizon:
|
||||
msg = "no activity in %d days" % config.activity_threshold(feed)
|
||||
msg = "no activity in %d days" % config.activity_threshold(feed_uri)
|
||||
log.info(msg)
|
||||
data.feed['planet_message'] = msg
|
||||
|
||||
@ -320,24 +249,188 @@ def spiderFeed(feed, only_if_new=0):
|
||||
xdoc=minidom.parseString('''<feed xmlns:planet="%s"
|
||||
xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
|
||||
reconstitute.source(xdoc.documentElement,data.feed,data.bozo,data.version)
|
||||
write(xdoc.toxml('utf-8'), filename(sources, feed))
|
||||
write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
|
||||
xdoc.unlink()
|
||||
|
||||
def httpThread(thread_index, input_queue, output_queue, log):
|
||||
import httplib2, md5
|
||||
from socket import gaierror, error
|
||||
from httplib import BadStatusLine
|
||||
|
||||
h = httplib2.Http(config.http_cache_directory())
|
||||
uri, feed_info = input_queue.get(block=True)
|
||||
while uri:
|
||||
log.info("Fetching %s via %d", uri, thread_index)
|
||||
feed = StringIO('')
|
||||
setattr(feed, 'url', uri)
|
||||
setattr(feed, 'headers',
|
||||
feedparser.FeedParserDict({'status':'500'}))
|
||||
try:
|
||||
# map IRI => URI
|
||||
try:
|
||||
if isinstance(uri,unicode):
|
||||
idna = uri.encode('idna')
|
||||
else:
|
||||
idna = uri.decode('utf-8').encode('idna')
|
||||
if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
|
||||
except:
|
||||
log.info("unable to map %s to a URI", uri)
|
||||
idna = uri
|
||||
|
||||
# cache control headers
|
||||
headers = {}
|
||||
if feed_info.feed.has_key('planet_http_etag'):
|
||||
headers['If-None-Match'] = feed_info.feed['planet_http_etag']
|
||||
if feed_info.feed.has_key('planet_http_last_modified'):
|
||||
headers['If-Modified-Since'] = \
|
||||
feed_info.feed['planet_http_last_modified']
|
||||
|
||||
# issue request
|
||||
(resp, content) = h.request(idna, 'GET', headers=headers)
|
||||
|
||||
# unchanged detection
|
||||
resp['-content-hash'] = md5.new(content or '').hexdigest()
|
||||
if resp.status == 200:
|
||||
if resp.fromcache:
|
||||
resp.status = 304
|
||||
elif feed_info.feed.has_key('planet_content_hash') and \
|
||||
feed_info.feed['planet_content_hash'] == \
|
||||
resp['-content-hash']:
|
||||
resp.status = 304
|
||||
|
||||
# build a file-like object
|
||||
feed = StringIO(content)
|
||||
setattr(feed, 'url', resp.get('content-location', uri))
|
||||
if resp.has_key('content-encoding'):
|
||||
del resp['content-encoding']
|
||||
setattr(feed, 'headers', resp)
|
||||
except gaierror:
|
||||
log.error("Fail to resolve server name %s via %d",
|
||||
uri, thread_index)
|
||||
except BadStatusLine:
|
||||
log.error("Bad Status Line received for %s via %d",
|
||||
uri, thread_index)
|
||||
except error, e:
|
||||
if e.__class__.__name__.lower()=='timeout':
|
||||
feed.headers['status'] = '408'
|
||||
log.warn("Timeout in thread-%d", thread_index)
|
||||
else:
|
||||
log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
|
||||
except Exception, e:
|
||||
import sys, traceback
|
||||
type, value, tb = sys.exc_info()
|
||||
log.error('Error processing %s', uri)
|
||||
for line in (traceback.format_exception_only(type, value) +
|
||||
traceback.format_tb(tb)):
|
||||
log.error(line.rstrip())
|
||||
continue
|
||||
|
||||
output_queue.put(block=True, item=(uri, feed_info, feed))
|
||||
uri, feed_info = input_queue.get(block=True)
|
||||
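The shape of the threaded spider, reduced to a runnable sketch (not part of the diff): spiderPlanet fills a fetch queue, each httpThread loops until it receives the (None, None) sentinel, and results flow back through a second queue.
from Queue import Queue
from threading import Thread

def worker(inq, outq):
    uri, info = inq.get(block=True)
    while uri:
        outq.put((uri, 'fetched'))           # stand-in for the real HTTP fetch
        uri, info = inq.get(block=True)

inq, outq = Queue(), Queue()
t = Thread(target=worker, args=(inq, outq))
t.start()
for uri in ['http://example.com/a', 'http://example.com/b']:
    inq.put((uri, None))
inq.put((None, None))                        # sentinel: tells the worker to exit
t.join()
while outq.qsize():
    print outq.get(False)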
|
||||
def spiderPlanet(only_if_new = False):
|
||||
""" Spider (fetch) an entire planet """
|
||||
# log = planet.getLogger(config.log_level(),config.log_format())
|
||||
log = planet.getLogger(config.log_level(),config.log_format())
|
||||
planet.setTimeout(config.feed_timeout())
|
||||
|
||||
global index
|
||||
index = True
|
||||
|
||||
for feed in config.subscriptions():
|
||||
timeout = config.feed_timeout()
|
||||
try:
|
||||
socket.setdefaulttimeout(float(timeout))
|
||||
log.info("Socket timeout set to %d seconds", timeout)
|
||||
except:
|
||||
try:
|
||||
spiderFeed(feed, only_if_new=only_if_new)
|
||||
except Exception,e:
|
||||
import sys, traceback
|
||||
type, value, tb = sys.exc_info()
|
||||
log.error('Error processing %s', feed)
|
||||
for line in (traceback.format_exception_only(type, value) +
|
||||
traceback.format_tb(tb)):
|
||||
log.error(line.rstrip())
|
||||
from planet import timeoutsocket
|
||||
timeoutsocket.setDefaultSocketTimeout(float(timeout))
|
||||
log.info("Socket timeout set to %d seconds", timeout)
|
||||
except:
|
||||
log.warning("Timeout set to invalid value '%s', skipping", timeout)
|
||||
|
||||
from Queue import Queue
|
||||
from threading import Thread
|
||||
|
||||
fetch_queue = Queue()
|
||||
parse_queue = Queue()
|
||||
|
||||
threads = {}
|
||||
http_cache = config.http_cache_directory()
|
||||
# Should this be done in config?
|
||||
if http_cache and not os.path.exists(http_cache):
|
||||
os.makedirs(http_cache)
|
||||
|
||||
|
||||
if int(config.spider_threads()):
|
||||
# Start all the worker threads
|
||||
for i in range(int(config.spider_threads())):
|
||||
threads[i] = Thread(target=httpThread,
|
||||
args=(i,fetch_queue, parse_queue, log))
|
||||
threads[i].start()
|
||||
else:
|
||||
log.info("Building work queue")
|
||||
|
||||
# Load the fetch and parse work queues
|
||||
for uri in config.subscriptions():
|
||||
# read cached feed info
|
||||
sources = config.cache_sources_directory()
|
||||
feed_source = filename(sources, uri)
|
||||
feed_info = feedparser.parse(feed_source)
|
||||
|
||||
if feed_info.feed and only_if_new:
|
||||
log.info("Feed %s already in cache", uri)
|
||||
continue
|
||||
if feed_info.feed.get('planet_http_status',None) == '410':
|
||||
log.info("Feed %s gone", uri)
|
||||
continue
|
||||
|
||||
if threads and _is_http_uri(uri):
|
||||
fetch_queue.put(item=(uri, feed_info))
|
||||
else:
|
||||
parse_queue.put(item=(uri, feed_info, uri))
|
||||
|
||||
# Mark the end of the fetch queue
|
||||
for thread in threads.keys():
|
||||
fetch_queue.put(item=(None, None))
|
||||
|
||||
# Process the results as they arrive
|
||||
while fetch_queue.qsize() or parse_queue.qsize() or threads:
|
||||
while parse_queue.qsize() == 0 and threads:
|
||||
time.sleep(0.1)
|
||||
while parse_queue.qsize():
|
||||
(uri, feed_info, feed) = parse_queue.get(False)
|
||||
try:
|
||||
|
||||
if not hasattr(feed,'headers') or int(feed.headers.status)<300:
|
||||
options = {}
|
||||
if hasattr(feed_info,'feed'):
|
||||
options['etag'] = \
|
||||
feed_info.feed.get('planet_http_etag',None)
|
||||
try:
|
||||
modified=time.strptime(
|
||||
feed_info.feed.get('planet_http_last_modified',
|
||||
None))
|
||||
except:
|
||||
pass
|
||||
|
||||
data = feedparser.parse(feed, **options)
|
||||
else:
|
||||
data = feedparser.FeedParserDict({'version': None,
|
||||
'headers': feed.headers, 'entries': [], 'feed': {},
|
||||
'bozo': 0, 'status': int(feed.headers.status)})
|
||||
|
||||
writeCache(uri, feed_info, data)
|
||||
|
||||
except Exception, e:
|
||||
import sys, traceback
|
||||
type, value, tb = sys.exc_info()
|
||||
log.error('Error processing %s', uri)
|
||||
for line in (traceback.format_exception_only(type, value) +
|
||||
traceback.format_tb(tb)):
|
||||
log.error(line.rstrip())
|
||||
|
||||
for index in threads.keys():
|
||||
if not threads[index].isAlive():
|
||||
del threads[index]
|
||||
if not threads:
|
||||
log.info("Finished threaded part of processing.")
|
||||
|
@ -68,8 +68,8 @@ def splice():
|
||||
# insert entry information
|
||||
items = 0
|
||||
for mtime,file in dir:
|
||||
if index:
|
||||
base = file.split('/')[-1]
|
||||
if index != None:
|
||||
base = os.path.basename(file)
|
||||
if index.has_key(base) and index[base] not in sub_ids: continue
|
||||
|
||||
try:
|
||||
@ -81,7 +81,9 @@ def splice():
|
||||
if sources:
|
||||
ids = sources[0].getElementsByTagName('id')
|
||||
if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
|
||||
continue
|
||||
ids = sources[0].getElementsByTagName('planet:id')
|
||||
if not ids: continue
|
||||
if ids[0].childNodes[0].nodeValue not in sub_ids: continue
|
||||
|
||||
# add entry to feed
|
||||
feed.appendChild(entry.documentElement)
|
||||
|
@ -4,6 +4,7 @@ link = http://example.com/
|
||||
template_files = index.html.tmpl atom.xml.tmpl
|
||||
items_per_page = 50
|
||||
filters = foo
|
||||
feed_timeout=30
|
||||
|
||||
[index.html.tmpl]
|
||||
days_per_page = 7
|
||||
|
@ -1,6 +1,6 @@
|
||||
<!--
|
||||
Description: illegal control character
|
||||
Expect: content[0].value == u'Page 1<acronym title="U+000c">\ufffd</acronym>Page 2'
|
||||
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
||||
-->
|
||||
|
||||
<feed xmns="http://www.w3.org/2005/Atom">
|
||||
|
11
tests/data/reconstitute/planet_name.xml
Normal file
@ -0,0 +1,11 @@
|
||||
<!--
|
||||
Description: planet name
|
||||
Expect: source.planet_name == 'John Doe'
|
||||
-->
|
||||
|
||||
<feed xmlns="http://www.w3.org/2005/Atom"
|
||||
xmlns:planet="http://planet.intertwingly.net/">
|
||||
<planet:name>John Doe</planet:name>
|
||||
<entry/>
|
||||
</feed>
|
||||
|
15
tests/data/reconstitute/planet_name_source.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<!--
|
||||
Description: ensure that planet attributes make it into the source
|
||||
Expect: source.planet_name == 'John Doe'
|
||||
-->
|
||||
|
||||
<feed xmlns="http://www.w3.org/2005/Atom"
|
||||
xmlns:planet="http://planet.intertwingly.net/">
|
||||
<planet:name>John Doe</planet:name>
|
||||
<entry>
|
||||
<source>
|
||||
<id>http://example.com/</id>
|
||||
</source>
|
||||
</entry>
|
||||
</feed>
|
||||
|
14
tests/data/reconstitute/source_planet_id.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<!--
|
||||
Description: source id
|
||||
Expect: source.planet_id == 'http://example.com/'
|
||||
-->
|
||||
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>http://example.com/</id>
|
||||
<entry>
|
||||
<source>
|
||||
<id>http://example.org/</id>
|
||||
</source>
|
||||
</entry>
|
||||
</feed>
|
||||
|
19
tests/data/spider/threaded.ini
Normal file
@ -0,0 +1,19 @@
|
||||
[Planet]
|
||||
name = test planet
|
||||
cache_directory = tests/work/spider/cache
|
||||
spider_threads = 2
|
||||
|
||||
# for testing purposes, must equal port number below
|
||||
test_port = 8098
|
||||
|
||||
[http://127.0.0.1:8098/tests/data/spider/testfeed0.atom]
|
||||
name = not found
|
||||
|
||||
[http://127.0.0.1:8098/tests/data/spider/testfeed1b.atom]
|
||||
name = one
|
||||
|
||||
[http://127.0.0.1:8098/tests/data/spider/testfeed2.atom]
|
||||
name = two
|
||||
|
||||
[http://127.0.0.1:8098/tests/data/spider/testfeed3.rss]
|
||||
name = three
|
@ -58,3 +58,11 @@ class ConfigTest(unittest.TestCase):
|
||||
def test_filters(self):
|
||||
self.assertEqual(['foo','bar'], config.filters('feed2'))
|
||||
self.assertEqual(['foo'], config.filters('feed1'))
|
||||
|
||||
# ints
|
||||
|
||||
def test_timeout(self):
|
||||
self.assertEqual(30,
|
||||
config.feed_timeout())
|
||||
|
||||
|
||||
|
@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import unittest, StringIO
|
||||
from planet.spider import scrub
|
||||
import unittest, StringIO, time
|
||||
from copy import deepcopy
|
||||
from planet.scrub import scrub
|
||||
from planet import feedparser, config
|
||||
|
||||
feed = '''
|
||||
@ -10,7 +11,7 @@ feed = '''
|
||||
<entry xml:lang="en">
|
||||
<id>ignoreme</id>
|
||||
<author><name>F&ouml;o</name></author>
|
||||
<updated>2000-01-01T00:00:00Z</updated>
|
||||
<updated>%d-12-31T23:59:59Z</updated>
|
||||
<title>F&ouml;o</title>
|
||||
<summary>F&ouml;o</summary>
|
||||
<content>F&ouml;o</content>
|
||||
@ -19,11 +20,10 @@ feed = '''
|
||||
</source>
|
||||
</entry>
|
||||
</feed>
|
||||
'''
|
||||
''' % (time.gmtime()[0] + 1)
|
||||
|
||||
configData = '''
|
||||
[testfeed]
|
||||
ignore_in_feed = id updated xml:lang
|
||||
name_type = html
|
||||
title_type = html
|
||||
summary_type = html
|
||||
@ -32,16 +32,17 @@ content_type = html
|
||||
|
||||
class ScrubTest(unittest.TestCase):
|
||||
|
||||
def test_scrub(self):
|
||||
data = feedparser.parse(feed)
|
||||
def test_scrub_ignore(self):
|
||||
base = feedparser.parse(feed)
|
||||
|
||||
self.assertTrue(base.entries[0].has_key('id'))
|
||||
self.assertTrue(base.entries[0].has_key('updated'))
|
||||
self.assertTrue(base.entries[0].has_key('updated_parsed'))
|
||||
self.assertTrue(base.entries[0].summary_detail.has_key('language'))
|
||||
|
||||
config.parser.readfp(StringIO.StringIO(configData))
|
||||
|
||||
self.assertEqual('Föo', data.feed.author_detail.name)
|
||||
self.assertTrue(data.entries[0].has_key('id'))
|
||||
self.assertTrue(data.entries[0].has_key('updated'))
|
||||
self.assertTrue(data.entries[0].has_key('updated_parsed'))
|
||||
self.assertTrue(data.entries[0].summary_detail.has_key('language'))
|
||||
|
||||
config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
|
||||
self.assertFalse(data.entries[0].has_key('id'))
|
||||
@ -49,6 +50,15 @@ class ScrubTest(unittest.TestCase):
|
||||
self.assertFalse(data.entries[0].has_key('updated_parsed'))
|
||||
self.assertFalse(data.entries[0].summary_detail.has_key('language'))
|
||||
|
||||
def test_scrub_type(self):
|
||||
base = feedparser.parse(feed)
|
||||
|
||||
self.assertEqual('Föo', base.feed.author_detail.name)
|
||||
|
||||
config.parser.readfp(StringIO.StringIO(configData))
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
|
||||
self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
|
||||
self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
|
||||
self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
|
||||
@ -57,3 +67,18 @@ class ScrubTest(unittest.TestCase):
|
||||
self.assertEqual('text/html', data.entries[0].summary_detail.type)
|
||||
self.assertEqual('text/html', data.entries[0].content[0].type)
|
||||
|
||||
def test_scrub_future(self):
|
||||
base = feedparser.parse(feed)
|
||||
self.assertEqual(1, len(base.entries))
|
||||
self.assertTrue(base.entries[0].has_key('updated'))
|
||||
|
||||
config.parser.readfp(StringIO.StringIO(configData))
|
||||
config.parser.set('testfeed', 'future_dates', 'ignore_date')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertFalse(data.entries[0].has_key('updated'))
|
||||
|
||||
config.parser.set('testfeed', 'future_dates', 'ignore_entry')
|
||||
data = deepcopy(base)
|
||||
scrub('testfeed', data)
|
||||
self.assertEqual(0, len(data.entries))
|
||||
|
@ -1,7 +1,7 @@
#!/usr/bin/env python

import unittest, os, glob, calendar, shutil
from planet.spider import filename, spiderFeed, spiderPlanet
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderPlanet, writeCache
from planet import feedparser, config
import planet

@ -43,9 +43,12 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

    def test_spiderFeed(self):
        config.load(configfile)
        spiderFeed(testfeed % '1b')
    def spiderFeed(self, feed_uri):
        feed_info = feedparser.parse('<feed/>')
        data = feedparser.parse(feed_uri)
        writeCache(feed_uri, feed_info, data)

    def verify_spiderFeed(self):
        files = glob.glob(workdir+"/*")
        files.sort()

@ -61,20 +64,26 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(['application/atom+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('one', data.entries[0].source.planet_name)
        self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))

    def test_spiderUpdate(self):
        spiderFeed(testfeed % '1a')
        self.test_spiderFeed()

    def test_spiderPlanet(self):
    def test_spiderFeed(self):
        config.load(configfile)
        spiderPlanet()
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def test_spiderUpdate(self):
        config.load(configfile)
        self.spiderFeed(testfeed % '1a')
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

        # verify that exactly eight files + 1 source dir were produced
        self.assertEqual(13, len(files))
        self.assertEqual(14, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
@ -87,4 +96,50 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(['application/rss+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)
        self.assertEqual('three', data.entries[0].source['planet_css-id'])

    def test_spiderPlanet(self):
        config.load(configfile)
        spiderPlanet()
        self.verify_spiderPlanet()

    def test_spiderThreads(self):
        config.load(configfile.replace('config','threaded'))
        _PORT = config.parser.getint('Planet','test_port')

        log = []
        from SimpleHTTPServer import SimpleHTTPRequestHandler
        class TestRequestHandler(SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                log.append(args)

        from threading import Thread
        class TestServerThread(Thread):
            def __init__(self):
                self.ready = 0
                self.done = 0
                Thread.__init__(self)
            def run(self):
                from BaseHTTPServer import HTTPServer
                httpd = HTTPServer(('',_PORT), TestRequestHandler)
                self.ready = 1
                while not self.done:
                    httpd.handle_request()

        httpd = TestServerThread()
        httpd.start()
        while not httpd.ready:
            time.sleep(0.1)

        try:
            spiderPlanet()
        finally:
            httpd.done = 1
            import urllib
            urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()

        status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
        status.sort()
        self.assertEqual([200,200,200,200,404], status)

        self.verify_spiderPlanet()
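
(For context: test_spiderThreads loads a "threaded" variant of the test config and
reads Planet/test_port from it. That config file is not shown in this diff, so the
settings below are a guessed sketch for illustration only.)

    from planet import config

    # Assumed settings the threaded test would rely on; names and values are
    # not taken from the diff.
    config.parser.add_section('Planet')
    config.parser.set('Planet', 'spider_threads', '2')
    config.parser.set('Planet', 'test_port', '8098')

    print config.parser.getint('Planet', 'test_port')   # read exactly as the test does
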
@ -30,6 +30,10 @@ a:active {
a:focus {
}

a.active {
  font-weight: bold;
}

h1 {
  font-size: x-large;
  text-transform: uppercase;
@ -97,6 +101,33 @@ h1 {
  text-decoration: underline;
}

#sidebar ul li a:visited {
  color: #000;
}

#sidebar ul li ul {
  display: none;
}

#sidebar ul li {
  position: relative;
}

#sidebar ul li:hover ul {
  background-color: #EEE;
  border: 2px solid #BBB;
  color:#000;
  display: block;
  margin-left: -300px;
  margin-right: 115px;
  padding: 10px;
  padding-left: 25px;
  position: absolute;
  right: 80px;
  top: -12px;
  z-index: 1;
}

#sidebar img {
  border: 0;
}
@ -135,15 +166,19 @@ h1 {
/* ---------------------------- Footer --------------------------- */

#footer ul {
  margin: 0 20px 0 -25px;
  padding: 0;
  margin: 0 20px 0 -25px;
  padding: 0;
}

#footer li {
  margin: 0;
  padding: 0;
  list-style: none;
  display: inline;
  margin: 0;
  padding: 0;
  list-style: none;
  display: inline;
}

#footer img {
  display: none;
}

/* ----------------------------- Body ---------------------------- */
@ -262,9 +297,9 @@ img.icon {
}

.news code {
  font-family: monospace;
  font-size: medium;
  font-weight: bold;
  font-family: monospace;
  font-size: medium;
  font-weight: bold;
}

.news .content a {
@ -404,11 +439,6 @@ img.floatright {
  background-color: #f8f8f8;
}

/* GigaOM */
p img {
  float: left;
}

/* Tantek */
ul.tags,ul.tags li,h4.tags {
  display:inline;
@ -2,9 +2,14 @@
    xmlns:atom="http://www.w3.org/2005/Atom"
    xmlns:xhtml="http://www.w3.org/1999/xhtml"
    xmlns:planet="http://planet.intertwingly.net/"
    xmlns="http://www.w3.org/1999/xhtml">
    xmlns="http://www.w3.org/1999/xhtml"
    exclude-result-prefixes="atom planet xhtml">

  <xsl:output method="xml" omit-xml-declaration="yes"/>

  <xsl:template match="atom:feed">
    <xsl:text disable-output-escaping="yes">&lt;!DOCTYPE html&gt;</xsl:text>
    <xsl:text> </xsl:text>
    <html xmlns="http://www.w3.org/1999/xhtml">

      <!-- head -->
@ -20,7 +25,7 @@
      </xsl:if>
      <link rel="shortcut icon" href="/favicon.ico" />
      <script type="text/javascript" src="personalize.js">
        <xsl:comment>HTML Compatibility</xsl:comment>
        <xsl:comment><!--HTML Compatibility--></xsl:comment>
      </script>
      </head>

@ -29,58 +34,16 @@
      <xsl:text> </xsl:text>
      <h1><xsl:value-of select="atom:title"/></h1>

      <xsl:text> </xsl:text>
      <xsl:text> </xsl:text>
      <div id="body">
        <xsl:apply-templates select="atom:entry"/>
        <xsl:text> </xsl:text>
      </div>

      <h1>Subscriptions </h1>
      <xsl:text> </xsl:text>

      <div id="sidebar">

        <xsl:text> </xsl:text>
        <h2>Subscriptions</h2>
        <xsl:text> </xsl:text>
        <ul>
          <xsl:for-each select="planet:source">
            <xsl:sort select="planet:name"/>
            <xsl:text> </xsl:text>
            <li>
              <!-- icon -->
              <a title="subscribe">
                <xsl:choose>
                  <xsl:when test="planet:http_location">
                    <xsl:attribute name="href">
                      <xsl:value-of select="planet:http_location"/>
                    </xsl:attribute>
                  </xsl:when>
                  <xsl:when test="atom:link[@rel='self']/@href">
                    <xsl:attribute name="href">
                      <xsl:value-of select="atom:link[@rel='self']/@href"/>
                    </xsl:attribute>
                  </xsl:when>
                </xsl:choose>
                <img src="images/feed-icon-10x10.png" alt="(feed)"/>
              </a>
              <xsl:text> </xsl:text>

              <!-- name -->
              <a href="{atom:link[@rel='alternate']/@href}">
                <xsl:choose>
                  <xsl:when test="planet:message">
                    <xsl:attribute name="class">message</xsl:attribute>
                    <xsl:attribute name="title">
                      <xsl:value-of select="planet:message"/>
                    </xsl:attribute>
                  </xsl:when>
                  <xsl:when test="atom:title">
                    <xsl:attribute name="title">
                      <xsl:value-of select="atom:title"/>
                    </xsl:attribute>
                  </xsl:when>
                </xsl:choose>
                <xsl:value-of select="planet:name"/>
              </a>
            </li>
          </xsl:for-each>
          <xsl:text> </xsl:text>
        </ul>

        <xsl:text> </xsl:text>
        <h2>Info</h2>

        <dl>
@ -113,14 +76,84 @@
          </ul>
        </dd>
      </dl>

      </div>

      <xsl:text> </xsl:text>
      <div id="body">
        <xsl:apply-templates select="atom:entry"/>
        <xsl:text> </xsl:text>
        <div id="footer">
          <ul>
            <xsl:for-each select="planet:source">
              <xsl:sort select="planet:name"/>
              <xsl:variable name="id" select="atom:id"/>
              <xsl:variable name="posts"
                select="/atom:feed/atom:entry[atom:source/atom:id = $id]"/>
              <xsl:text> </xsl:text>
              <li>
                <!-- icon -->
                <a title="subscribe">
                  <xsl:choose>
                    <xsl:when test="planet:http_location">
                      <xsl:attribute name="href">
                        <xsl:value-of select="planet:http_location"/>
                      </xsl:attribute>
                    </xsl:when>
                    <xsl:when test="atom:link[@rel='self']/@href">
                      <xsl:attribute name="href">
                        <xsl:value-of select="atom:link[@rel='self']/@href"/>
                      </xsl:attribute>
                    </xsl:when>
                  </xsl:choose>
                  <img src="images/feed-icon-10x10.png" alt="(feed)"/>
                </a>
                <xsl:text> </xsl:text>

                <!-- name -->
                <a href="{atom:link[@rel='alternate']/@href}">
                  <xsl:choose>
                    <xsl:when test="planet:message">
                      <xsl:attribute name="class">
                        <xsl:if test="$posts">active message</xsl:if>
                        <xsl:if test="not($posts)">message</xsl:if>
                      </xsl:attribute>
                      <xsl:attribute name="title">
                        <xsl:value-of select="planet:message"/>
                      </xsl:attribute>
                    </xsl:when>
                    <xsl:when test="atom:title">
                      <xsl:attribute name="title">
                        <xsl:value-of select="atom:title"/>
                      </xsl:attribute>
                      <xsl:if test="$posts">
                        <xsl:attribute name="class">active</xsl:attribute>
                      </xsl:if>
                    </xsl:when>
                  </xsl:choose>
                  <xsl:value-of select="planet:name"/>
                </a>

                <xsl:if test="$posts">
                  <ul>
                    <xsl:for-each select="$posts">
                      <xsl:if test="string-length(atom:title) > 0">
                        <li>
                          <a href="{atom:link[@rel='alternate']/@href}">
                            <xsl:if test="atom:title/@xml:lang != @xml:lang">
                              <xsl:attribute name="xml:lang"
                                select="{atom:title/@xml:lang}"/>
                            </xsl:if>
                            <xsl:value-of select="atom:title"/>
                          </a>
                        </li>
                      </xsl:if>
                    </xsl:for-each>
                  </ul>
                </xsl:if>
              </li>
            </xsl:for-each>
            <xsl:text> </xsl:text>
          </ul>
        </div>

        <xsl:text> </xsl:text>
      </body>
    </html>
  </xsl:template>
@ -139,7 +172,7 @@
    </xsl:if>

    <xsl:text> </xsl:text>
    <div class="news">
    <div class="news {atom:source/planet:css-id}">

      <xsl:if test="@xml:lang">
        <xsl:attribute name="xml:lang">
@ -18,6 +18,7 @@ function stopPropagation(event) {
// scroll back to the previous article
function prevArticle(event) {
  for (var i=entries.length; --i>=0;) {
    if (!entries[i].anchor) continue;
    if (entries[i].anchor.offsetTop < document.documentElement.scrollTop) {
      window.location.hash=entries[i].anchor.id;
      stopPropagation(event);
@ -29,6 +30,7 @@ function prevArticle(event) {
// advance to the next article
function nextArticle(event) {
  for (var i=1; i<entries.length; i++) {
    if (!entries[i].anchor) continue;
    if (entries[i].anchor.offsetTop-20 > document.documentElement.scrollTop) {
      window.location.hash=entries[i].anchor.id;
      stopPropagation(event);
@ -84,17 +86,20 @@ function selectOption() {

// add navkeys option to sidebar
function addOption(event) {
  if (entries.length > 1 && entries[entries.length-1].parent.offsetTop > 0) {
    var sidebar = document.getElementById('sidebar');
    if (!sidebar) return;
  var sidebar = document.getElementById('sidebar');
  if (!sidebar) return;

    for (var i=entries.length; --i>=0;) {
  var h2 = null;
  for (var i=entries.length; --i>=0;) {
    if (entries[i].parent.offsetTop > 0) {
      var a = entries[i].anchor = document.createElement('a');
      a.id = "news-" + i;
      entries[i].parent.insertBefore(a, entries[i].parent.firstChild);
      if (h2 == null) h2 = document.createElement('h2');
    }
  }

  var h2 = document.createElement('h2');
  if (h2 != null) {
    h2.appendChild(document.createTextNode('Options'));
    sidebar.appendChild(h2);

@ -159,7 +164,8 @@ function findEntries() {
    var date = localizeDate(span[i]);

    var parent = span[i];
    while (parent && parent.className != 'news') {
    while (parent &&
      (!parent.className || parent.className.split(' ')[0] != 'news')) {
      parent = parent.parentNode;
    }

@ -202,8 +208,49 @@ function moveDateHeaders() {
  }
}

function moveSidebar() {
  var sidebar = document.getElementById('sidebar');
  if (sidebar.currentStyle && sidebar.currentStyle['float'] == 'none') return;
  if (window.getComputedStyle && document.defaultView.getComputedStyle(sidebar,null).getPropertyValue('float') == 'none') return;

  var h1 = sidebar.previousSibling;
  while (h1.nodeType != 1) h1=h1.previousSibling;
  h1.parentNode.removeChild(h1);
  var footer = document.getElementById('footer');
  var ul = footer.firstChild;
  while (ul.nodeType != 1) ul=ul.nextSibling;
  footer.removeChild(ul);
  sidebar.insertBefore(ul, sidebar.firstChild);
  var h2 = document.createElement('h2');
  h2.appendChild(h1.firstChild);
  var twisty = document.createElement('a');
  twisty.appendChild(document.createTextNode('\u25bc'));
  twisty.title = 'hide';
  twisty.onclick = function() {
    var display = 'block';
    if (this.childNodes[0].nodeValue == '\u25ba') {
      this.title = 'hide';
      this.childNodes[0].nodeValue = '\u25bc';
    } else {
      this.title = 'show';
      this.childNodes[0].nodeValue = '\u25ba';
      display = 'none';
    }
    ul.style.display = display;
    createCookie("subscriptions", display, 365);
  }
  var cookie = readCookie("subscriptions");
  if (cookie && cookie == 'none') twisty.onclick();
  h2.appendChild(twisty);
  sidebar.insertBefore(h2, sidebar.firstChild);
  var body = document.getElementById('body');
  sidebar.parentNode.removeChild(sidebar);
  body.parentNode.insertBefore(sidebar, body);
}

// adjust dates to local time zones, optionally provide navigation keys
function personalize() {
  moveSidebar();
  findEntries();
  addOption();
  moveDateHeaders();
@ -74,7 +74,7 @@
    <xsl:text> </xsl:text>
    <tr>
      <xsl:if test="planet:bozo='true'">
        <xsl:attribute name="bgcolor">#FCC</xsl:attribute>
        <xsl:attribute name="style">background-color:#FCC</xsl:attribute>
      </xsl:if>
      <td>
        <a title="feed validator">