filters/xhtml2html =~ s/Genshi/html5lib/
This commit is contained in:
parent
1fcfbe35c0
commit
4b1e0da922
@ -84,8 +84,8 @@ then the output stream is
|
||||
through the specified filter and the output is placed into the named file; the
|
||||
other unmodified branch continues onto the next filter, if any.
|
||||
One use case for this function is to use
|
||||
<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
|
||||
an HTML output stream from one source.</li>
|
||||
<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
|
||||
and an HTML output stream from one source.</li>
|
||||
|
||||
<li>Templates written using htmltmpl or django currently only have access to a
|
||||
fixed set of fields, whereas XSLT and genshi templates have access to
|
||||
|
21
filters/xhtml2html.plugin
Normal file
21
filters/xhtml2html.plugin
Normal file
@ -0,0 +1,21 @@
|
||||
# Example usages:
|
||||
#
|
||||
# filters:
|
||||
# xhtml2html.plugin?quote_attr_values=True&quote_char="'"
|
||||
#
|
||||
# -- or --
|
||||
#
|
||||
# [xhtml2html.plugin]
|
||||
# quote_attr_values=True
|
||||
# quote_char="'"
|
||||
|
||||
import sys
|
||||
opts = zip(sys.argv[1::2],sys.argv[2::2])
|
||||
opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
|
||||
|
||||
from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
|
||||
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
|
||||
tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
|
||||
serializer = serializer.HTMLSerializer(**dict(opts))
|
||||
for text in serializer.serialize(tokens, encoding='utf-8'):
|
||||
sys.stdout.write(text)
|
@ -1,5 +0,0 @@
|
||||
import sys
|
||||
from genshi.input import XMLParser
|
||||
from genshi.output import HTMLSerializer
|
||||
|
||||
print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
|
35
planet/vendor/html5lib/html5parser.py
vendored
35
planet/vendor/html5lib/html5parser.py
vendored
@ -1,16 +1,10 @@
|
||||
|
||||
# Differences from the current specification (23 December 2006) are as follows:
|
||||
# * Phases and insertion modes are one concept in parser.py.
|
||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||
# always exist.
|
||||
# * </br> creates a <br> element.
|
||||
#
|
||||
# We haven't updated DOCTYPE handling yet
|
||||
#
|
||||
# It should be trivial to add the following cases. However, we should probably
|
||||
# also look into comment handling and such then...
|
||||
# * A <p> element end tag creates an empty <p> element when there's no <p>
|
||||
# element in scope.
|
||||
|
||||
|
||||
try:
|
||||
frozenset
|
||||
@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
(("html", "head", "body", "br"), self.endTagImplyHead)
|
||||
(("html", "head", "body", "br", "p"), self.endTagImplyHead)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
@ -530,7 +524,7 @@ class InHeadPhase(Phase):
|
||||
|
||||
self. endTagHandler = utils.MethodDispatcher([
|
||||
("head", self.endTagHead),
|
||||
(("html", "body", "br"), self.endTagImplyAfterHead),
|
||||
(("html", "body", "br", "p"), self.endTagImplyAfterHead),
|
||||
(("title", "style", "script"), self.endTagTitleStyleScript)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
@ -994,9 +988,13 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope("p"):
|
||||
self.tree.generateImpliedEndTags("p")
|
||||
if self.tree.openElements[-1].name != "p":
|
||||
self.parser.parseError("Unexpected end tag (p).")
|
||||
while self.tree.elementInScope("p"):
|
||||
self.tree.openElements.pop()
|
||||
self.parser.parseError(_("Unexpected end tag (p)."))
|
||||
if self.tree.elementInScope("p"):
|
||||
while self.tree.elementInScope("p"):
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.startTagCloseP("p", {})
|
||||
self.endTagP("p")
|
||||
|
||||
def endTagBody(self, name):
|
||||
# XXX Need to take open <p> tags into account here. We shouldn't imply
|
||||
@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
|
||||
if inScope:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
||||
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
if inScope:
|
||||
node = self.tree.openElements.pop()
|
||||
@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
|
||||
node = self.tree.openElements.pop()
|
||||
|
||||
def endTagForm(self, name):
|
||||
self.endTagBlock(name)
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
|
||||
else:
|
||||
self.tree.openElements.pop()
|
||||
self.tree.formPointer = None
|
||||
|
||||
def endTagListItem(self, name):
|
||||
@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags(name)
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
||||
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
|
||||
if self.tree.elementInScope(name):
|
||||
@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
|
||||
self.tree.generateImpliedEndTags()
|
||||
break
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError((u"Unexpected end tag (" + name + "). "
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
|
||||
u"Expected other end tag."))
|
||||
|
||||
for item in headingElements:
|
||||
|
6
planet/vendor/html5lib/inputstream.py
vendored
6
planet/vendor/html5lib/inputstream.py
vendored
@ -53,6 +53,7 @@ class HTMLInputStream(object):
|
||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
||||
|
||||
self.queue = []
|
||||
self.errors = []
|
||||
|
||||
self.line = self.col = 0
|
||||
self.lineLengths = []
|
||||
@ -214,7 +215,10 @@ class HTMLInputStream(object):
|
||||
return EOF
|
||||
|
||||
# Normalize newlines and null characters
|
||||
if c == '\x00': c = u'\uFFFD'
|
||||
if c == '\x00':
|
||||
self.errors.append('null character found in input stream, '
|
||||
'replaced with U+FFFD')
|
||||
c = u'\uFFFD'
|
||||
if c == '\r':
|
||||
c = self.dataStream.read(1, 1)
|
||||
if c != '\n':
|
||||
|
15
planet/vendor/html5lib/liberalxmlparser.py
vendored
15
planet/vendor/html5lib/liberalxmlparser.py
vendored
@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
save = self.tokenizer.contentModelFlag
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
self.tokenizer.contentModelFlag = save
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["initial"] = XmlInitialPhase(self, self.tree)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
self.tree.document.appendChild(element)
|
||||
self.parser.phase = self.parser.phases["beforeHead"]
|
||||
|
||||
class XmlInitialPhase(html5parser.InitialPhase):
|
||||
""" Consume XML Prologs """
|
||||
def processComment(self, data):
|
||||
if not data.startswith('?xml') or not data.endswith('?'):
|
||||
html5parser.InitialPhase.processComment(self, data)
|
||||
|
||||
class XmlRootPhase(html5parser.Phase):
|
||||
""" Consume XML Prologs """
|
||||
def processComment(self, data):
|
||||
print repr(data)
|
||||
if not data.startswith('?xml') or not data.endswith('?'):
|
||||
html5parser.InitialPhase.processComment(self, data)
|
||||
|
||||
""" Prime the Xml parser """
|
||||
def __getattr__(self, name):
|
||||
self.tree.openElements.append(self.tree.document)
|
||||
|
69
planet/vendor/html5lib/sanitizer.py
vendored
69
planet/vendor/html5lib/sanitizer.py
vendored
@ -2,7 +2,7 @@ import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
from tokenizer import HTMLTokenizer
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer):
|
||||
class HTMLSanitizerMixin:
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||
if token["name"] in self.allowed_elements:
|
||||
if token.has_key("data"):
|
||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if not attrs.has_key(attr): continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||
del attrs[attr]
|
||||
if attrs.has_key('style'):
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||
yield token
|
||||
else:
|
||||
if token["type"] == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token["type"] == "EmptyTag":
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
token["type"] = "Characters"
|
||||
del token["name"]
|
||||
yield token
|
||||
elif token["type"] == "Comment":
|
||||
pass
|
||||
def sanitize_token(self, token):
|
||||
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
|
||||
if token["name"] in self.allowed_elements:
|
||||
if token.has_key("data"):
|
||||
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if not attrs.has_key(attr): continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
|
||||
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
|
||||
del attrs[attr]
|
||||
if attrs.has_key('style'):
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name,val] for name,val in attrs.items()]
|
||||
return token
|
||||
else:
|
||||
yield token
|
||||
if token["type"] == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token["type"] == "EmptyTag":
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
token["type"] = "Characters"
|
||||
del token["name"]
|
||||
return token
|
||||
elif token["type"] == "Comment":
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token: yield token
|
||||
|
@ -7,10 +7,6 @@ except NameError:
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.filters.whitespace import Filter as WhitespaceFilter
|
||||
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
|
||||
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements
|
||||
|
||||
@ -67,17 +63,16 @@ class HTMLSerializer(object):
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
|
||||
omit_optional_tags = True
|
||||
|
||||
strip_whitespace = False
|
||||
|
||||
inject_meta_charset = True
|
||||
strip_whitespace = False
|
||||
sanitize = False
|
||||
omit_optional_tags = True
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||
"space_before_trailing_solidus", "omit_optional_tags",
|
||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||
"escape_rcdata")
|
||||
"escape_rcdata", 'use_trailing_solidus', "sanitize")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs.has_key('quote_char'):
|
||||
@ -91,13 +86,19 @@ class HTMLSerializer(object):
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
if encoding and self.inject_meta_charset:
|
||||
treewalker = InjectMetaCharsetFilter(treewalker, encoding)
|
||||
from html5lib.filters.inject_meta_charset import Filter
|
||||
treewalker = Filter(treewalker, encoding)
|
||||
# XXX: WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiency of this latter filter
|
||||
if self.strip_whitespace:
|
||||
treewalker = WhitespaceFilter(treewalker)
|
||||
from html5lib.filters.whitespace import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.sanitize:
|
||||
from html5lib.filters.sanitizer import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
treewalker = OptionalTagFilter(treewalker)
|
||||
from html5lib.filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
for token in treewalker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
|
13
planet/vendor/html5lib/tokenizer.py
vendored
13
planet/vendor/html5lib/tokenizer.py
vendored
@ -93,6 +93,8 @@ class HTMLTokenizer(object):
|
||||
# Start processing. When EOF is reached self.state will return False
|
||||
# instead of True and the loop will terminate.
|
||||
while self.state():
|
||||
while self.stream.errors:
|
||||
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
|
||||
while self.tokenQueue:
|
||||
yield self.tokenQueue.pop(0)
|
||||
|
||||
@ -130,7 +132,6 @@ class HTMLTokenizer(object):
|
||||
allowed = hexDigits
|
||||
radix = 16
|
||||
|
||||
char = u"\uFFFD"
|
||||
charStack = []
|
||||
|
||||
# Consume all the characters that are in range while making sure we
|
||||
@ -155,8 +156,8 @@ class HTMLTokenizer(object):
|
||||
|
||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||
|
||||
# 0 is not a good number, neither are illegal Unicode code points.
|
||||
if charAsInt > 0 and charAsInt <= 1114111:
|
||||
# 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
|
||||
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
|
||||
try:
|
||||
# XXX We should have a separate function that does "int" to
|
||||
# "unicodestring" conversion since this doesn't always work
|
||||
@ -167,7 +168,11 @@ class HTMLTokenizer(object):
|
||||
char = eval("u'\\U%08x'" % charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity couldn't be converted to character.")})
|
||||
_("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
|
||||
else:
|
||||
char = u"\uFFFD"
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
|
4
planet/vendor/html5lib/treebuilders/dom.py
vendored
4
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||
def hilite(self, encoding):
|
||||
print 'foo'
|
||||
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
|
||||
setattr(self.dom, 'hilite', method)
|
||||
return self
|
||||
|
||||
def insertDoctype(self, name):
|
||||
|
@ -1,5 +1,5 @@
|
||||
[Planet]
|
||||
output_theme = genshi_fancy
|
||||
output_theme = asf
|
||||
output_dir = tests/work/apply
|
||||
name = test planet
|
||||
cache_directory = tests/work/spider/cache
|
||||
@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
|
||||
bill_of_materials:
|
||||
images/#{face}
|
||||
|
||||
[index.html.genshi]
|
||||
[index.html.xslt]
|
||||
filters:
|
||||
xhtml2html.py>index.html4
|
||||
xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4
|
||||
|
||||
[tests/data/spider/testfeed0.atom]
|
||||
name = not found
|
||||
|
@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
|
||||
def tearDown(self):
|
||||
shutil.rmtree(os.path.split(workdir)[0])
|
||||
|
||||
def test_apply_asf(self):
|
||||
config.load(configfile % 'asf')
|
||||
def apply_asf(self):
|
||||
splice.apply(self.feeddata)
|
||||
|
||||
# verify that selected files are there
|
||||
@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
|
||||
self.assertEqual(12, content)
|
||||
self.assertEqual(3, lang)
|
||||
|
||||
def test_apply_asf(self):
|
||||
config.load(configfile % 'asf')
|
||||
self.apply_asf()
|
||||
|
||||
def test_apply_classic_fancy(self):
|
||||
config.load(configfile % 'fancy')
|
||||
self.apply_fancy()
|
||||
@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
|
||||
|
||||
def test_apply_filter_html(self):
|
||||
config.load(configfile % 'html')
|
||||
self.apply_fancy()
|
||||
self.apply_asf()
|
||||
|
||||
output = open(os.path.join(workdir, 'index.html')).read()
|
||||
self.assertTrue(output.find('/>')>=0)
|
||||
@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
|
||||
if method.startswith('test_'): break
|
||||
else:
|
||||
delattr(ApplyTest,'test_apply_genshi_fancy')
|
||||
delattr(ApplyTest,'test_apply_filter_html')
|
||||
|
||||
try:
|
||||
import libxml2
|
||||
|
@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
|
||||
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
|
||||
self.assertTrue(output.find('</script>')>=0)
|
||||
|
||||
def test_xhtml2html_filter(self):
|
||||
testfile = 'tests/data/filter/index.html'
|
||||
filter = 'xhtml2html.py'
|
||||
output = shell.run(filter, open(testfile).read(), mode="filter")
|
||||
self.assertTrue(output.find('/>')<0)
|
||||
self.assertTrue(output.find('</script>')>=0)
|
||||
|
||||
try:
|
||||
import genshi
|
||||
except:
|
||||
|
@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
|
||||
|
||||
self.assertEqual('', output)
|
||||
|
||||
def test_xhtml2html_filter(self):
|
||||
testfile = 'tests/data/filter/index.html'
|
||||
filter = 'xhtml2html.plugin?quote_attr_values=True'
|
||||
output = shell.run(filter, open(testfile).read(), mode="filter")
|
||||
self.assertTrue(output.find('/>')<0)
|
||||
self.assertTrue(output.find('</script>')>=0)
|
||||
|
||||
try:
|
||||
from subprocess import Popen, PIPE
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user