filters/xhtml2html =~ s/Genshi/html5lib/

Sam Ruby 2007-06-27 13:37:00 -04:00
parent 1fcfbe35c0
commit 4b1e0da922
14 changed files with 137 additions and 90 deletions


@@ -84,8 +84,8 @@ then the output stream is
 through the specified filter and the output is placed into the named file; the
 other unmodified branch continues onto the next filter, if any.
 One use case for this function is to use
-<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
-an HTML output stream from one source.</li>
+<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
+and an HTML output stream from one source.</li>
 <li>Templates written using htmltmpl or django currently only have access to a
 fixed set of fields, whereas XSLT and genshi templates have access to
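
To make the branch syntax concrete: appending ">filename" to a filters entry tees the filtered copy into that file while the unfiltered stream flows on to any remaining filters. A sketch modeled on the test config updated later in this commit (the section and file names come from that config):

    [index.html.xslt]
    filters:
        xhtml2html.plugin?quote_attr_values=True>index.html4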

filters/xhtml2html.plugin (new file, 21 lines)

@@ -0,0 +1,21 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = zip(sys.argv[1::2],sys.argv[2::2])
+opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+    sys.stdout.write(text)
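
For reference, the plugin reads XHTML on stdin and writes HTML on stdout: the query-string arguments become HTMLSerializer options, a liberal XML parser builds a DOM tree, and a tree walker feeds the serializer. A minimal sketch of the same pipeline on an in-memory string, assuming the html5lib API bundled here (the sample markup is illustrative):

    import StringIO
    from html5lib import liberalxmlparser, treebuilders, treewalkers
    from html5lib import serializer as html5serializer

    doc = liberalxmlparser.XHTMLParser(
        tree=treebuilders.getTreeBuilder('dom')).parse(
        StringIO.StringIO('<p>hi<br/></p>'))
    walker = treewalkers.getTreeWalker('dom')
    html = ''.join(html5serializer.HTMLSerializer(quote_attr_values=True)
        .serialize(walker(doc), encoding='utf-8'))
    # the XHTML-only '<br/>' comes out as plain '<br>'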


@@ -1,5 +0,0 @@
-import sys
-
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')


@@ -1,16 +1,10 @@
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
 # * </br> creates a <br> element.
 #
 # We haven't updated DOCTYPE handling yet
-#
-# It should be trivial to add the following cases. However, we should probably
-# also look into comment handling and such then...
-# * A <p> element end tag creates an empty <p> element when there's no <p>
-#   element in scope.

 try:
     frozenset
@@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = utils.MethodDispatcher([
-            (("html", "head", "body", "br"), self.endTagImplyHead)
+            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -530,7 +524,7 @@ class InHeadPhase(Phase):
         self. endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
-            (("html", "body", "br"), self.endTagImplyAfterHead),
+            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
             (("title", "style", "script"), self.endTagTitleStyleScript)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -994,9 +988,13 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError("Unexpected end tag (p).")
-        while self.tree.elementInScope("p"):
-            self.tree.openElements.pop()
+            self.parser.parseError(_("Unexpected end tag (p)."))
+        if self.tree.elementInScope("p"):
+            while self.tree.elementInScope("p"):
+                self.tree.openElements.pop()
+        else:
+            self.startTagCloseP("p", {})
+            self.endTagP("p")

     def endTagBody(self, name):
         # XXX Need to take open <p> tags into account here. We shouldn't imply
@@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
         if inScope:
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if inScope:
             node = self.tree.openElements.pop()
@@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
             node = self.tree.openElements.pop()

     def endTagForm(self, name):
-        self.endTagBlock(name)
+        if self.tree.elementInScope(name):
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != name:
+                self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
+            else:
+                self.tree.openElements.pop()
         self.tree.formPointer = None

     def endTagListItem(self, name):
@@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if self.tree.elementInScope(name):
@@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
                 self.tree.generateImpliedEndTags()
                 break
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"Unexpected end tag (" + name + "). "
+            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
               u"Expected other end tag."))
         for item in headingElements:
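
The parser hunks above implement what the deleted header comment still listed as future work: end tags for html, head, body, br (and now p) imply a missing head, and a stray </p> with no <p> element in scope now synthesizes an empty <p> (startTagCloseP followed by endTagP) instead of being ignored. A sketch of the observable behavior, assuming the html5lib bundled with this commit:

    from html5lib import html5parser, treebuilders
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse('x</p>y')
    # the resulting body holds 'x', an empty <p> element, then 'y'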


@@ -53,6 +53,7 @@ class HTMLInputStream(object):
         self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')

         self.queue = []
+        self.errors = []

         self.line = self.col = 0
         self.lineLengths = []
@@ -214,7 +215,10 @@ class HTMLInputStream(object):
             return EOF

         # Normalize newlines and null characters
-        if c == '\x00': c = u'\uFFFD'
+        if c == '\x00':
+            self.errors.append('null character found in input stream, '
+                'replaced with U+FFFD')
+            c = u'\uFFFD'
         if c == '\r':
             c = self.dataStream.read(1, 1)
             if c != '\n':
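
With the new errors list, the input stream records a recoverable problem instead of silently patching it; the tokenizer change below drains this list into ParseError tokens. A standalone restatement of the normalization rule (an illustrative helper, not the library code):

    def normalize_char(c, errors):
        # a NUL byte becomes U+FFFD and the error is queued for the tokenizer
        if c == '\x00':
            errors.append('null character found in input stream, '
                'replaced with U+FFFD')
            c = u'\uFFFD'
        return c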


@@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
             # For EmptyTags, process both a Start and an End tag
             if token["type"] == "EmptyTag":
+                save = self.tokenizer.contentModelFlag
                 self.phase.processStartTag(token["name"], token["data"])
+                self.tokenizer.contentModelFlag = save
                 token["data"] = {}
                 token["type"] = "EndTag"
@@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["initial"] = XmlInitialPhase(self, self.tree)
         self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

     def normalizeToken(self, token):
@@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
         self.tree.document.appendChild(element)
         self.parser.phase = self.parser.phases["beforeHead"]

+class XmlInitialPhase(html5parser.InitialPhase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
 class XmlRootPhase(html5parser.Phase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        print repr(data)
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
     """ Prime the Xml parser """
     def __getattr__(self, name):
         self.tree.openElements.append(self.tree.document)
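
XmlInitialPhase works because the tokenizer reports an XML prolog such as <?xml version="1.0"?> as a bogus comment whose data arrives without the angle brackets, so anything that starts with '?xml' and ends with '?' can be swallowed rather than emitted as a comment node. A standalone restatement of that test (the helper name is hypothetical):

    def is_xml_prolog(data):
        # comment data for <?xml version="1.0"?> is '?xml version="1.0"?'
        return data.startswith('?xml') and data.endswith('?')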


@@ -2,7 +2,7 @@ import re
 from xml.sax.saxutils import escape, unescape
 from tokenizer import HTMLTokenizer

-class HTMLSanitizer(HTMLTokenizer):
+class HTMLSanitizerMixin:
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
     #  => &lt;script> do_nasty_stuff() &lt;/script>
     # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
     #  => <a>Click here for $100</a>
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
-                if token["name"] in self.allowed_elements:
-                    if token.has_key("data"):
-                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
-                        for attr in self.attr_val_is_uri:
-                            if not attrs.has_key(attr): continue
-                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
-                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
-                                del attrs[attr]
-                        if attrs.has_key('style'):
-                            attrs['style'] = self.sanitize_css(attrs['style'])
-                        token["data"] = [[name,val] for name,val in attrs.items()]
-                    yield token
-                else:
-                    if token["type"] == "EndTag":
-                        token["data"] = "</%s>" % token["name"]
-                    elif token["data"]:
-                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                        token["data"] = "<%s%s>" % (token["name"],attrs)
-                    else:
-                        token["data"] = "<%s>" % token["name"]
-                    if token["type"] == "EmptyTag":
-                        token["data"]=token["data"][:-1] + "/>"
-                    token["type"] = "Characters"
-                    del token["name"]
-                    yield token
-            elif token["type"] == "Comment":
-                pass
-            else:
-                yield token
+    def sanitize_token(self, token):
+        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                if token.has_key("data"):
+                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+                    for attr in self.attr_val_is_uri:
+                        if not attrs.has_key(attr): continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
+                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+                            del attrs[attr]
+                    if attrs.has_key('style'):
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token["data"] = [[name,val] for name,val in attrs.items()]
+                return token
+            else:
+                if token["type"] == "EndTag":
+                    token["data"] = "</%s>" % token["name"]
+                elif token["data"]:
+                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+                    token["data"] = "<%s%s>" % (token["name"],attrs)
+                else:
+                    token["data"] = "<%s>" % token["name"]
+                if token["type"] == "EmptyTag":
+                    token["data"]=token["data"][:-1] + "/>"
+                token["type"] = "Characters"
+                del token["name"]
+                return token
+        elif token["type"] == "Comment":
+            pass
+        else:
+            return token
def sanitize_css(self, style):
# disallow urls
@@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
             clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
+
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token: yield token
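
Splitting the logic into HTMLSanitizerMixin lets one sanitize_token implementation back both this tokenizer subclass and a treewalker filter (the html5lib.filters.sanitizer module the serializer below imports lazily). A minimal sketch of such a filter, assuming it mirrors the __iter__ pattern just shown (the class name is hypothetical):

    from html5lib.sanitizer import HTMLSanitizerMixin

    class SanitizingFilter(HTMLSanitizerMixin):
        def __init__(self, source):
            self.source = source  # any token stream, e.g. a treewalker
        def __iter__(self):
            for token in self.source:
                token = self.sanitize_token(token)
                if token: yield token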


@@ -7,10 +7,6 @@ except NameError:
     import gettext
     _ = gettext.gettext

-from html5lib.filters.whitespace import Filter as WhitespaceFilter
-from html5lib.filters.optionaltags import Filter as OptionalTagFilter
-from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
-
 from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
 from html5lib.constants import rcdataElements
@@ -67,17 +63,16 @@ class HTMLSerializer(object):
     escape_lt_in_attrs = False
     escape_rcdata = False

-    omit_optional_tags = True
-    strip_whitespace = False
     inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+    omit_optional_tags = True

     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
           "minimize_boolean_attributes", "use_trailing_solidus",
           "space_before_trailing_solidus", "omit_optional_tags",
           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
-          "escape_rcdata")
+          "escape_rcdata", 'use_trailing_solidus', "sanitize")

     def __init__(self, **kwargs):
         if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ class HTMLSerializer(object):
         in_cdata = False
         self.errors = []

         if encoding and self.inject_meta_charset:
-            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
+            from html5lib.filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
         # XXX: WhitespaceFilter should be used before OptionalTagFilter
         # for maximum efficiency of the latter filter
         if self.strip_whitespace:
-            treewalker = WhitespaceFilter(treewalker)
+            from html5lib.filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from html5lib.filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
         if self.omit_optional_tags:
-            treewalker = OptionalTagFilter(treewalker)
+            from html5lib.filters.optionaltags import Filter
+            treewalker = Filter(treewalker)

         for token in treewalker:
             type = token["type"]
             if type == "Doctype":
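
Moving the filter imports inside serialize() means each filter module loads only when its option is enabled, and the new sanitize option slots into the same chain. A usage sketch under that assumption:

    from html5lib import serializer
    # each enabled option pulls in its filter at serialization time
    s = serializer.HTMLSerializer(strip_whitespace=True, sanitize=True)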


@@ -93,6 +93,8 @@ class HTMLTokenizer(object):
         # Start processing. When EOF is reached self.state will return False
         # instead of True and the loop will terminate.
         while self.state():
+            while self.stream.errors:
+                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
             while self.tokenQueue:
                 yield self.tokenQueue.pop(0)
@@ -130,7 +132,6 @@ class HTMLTokenizer(object):
             allowed = hexDigits
             radix = 16

-        char = u"\uFFFD"
         charStack = []

         # Consume all the characters that are in range while making sure we
@@ -155,8 +156,8 @@ class HTMLTokenizer(object):
             charAsInt = entitiesWindows1252[charAsInt - 128]

-        # 0 is not a good number, neither are illegal Unicode code points.
-        if charAsInt > 0 and charAsInt <= 1114111:
+        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
+        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
             try:
                 # XXX We should have a separate function that does "int" to
                 # "unicodestring" conversion since this doesn't always work
@@ -167,7 +168,11 @@ class HTMLTokenizer(object):
                 char = eval("u'\\U%08x'" % charAsInt)
             except:
                 self.tokenQueue.append({"type": "ParseError", "data":
-                    _("Numeric entity couldn't be converted to character.")})
+                    _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+        else:
+            char = u"\uFFFD"
+            self.tokenQueue.append({"type": "ParseError", "data":
+                _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
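
The widened range check reads as a predicate: reject zero, anything above U+10FFFF, and the UTF-16 surrogate block (1114111 is 0x10FFFF; 55296 and 57343 are 0xD800 and 0xDFFF). A standalone restatement:

    def valid_codepoint(n):
        return 0 < n <= 1114111 and not (55296 <= n <= 57343)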


@@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
-        def hilite(self, encoding):
-            print 'foo'
-        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
-        setattr(self.dom, 'hilite', method)
         return self

     def insertDoctype(self, name):


@@ -1,5 +1,5 @@
 [Planet]
-output_theme = genshi_fancy
+output_theme = asf
 output_dir = tests/work/apply
 name = test planet
 cache_directory = tests/work/spider/cache
@@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
 bill_of_materials:
     images/#{face}

-[index.html.genshi]
+[index.html.xslt]
 filters:
-    xhtml2html.py>index.html4
+    xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4

 [tests/data/spider/testfeed0.atom]
 name = not found


@@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
     def tearDown(self):
         shutil.rmtree(os.path.split(workdir)[0])

-    def test_apply_asf(self):
-        config.load(configfile % 'asf')
+    def apply_asf(self):
         splice.apply(self.feeddata)

         # verify that selected files are there
@@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
         self.assertEqual(12, content)
         self.assertEqual(3, lang)

+    def test_apply_asf(self):
+        config.load(configfile % 'asf')
+        self.apply_asf()
+
     def test_apply_classic_fancy(self):
         config.load(configfile % 'fancy')
         self.apply_fancy()
@@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
     def test_apply_filter_html(self):
         config.load(configfile % 'html')
-        self.apply_fancy()
+        self.apply_asf()

         output = open(os.path.join(workdir, 'index.html')).read()
         self.assertTrue(output.find('/>')>=0)
@@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
     if method.startswith('test_'): break
 else:
     delattr(ApplyTest,'test_apply_genshi_fancy')
-    delattr(ApplyTest,'test_apply_filter_html')

 try:
     import libxml2


@@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
         self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
         self.assertTrue(output.find('</script>')>=0)

-    def test_xhtml2html_filter(self):
-        testfile = 'tests/data/filter/index.html'
-        filter = 'xhtml2html.py'
-
-        output = shell.run(filter, open(testfile).read(), mode="filter")
-        self.assertTrue(output.find('/>')<0)
-        self.assertTrue(output.find('</script>')>=0)
-
 try:
     import genshi
 except:


@@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
         self.assertEqual('', output)

+    def test_xhtml2html_filter(self):
+        testfile = 'tests/data/filter/index.html'
+        filter = 'xhtml2html.plugin?quote_attr_values=True'
+
+        output = shell.run(filter, open(testfile).read(), mode="filter")
+        self.assertTrue(output.find('/>')<0)
+        self.assertTrue(output.find('</script>')>=0)
+
 try:
     from subprocess import Popen, PIPE