filters/xhtml2html =~ s/Genshi/html5lib/

This commit is contained in:
Sam Ruby 2007-06-27 13:37:00 -04:00
parent 1fcfbe35c0
commit 4b1e0da922
14 changed files with 137 additions and 90 deletions

View File

@ -84,8 +84,8 @@ then the output stream is
through the specified filter and the output is placed into the named file; the through the specified filter and the output is placed into the named file; the
other unmodified branch continues onto the next filter, if any. other unmodified branch continues onto the next filter, if any.
One use case for this function is to use One use case for this function is to use
<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and <a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
an HTML output stream from one source.</li> and an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a <li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to fixed set of fields, whereas XSLT and genshi templates have access to

21
filters/xhtml2html.plugin Normal file
View File

@ -0,0 +1,21 @@
# Example usages:
#
# filters:
#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
#
# -- or --
#
# [xhtml2html.plugin]
#   quote_attr_values=True
#   quote_char="'"
#
# Reads an XHTML document on stdin, reparses it with html5lib's liberal
# XML parser, and reserializes it as HTML on stdout.  Any command-line
# option pairs ("--name value") are forwarded as keyword arguments to
# html5lib's HTMLSerializer.
import sys

# Pair up "--name value" arguments: argv[1::2] are names, argv[2::2] values.
opts = zip(sys.argv[1::2], sys.argv[2::2])
# NOTE(security): eval() turns option strings like "True" or "'" into
# Python values.  These come from the (trusted) planet configuration;
# never feed untrusted input through this filter's arguments.
opts = [[name.lstrip('-'), eval(value)] for name, value in opts]

from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer

parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
# Bind to a distinct name so the imported `serializer` module isn't shadowed.
html_serializer = serializer.HTMLSerializer(**dict(opts))
for text in html_serializer.serialize(tokens, encoding='utf-8'):
    sys.stdout.write(text)

View File

@ -1,5 +0,0 @@
import sys
from genshi.input import XMLParser
from genshi.output import HTMLSerializer
print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')

View File

@ -1,16 +1,10 @@
# Differences from the current specification (23 December 2006) are as follows: # Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py. # * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body> # * EOF handling is slightly different to make sure <html>, <head> and <body>
# always exist. # always exist.
# * </br> creates a <br> element.
# #
# We haven't updated DOCTYPE handling yet # We haven't updated DOCTYPE handling yet
#
# It should be trivial to add the following cases. However, we should probably
# also look into comment handling and such then...
# * A <p> element end tag creates an empty <p> element when there's no <p>
# element in scope.
try: try:
frozenset frozenset
@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = utils.MethodDispatcher([
(("html", "head", "body", "br"), self.endTagImplyHead) (("html", "head", "body", "br", "p"), self.endTagImplyHead)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -530,7 +524,7 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([ self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead), ("head", self.endTagHead),
(("html", "body", "br"), self.endTagImplyAfterHead), (("html", "body", "br", "p"), self.endTagImplyAfterHead),
(("title", "style", "script"), self.endTagTitleStyleScript) (("title", "style", "script"), self.endTagTitleStyleScript)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -994,9 +988,13 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"): if self.tree.elementInScope("p"):
self.tree.generateImpliedEndTags("p") self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p": if self.tree.openElements[-1].name != "p":
self.parser.parseError("Unexpected end tag (p).") self.parser.parseError(_("Unexpected end tag (p)."))
while self.tree.elementInScope("p"): if self.tree.elementInScope("p"):
self.tree.openElements.pop() while self.tree.elementInScope("p"):
self.tree.openElements.pop()
else:
self.startTagCloseP("p", {})
self.endTagP("p")
def endTagBody(self, name): def endTagBody(self, name):
# XXX Need to take open <p> tags into account here. We shouldn't imply # XXX Need to take open <p> tags into account here. We shouldn't imply
@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
if inScope: if inScope:
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"End tag (" + name + ") seen too " self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag.")) u"early. Expected other end tag."))
if inScope: if inScope:
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
def endTagForm(self, name): def endTagForm(self, name):
self.endTagBlock(name) if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
else:
self.tree.openElements.pop()
self.tree.formPointer = None self.tree.formPointer = None
def endTagListItem(self, name): def endTagListItem(self, name):
@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags(name) self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"End tag (" + name + ") seen too " self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag.")) u"early. Expected other end tag."))
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
break break
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"Unexpected end tag (" + name + "). " self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
u"Expected other end tag.")) u"Expected other end tag."))
for item in headingElements: for item in headingElements:

View File

@ -53,6 +53,7 @@ class HTMLInputStream(object):
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace') self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
self.queue = [] self.queue = []
self.errors = []
self.line = self.col = 0 self.line = self.col = 0
self.lineLengths = [] self.lineLengths = []
@ -214,7 +215,10 @@ class HTMLInputStream(object):
return EOF return EOF
# Normalize newlines and null characters # Normalize newlines and null characters
if c == '\x00': c = u'\uFFFD' if c == '\x00':
self.errors.append('null character found in input stream, '
'replaced with U+FFFD')
c = u'\uFFFD'
if c == '\r': if c == '\r':
c = self.dataStream.read(1, 1) c = self.dataStream.read(1, 1)
if c != '\n': if c != '\n':

View File

@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
# For EmptyTags, process both a Start and an End tag # For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag": if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"]) self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {} token["data"] = {}
token["type"] = "EndTag" token["type"] = "EndTag"
@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs) html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree) self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token): def normalizeToken(self, token):
@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element) self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"] self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase): class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """ """ Prime the Xml parser """
def __getattr__(self, name): def __getattr__(self, name):
self.tree.openElements.append(self.tree.document) self.tree.openElements.append(self.tree.document)

View File

@ -2,7 +2,7 @@ import re
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer from tokenizer import HTMLTokenizer
class HTMLSanitizer(HTMLTokenizer): class HTMLSanitizerMixin:
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
# => &lt;script> do_nasty_stuff() &lt;/script> # => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def __iter__(self): def sanitize_token(self, token):
for token in HTMLTokenizer.__iter__(self): if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]: if token["name"] in self.allowed_elements:
if token["name"] in self.allowed_elements: if token.has_key("data"):
if token.has_key("data"): attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes]) for attr in self.attr_val_is_uri:
for attr in self.attr_val_is_uri: if not attrs.has_key(attr): continue
if not attrs.has_key(attr): continue val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols): del attrs[attr]
del attrs[attr] if attrs.has_key('style'):
if attrs.has_key('style'): attrs['style'] = self.sanitize_css(attrs['style'])
attrs['style'] = self.sanitize_css(attrs['style']) token["data"] = [[name,val] for name,val in attrs.items()]
token["data"] = [[name,val] for name,val in attrs.items()] return token
yield token
else:
if token["type"] == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
yield token
elif token["type"] == "Comment":
pass
else: else:
yield token if token["type"] == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
elif token["type"] == "Comment":
pass
else:
return token
def sanitize_css(self, style): def sanitize_css(self, style):
# disallow urls # disallow urls
@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')
return ' '.join(clean) return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token: yield token

View File

@ -7,10 +7,6 @@ except NameError:
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
from html5lib.filters.whitespace import Filter as WhitespaceFilter
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements from html5lib.constants import rcdataElements
@ -67,17 +63,16 @@ class HTMLSerializer(object):
escape_lt_in_attrs = False escape_lt_in_attrs = False
escape_rcdata = False escape_rcdata = False
omit_optional_tags = True
strip_whitespace = False
inject_meta_charset = True inject_meta_charset = True
strip_whitespace = False
sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char", options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus", "minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags", "space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata") "escape_rcdata", 'use_trailing_solidus', "sanitize")
def __init__(self, **kwargs): def __init__(self, **kwargs):
if kwargs.has_key('quote_char'): if kwargs.has_key('quote_char'):
@ -91,13 +86,19 @@ class HTMLSerializer(object):
in_cdata = False in_cdata = False
self.errors = [] self.errors = []
if encoding and self.inject_meta_charset: if encoding and self.inject_meta_charset:
treewalker = InjectMetaCharsetFilter(treewalker, encoding) from html5lib.filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter # XXX: WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of this latter filter # for maximum efficiency of this latter filter
if self.strip_whitespace: if self.strip_whitespace:
treewalker = WhitespaceFilter(treewalker) from html5lib.filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags: if self.omit_optional_tags:
treewalker = OptionalTagFilter(treewalker) from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker: for token in treewalker:
type = token["type"] type = token["type"]
if type == "Doctype": if type == "Doctype":

View File

@ -93,6 +93,8 @@ class HTMLTokenizer(object):
# Start processing. When EOF is reached self.state will return False # Start processing. When EOF is reached self.state will return False
# instead of True and the loop will terminate. # instead of True and the loop will terminate.
while self.state(): while self.state():
while self.stream.errors:
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
while self.tokenQueue: while self.tokenQueue:
yield self.tokenQueue.pop(0) yield self.tokenQueue.pop(0)
@ -130,7 +132,6 @@ class HTMLTokenizer(object):
allowed = hexDigits allowed = hexDigits
radix = 16 radix = 16
char = u"\uFFFD"
charStack = [] charStack = []
# Consume all the characters that are in range while making sure we # Consume all the characters that are in range while making sure we
@ -155,8 +156,8 @@ class HTMLTokenizer(object):
charAsInt = entitiesWindows1252[charAsInt - 128] charAsInt = entitiesWindows1252[charAsInt - 128]
# 0 is not a good number, neither are illegal Unicode code points. # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
if charAsInt > 0 and charAsInt <= 1114111: if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
try: try:
# XXX We should have a separate function that does "int" to # XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work # "unicodestring" conversion since this doesn't always work
@ -167,7 +168,11 @@ class HTMLTokenizer(object):
char = eval("u'\\U%08x'" % charAsInt) char = eval("u'\\U%08x'" % charAsInt)
except: except:
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")}) _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
else:
char = u"\uFFFD"
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
# Discard the ; if present. Otherwise, put it back on the queue and # Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser. # invoke parseError on parser.

View File

@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None) self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
def hilite(self, encoding):
print 'foo'
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
setattr(self.dom, 'hilite', method)
return self return self
def insertDoctype(self, name): def insertDoctype(self, name):

View File

@ -1,5 +1,5 @@
[Planet] [Planet]
output_theme = genshi_fancy output_theme = asf
output_dir = tests/work/apply output_dir = tests/work/apply
name = test planet name = test planet
cache_directory = tests/work/spider/cache cache_directory = tests/work/spider/cache
@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
bill_of_materials: bill_of_materials:
images/#{face} images/#{face}
[index.html.genshi] [index.html.xslt]
filters: filters:
xhtml2html.py>index.html4 xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4
[tests/data/spider/testfeed0.atom] [tests/data/spider/testfeed0.atom]
name = not found name = not found

View File

@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
def tearDown(self): def tearDown(self):
shutil.rmtree(os.path.split(workdir)[0]) shutil.rmtree(os.path.split(workdir)[0])
def test_apply_asf(self): def apply_asf(self):
config.load(configfile % 'asf')
splice.apply(self.feeddata) splice.apply(self.feeddata)
# verify that selected files are there # verify that selected files are there
@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
self.assertEqual(12, content) self.assertEqual(12, content)
self.assertEqual(3, lang) self.assertEqual(3, lang)
def test_apply_asf(self):
config.load(configfile % 'asf')
self.apply_asf()
def test_apply_classic_fancy(self): def test_apply_classic_fancy(self):
config.load(configfile % 'fancy') config.load(configfile % 'fancy')
self.apply_fancy() self.apply_fancy()
@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
def test_apply_filter_html(self): def test_apply_filter_html(self):
config.load(configfile % 'html') config.load(configfile % 'html')
self.apply_fancy() self.apply_asf()
output = open(os.path.join(workdir, 'index.html')).read() output = open(os.path.join(workdir, 'index.html')).read()
self.assertTrue(output.find('/>')>=0) self.assertTrue(output.find('/>')>=0)
@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
if method.startswith('test_'): break if method.startswith('test_'): break
else: else:
delattr(ApplyTest,'test_apply_genshi_fancy') delattr(ApplyTest,'test_apply_genshi_fancy')
delattr(ApplyTest,'test_apply_filter_html')
try: try:
import libxml2 import libxml2

View File

@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0) self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
self.assertTrue(output.find('</script>')>=0) self.assertTrue(output.find('</script>')>=0)
def test_xhtml2html_filter(self):
testfile = 'tests/data/filter/index.html'
filter = 'xhtml2html.py'
output = shell.run(filter, open(testfile).read(), mode="filter")
self.assertTrue(output.find('/>')<0)
self.assertTrue(output.find('</script>')>=0)
try: try:
import genshi import genshi
except: except:

View File

@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
self.assertEqual('', output) self.assertEqual('', output)
def test_xhtml2html_filter(self):
testfile = 'tests/data/filter/index.html'
filter = 'xhtml2html.plugin?quote_attr_values=True'
output = shell.run(filter, open(testfile).read(), mode="filter")
self.assertTrue(output.find('/>')<0)
self.assertTrue(output.find('</script>')>=0)
try: try:
from subprocess import Popen, PIPE from subprocess import Popen, PIPE