796 lines
33 KiB
Python
796 lines
33 KiB
Python
try:
|
|
frozenset
|
|
except NameError:
|
|
# Import from the sets module for python 2.3
|
|
from sets import Set as set
|
|
from sets import ImmutableSet as frozenset
|
|
import gettext
|
|
_ = gettext.gettext
|
|
|
|
from constants import contentModelFlags, spaceCharacters
|
|
from constants import entitiesWindows1252, entities
|
|
from constants import asciiLowercase, asciiLetters
|
|
from constants import digits, hexDigits, EOF
|
|
|
|
from inputstream import HTMLInputStream
|
|
|
|
class HTMLTokenizer(object):
|
|
""" This class takes care of tokenizing HTML.
|
|
|
|
* self.currentToken
|
|
Holds the token that is currently being processed.
|
|
|
|
* self.state
|
|
Holds a reference to the method to be invoked... XXX
|
|
|
|
* self.states
|
|
Holds a mapping between states and methods that implement the state.
|
|
|
|
* self.stream
|
|
Points to HTMLInputStream object.
|
|
"""
|
|
|
|
# XXX need to fix documentation
|
|
|
|
def __init__(self, stream, encoding=None):
    """Create a tokenizer that reads its characters from ``stream``.

    ``stream`` and ``encoding`` are handed unchanged to HTMLInputStream;
    the resulting object is kept as ``self.stream``.
    """
    self.stream = HTMLInputStream(stream, encoding)

    # A state called "<name>" is implemented by the method "<name>State",
    # so the lookup table can be derived from the state names alone.
    stateNames = (
        "data", "entityData", "tagOpen", "closeTagOpen", "tagName",
        "beforeAttributeName", "attributeName", "afterAttributeName",
        "beforeAttributeValue", "attributeValueDoubleQuoted",
        "attributeValueSingleQuoted", "attributeValueUnQuoted",
        "bogusComment", "markupDeclarationOpen", "comment", "commentDash",
        "commentEnd", "doctype", "beforeDoctypeName", "doctypeName",
        "afterDoctypeName", "bogusDoctype",
    )
    self.states = dict([(name, getattr(self, name + "State"))
                        for name in stateNames])

    # Tokenizing starts in the data state with the PCDATA content model.
    self.contentModelFlag = contentModelFlags["PCDATA"]
    self.state = self.states["data"]

    # Token under construction; None until a state creates one.
    self.currentToken = None

    # Finished tokens waiting to be yielded by __iter__.
    self.tokenQueue = []
|
|
|
|
def __iter__(self):
|
|
""" This is where the magic happens.
|
|
|
|
We do our usually processing through the states and when we have a token
|
|
to return we yield the token which pauses processing until the next token
|
|
is requested.
|
|
"""
|
|
self.stream.reset()
|
|
self.tokenQueue = []
|
|
# Start processing. When EOF is reached self.state will return False
|
|
# instead of True and the loop will terminate.
|
|
while self.state():
|
|
while self.tokenQueue:
|
|
yield self.tokenQueue.pop(0)
|
|
|
|
# Below are the helper methods used by the tokenizer state methods.
|
|
def processSolidusInTag(self):
    """Handle a "/" seen inside a tag.

    Reads one more character.  If the current token is a StartTag and
    that character is ">", the token becomes an EmptyTag; in every other
    case a ParseError token is queued.  Either way the character that was
    read is appended to the stream's queue so the calling state still
    gets to see it.
    """
    following = self.stream.char()

    isSelfClosing = (self.currentToken["type"] == "StartTag"
                     and following == u">")
    if not isSelfClosing:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Solidus (/) incorrectly placed in tag.")})
    else:
        self.currentToken["type"] = "EmptyTag"

    # Only peeking: hand the character back so it is not lost.
    self.stream.queue.append(following)
|
|
|
|
def consumeNumberEntity(self, isHex):
    """Consume the digits of a numeric entity and return its character.

    consumeEntity calls this only after it has seen at least one valid
    digit and pushed it back onto the stream, so there is always at
    least one digit to read here.

    isHex -- True to read hexadecimal digits, False to read decimal ones.

    Returns the decoded character.  U+FFFD is returned when the number is
    0 or cannot be turned into a character (in the latter case a
    ParseError token is queued as well).  A ";" directly after the digits
    is discarded; if something else follows, a ParseError token is queued
    and that character is appended to the stream's queue.
    """

    # XXX More need to be done here. For instance, #13 should prolly be
    # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
    # such. Thoughts on this appreciated.
    allowed = digits
    radix = 10
    if isHex:
        allowed = hexDigits
        radix = 16

    # Fallback result when the conversion below fails.
    char = u"\uFFFD"
    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    charAsInt = int("".join(charStack), radix)

    # If the integer is between 127 and 160 (so 128 and bigger and 159 and
    # smaller) we need to do the "windows trick".
    if 127 < charAsInt < 160:
        #XXX - removed parse error from windows 1252 entity for now
        #we may want to reenable this later
        #self.tokenQueue.append({"type": "ParseError", "data":
        #  _("Entity used with illegal number (windows-1252 reference).")})

        charAsInt = entitiesWindows1252[charAsInt - 128]

    # 0 is not a good number.
    if charAsInt == 0:
        charAsInt = 65533

    try:
        # XXX We should have a separate function that does "int" to
        # "unicodestring" conversion since this doesn't always work
        # according to hsivonen. Also, unichr has a limitation of 65535
        char = unichr(charAsInt)
    except (ValueError, OverflowError):
        # Only the failures unichr itself reports for an out-of-range
        # number are handled; anything else (e.g. KeyboardInterrupt) is
        # no longer swallowed.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity couldn't be converted to character.")})

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != u";":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity didn't end with ';'.")})
        self.stream.queue.append(c)

    return char
|
|
|
|
def consumeEntity(self):
    """Try to consume an entity; the leading "&" is already consumed.

    Returns the replacement text for a recognised numeric or named
    entity, or None when no entity could be read.  In the None case every
    character that was read (up to, for numeric entities, the end of
    file) is appended to the stream's queue again and a ParseError token
    is queued, so the caller can treat the "&" as literal text.
    """
    char = None
    charStack = [self.stream.char()]
    if charStack[0] == u"#":
        # We might have a number entity here.
        charStack.extend([self.stream.char(), self.stream.char()])
        if EOF in charStack:
            # If we reach the end of the file put everything up to EOF
            # back in the queue
            charStack = charStack[:charStack.index(EOF)]
            self.stream.queue.extend(charStack)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity expected. Got end of file instead.")})
        else:
            if charStack[1].lower() == u"x" \
              and charStack[2] in hexDigits:
                # Hexadecimal entity detected.
                self.stream.queue.append(charStack[2])
                char = self.consumeNumberEntity(True)
            elif charStack[1] in digits:
                # Decimal entity detected.
                self.stream.queue.extend(charStack[1:])
                char = self.consumeNumberEntity(False)
            else:
                # No number entity detected.
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected but none found.")})
    # Break out if we reach the end of the file
    elif charStack[0] == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Entity expected. Got end of file instead.")})
    else:
        # At this point in the process might have named entity. Entities
        # are stored in the global variable "entities".
        #
        # Consume characters and compare to these to a substring of the
        # entity names in the list until the substring no longer matches.
        filteredEntityList = [e for e in entities if \
          e.startswith(charStack[0])]

        def entitiesStartingWith(name):
            return [e for e in filteredEntityList if e.startswith(name)]

        while charStack[-1] != EOF and\
          entitiesStartingWith("".join(charStack)):
            charStack.append(self.stream.char())

        # At this point we have a string that starts with some characters
        # that may match an entity
        entityName = None

        # Try to find the longest entity the string will match.  The last
        # element of charStack is the character (or EOF) that ended the
        # lookahead, so it is never part of a candidate name.
        for entityLength in xrange(len(charStack)-1,1,-1):
            possibleEntityName = "".join(charStack[:entityLength])
            if possibleEntityName in entities:
                entityName = possibleEntityName
                break

        if entityName is not None:
            char = entities[entityName]

            # Everything read beyond the entity name.  It is never empty
            # because entityLength is at most len(charStack) - 1.
            remainder = charStack[entityLength:]

            # The ";" only belongs to the entity when it follows the name
            # directly.  Looking at the last character read instead would
            # silently drop the characters between a shorter match and a
            # later ";".
            if remainder[0] == u";":
                remainder = remainder[1:]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity didn't end with ';'.")})
            self.stream.queue.extend(remainder)
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Named entity expected. Got none.")})
            self.stream.queue.extend(charStack)
    return char
|
|
|
|
def processEntityInAttribute(self):
|
|
"""This method replaces the need for "entityInAttributeValueState".
|
|
"""
|
|
entity = self.consumeEntity()
|
|
if entity:
|
|
self.currentToken["data"][-1][1] += entity
|
|
else:
|
|
self.currentToken["data"][-1][1] += u"&"
|
|
|
|
def emitCurrentToken(self):
|
|
"""This method is a generic handler for emitting the tags. It also sets
|
|
the state to "data" because that's what's needed after a token has been
|
|
emitted.
|
|
"""
|
|
|
|
# Add token to the queue to be yielded
|
|
self.tokenQueue.append(self.currentToken)
|
|
self.state = self.states["data"]
|
|
|
|
|
|
# Below are the various tokenizer states worked out.
|
|
|
|
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
|
# documents to figure out what the order of the various if and elif
|
|
# statements should be.
|
|
|
|
def dataState(self):
    """Data state: plain text between markup.

    Returns False at end of file (which ends tokenization) and True in
    every other case.
    """
    char = self.stream.char()

    # "&" starts an entity only in the PCDATA and RCDATA content models.
    if char == u"&" and self.contentModelFlag in\
      (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
        self.state = self.states["entityData"]
        return True

    # "<" starts markup in every content model except PLAINTEXT.
    if char == u"<" and self.contentModelFlag !=\
      contentModelFlags["PLAINTEXT"]:
        self.state = self.states["tagOpen"]
        return True

    if char == EOF:
        # Tokenization ends.
        return False

    if char in spaceCharacters:
        # Directly after emitting a token we are back in the data state,
        # where space characters matter, so a run of them is emitted as
        # its own token.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        whitespaceRun = char + self.stream.charsUntil(spaceCharacters, True)
        self.tokenQueue.append({"type": "SpaceCharacters", "data":
          whitespaceRun})
    else:
        # Ordinary text: take everything up to the next "&" or "<".
        text = char + self.stream.charsUntil((u"&", u"<"))
        self.tokenQueue.append({"type": "Characters", "data": text})
    return True
|
|
|
|
def entityDataState(self):
|
|
entity = self.consumeEntity()
|
|
if entity:
|
|
self.tokenQueue.append({"type": "Characters", "data": entity})
|
|
else:
|
|
self.tokenQueue.append({"type": "Characters", "data": u"&"})
|
|
self.state = self.states["data"]
|
|
return True
|
|
|
|
def tagOpenState(self):
    """Tag open state: decide what the character after "<" starts.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    def emitCharacters(text):
        self.tokenQueue.append({"type": "Characters", "data": text})

    if self.contentModelFlag != contentModelFlags["PCDATA"]:
        # The content model flag is RCDATA or CDATA here, because this
        # state can never be entered with the PLAINTEXT flag.  Only a
        # close tag is recognised; anything else makes the "<" text.
        if char == u"/":
            self.state = self.states["closeTagOpen"]
        else:
            emitCharacters(u"<")
            self.stream.queue.append(char)
            self.state = self.states["data"]
        return True

    if char == u"!":
        self.state = self.states["markupDeclarationOpen"]
    elif char == u"/":
        self.state = self.states["closeTagOpen"]
    elif char in asciiLetters:
        # First letter of a start tag name.
        self.currentToken =\
          {"type": "StartTag", "name": char, "data": []}
        self.state = self.states["tagName"]
    elif char == u">":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        parseError(_("Expected tag name. Got '>' instead."))
        emitCharacters(u"<>")
        self.state = self.states["data"]
    elif char == u"?":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        parseError(_("Expected tag name. Got '?' instead (HTML doesn't "
          "support processing instructions)."))
        # The "?" is handed back so it ends up in the bogus comment.
        self.stream.queue.append(char)
        self.state = self.states["bogusComment"]
    else:
        # XXX
        parseError(_("Expected tag name. Got something else instead"))
        emitCharacters(u"<")
        self.stream.queue.append(char)
        self.state = self.states["data"]
    return True
|
|
|
|
def closeTagOpenState(self):
    """Close tag open state: "</" has been consumed.

    In the RCDATA and CDATA content models the end tag only counts when
    it names the current token (the last start tag); otherwise "</" is
    emitted as text.  Always returns True.
    """
    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    if self.contentModelFlag in (contentModelFlags["RCDATA"],\
      contentModelFlags["CDATA"]):
        expectedName = self.currentToken["name"]

        # Look ahead as many characters as the start tag name is long,
        # plus the one character that must follow the name.
        lookahead = []
        for i in xrange(len(expectedName) + 1):
            lookahead.append(self.stream.char())
            # Make sure we don't get hit by EOF
            if lookahead[-1] == EOF:
                break

        # This was only a peek, so everything goes back to the stream.
        self.stream.queue.extend(lookahead)

        sameName = (expectedName.lower() ==
                    "".join(lookahead[:-1]).lower())
        if sameName and lookahead[-1] in (spaceCharacters |
          frozenset((u">", u"/", u"<", EOF))):
            # The characters are right, so it is safe to switch to PCDATA
            # now; that also means it need not be done when the end tag
            # token is emitted.
            self.contentModelFlag = contentModelFlags["PCDATA"]
        else:
            parseError(
              _("Expected closing tag after seeing '</'. None found."))
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]

            # Nothing below applies in this case.
            return True

    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        char = self.stream.char()
        if char in asciiLetters:
            self.currentToken =\
              {"type": "EndTag", "name": char, "data": []}
            self.state = self.states["tagName"]
        elif char == u">":
            parseError(
              _("Expected closing tag. Got '>' instead. Ignoring '</>'."))
            self.state = self.states["data"]
        elif char == EOF:
            parseError(_("Expected closing tag. Unexpected end of file."))
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be _'_...
            parseError(
              _("Expected closing tag. Unexpected character '" + char + "' found."))
            self.stream.queue.append(char)
            self.state = self.states["bogusComment"]
    return True
|
|
|
|
def tagNameState(self):
    """Tag name state: collect the name of the current tag token.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    if char in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif char in asciiLetters:
        # Take the whole run of letters in one go.
        letters = char + self.stream.charsUntil(asciiLetters, True)
        self.currentToken["name"] += letters
    elif char == u">":
        self.emitCurrentToken()
    elif char == u"<":
        # The "<" is handed back to the stream, then the tag is emitted.
        self.stream.queue.append(char)
        parseError(_("Unexpected < character when getting the tag name."))
        self.emitCurrentToken()
    elif char == EOF:
        parseError(_("Unexpected end of file in the tag name."))
        self.emitCurrentToken()
    elif char == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    else:
        # Any other character simply becomes part of the name.
        self.currentToken["name"] += char
    return True
|
|
|
|
def beforeAttributeNameState(self):
    """Before attribute name state: between a tag name and an attribute.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    def startAttribute(firstChar):
        # New [name, value] pair; the name continues in attributeName.
        self.currentToken["data"].append([firstChar, ""])
        self.state = self.states["attributeName"]

    if char in spaceCharacters:
        # Skip the rest of the whitespace run as well.
        self.stream.charsUntil(spaceCharacters, True)
    elif char in asciiLetters:
        startAttribute(char)
    elif char == u">":
        self.emitCurrentToken()
    elif char == u"/":
        self.processSolidusInTag()
    elif char == u"<":
        self.stream.queue.append(char)
        parseError(
          _("Unexpected < character. Expected attribute name instead."))
        self.emitCurrentToken()
    elif char == EOF:
        parseError(
          _("Unexpected end of file. Expected attribute name instead."))
        self.emitCurrentToken()
    else:
        startAttribute(char)
    return True
|
|
|
|
def attributeNameState(self):
    """Attribute name state: collect the name of the newest attribute.

    When the name is complete, a ParseError token is queued for every
    earlier attribute on the tag that has the same name.  Always returns
    True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    # True when this character completes the name and the duplicate
    # check below has to run.
    nameComplete = True
    if char == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif char in asciiLetters:
        self.currentToken["data"][-1][0] += char +\
          self.stream.charsUntil(asciiLetters, True)
        nameComplete = False
    elif char == u">":
        # Emitting is postponed until after the duplicate check at the
        # end of this method: per the original note, emitting here would
        # convert the attributes to a dict before that check runs.
        pass
    elif char in spaceCharacters:
        self.state = self.states["afterAttributeName"]
    elif char == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif char == u"<":
        self.stream.queue.append(char)
        parseError(_("Unexpected < character in attribute name."))
        self.emitCurrentToken()
        nameComplete = False
    elif char == EOF:
        parseError(_("Unexpected end of file in attribute name."))
        self.emitCurrentToken()
        nameComplete = False
    else:
        self.currentToken["data"][-1][0] += char
        nameComplete = False

    if nameComplete:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely
        # appended to attributes, but the parse error is reported now.
        attributes = self.currentToken["data"]
        for earlierName, earlierValue in attributes[:-1]:
            if attributes[-1][0] == earlierName:
                parseError(_("Dropped duplicate attribute on tag."))
        # The postponed emit for ">".
        if char == u">":
            self.emitCurrentToken()
    return True
|
|
|
|
def afterAttributeNameState(self):
    """After attribute name state: whitespace followed an attribute name.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    def startAttribute(firstChar):
        # The previous attribute has no value; a new one begins.
        self.currentToken["data"].append([firstChar, ""])
        self.state = self.states["attributeName"]

    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif char == u">":
        self.emitCurrentToken()
    elif char in asciiLetters:
        startAttribute(char)
    elif char == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif char == u"<":
        self.stream.queue.append(char)
        parseError(_("Unexpected < character. Expected = or end of tag."))
        self.emitCurrentToken()
    elif char == EOF:
        parseError(_("Unexpected end of file. Expected = or end of tag."))
        self.emitCurrentToken()
    else:
        startAttribute(char)
    return True
|
|
|
|
def beforeAttributeValueState(self):
    """Before attribute value state: an "=" has just been consumed.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == u"\"":
        self.state = self.states["attributeValueDoubleQuoted"]
    elif char == u"&":
        # The "&" is handed back so the unquoted-value state can treat
        # it as the start of an entity.
        self.state = self.states["attributeValueUnQuoted"]
        self.stream.queue.append(char)
    elif char == u"'":
        self.state = self.states["attributeValueSingleQuoted"]
    elif char == u">":
        self.emitCurrentToken()
    elif char == u"<":
        self.stream.queue.append(char)
        parseError(_("Unexpected < character. Expected attribute value."))
        self.emitCurrentToken()
    elif char == EOF:
        parseError(_("Unexpected end of file. Expected attribute value."))
        self.emitCurrentToken()
    else:
        # First character of an unquoted value.
        self.currentToken["data"][-1][1] += char
        self.state = self.states["attributeValueUnQuoted"]
    return True
|
|
|
|
def attributeValueDoubleQuotedState(self):
    """Attribute value (double-quoted) state.

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        # Closing quote: the value is finished.
        self.state = self.states["beforeAttributeName"]
        return True
    if char == u"&":
        self.processEntityInAttribute()
        return True
    if char == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in attribute value (\").")})
        self.emitCurrentToken()
        return True
    # Ordinary text: take everything up to the next quote or "&".
    self.currentToken["data"][-1][1] += char +\
      self.stream.charsUntil(("\"", u"&"))
    return True
|
|
|
|
def attributeValueSingleQuotedState(self):
    """Attribute value (single-quoted) state.

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        # Closing quote: the value is finished.
        self.state = self.states["beforeAttributeName"]
        return True
    if char == u"&":
        self.processEntityInAttribute()
        return True
    if char == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in attribute value (').")})
        self.emitCurrentToken()
        return True
    # Ordinary text: take everything up to the next quote or "&".
    self.currentToken["data"][-1][1] += char +\
      self.stream.charsUntil(("'", u"&"))
    return True
|
|
|
|
def attributeValueUnQuotedState(self):
    """Attribute value (unquoted) state.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    if char in spaceCharacters:
        # Whitespace ends an unquoted value.
        self.state = self.states["beforeAttributeName"]
    elif char == u"&":
        self.processEntityInAttribute()
    elif char == u">":
        self.emitCurrentToken()
    elif char == u"<":
        self.stream.queue.append(char)
        parseError(_("Unexpected < character in attribute value."))
        self.emitCurrentToken()
    elif char == EOF:
        parseError(_("Unexpected end of file in attribute value."))
        self.emitCurrentToken()
    else:
        # Ordinary text: take everything up to the next character that
        # one of the branches above has to look at.
        stopChars = frozenset(("&", ">","<")) | spaceCharacters
        self.currentToken["data"][-1][1] += char +\
          self.stream.charsUntil(stopChars)
    return True
|
|
|
|
def bogusCommentState(self):
|
|
# Make a new comment token and give it as value all the characters
|
|
# until the first > or EOF (charsUntil checks for EOF automatically)
|
|
# and emit it.
|
|
self.tokenQueue.append(
|
|
{"type": "Comment", "data": self.stream.charsUntil((u">"))})
|
|
|
|
# Eat the character directly after the bogus comment which is either a
|
|
# ">" or an EOF.
|
|
self.stream.char()
|
|
self.state = self.states["data"]
|
|
return True
|
|
|
|
def markupDeclarationOpenState(self):
    """Markup declaration open state: "<!" has been consumed.

    "--" starts a comment and "DOCTYPE" (in any letter case) starts a
    doctype; anything else is handed back to the stream and treated as a
    bogus comment.  Always returns True.
    """
    lookahead = [self.stream.char(), self.stream.char()]
    if lookahead == [u"-", u"-"]:
        self.currentToken = {"type": "Comment", "data": ""}
        self.state = self.states["comment"]
        return True

    # Read five more characters so there are seven, the length of
    # "DOCTYPE".
    for i in xrange(5):
        lookahead.append(self.stream.char())

    # EOF is checked first so that join() never sees it.
    isDoctype = (not EOF in lookahead and
      "".join(lookahead).upper() == u"DOCTYPE")
    if isDoctype:
        self.currentToken =\
          {"type": "Doctype", "name": "", "data": True}
        self.state = self.states["doctype"]
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Expected '--' or 'DOCTYPE'. Not found.")})
        self.stream.queue.extend(lookahead)
        self.state = self.states["bogusComment"]
    return True
|
|
|
|
def commentState(self):
    """Comment state: inside the text of a comment.

    Always returns True.
    """
    char = self.stream.char()
    if char == u"-":
        self.state = self.states["commentDash"]
        return True
    if char == EOF:
        # The unfinished comment is emitted anyway.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in comment.")})
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
        return True
    # Ordinary text: take everything up to the next "-".
    self.currentToken["data"] += char + self.stream.charsUntil(u"-")
    return True
|
|
|
|
def commentDashState(self):
    """Comment dash state: one "-" has been seen inside a comment.

    Always returns True.
    """
    char = self.stream.char()
    if char == u"-":
        self.state = self.states["commentEnd"]
        return True
    if char == EOF:
        # The unfinished comment is emitted anyway.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in comment (-)")})
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
        return True

    # The single "-" was ordinary comment text after all; add it together
    # with everything up to the next "-".
    self.currentToken["data"] += u"-" + char +\
      self.stream.charsUntil(u"-")
    # Also consume the character that stopped the scan (a "-" or an EOF).
    # The state is left unchanged, so a "-" directly after it leads
    # straight to the comment end state without a ParseError.
    self.stream.char()
    return True
|
|
|
|
def commentEndState(self):
    """Comment end state: "--" has been seen inside a comment.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    def emitComment():
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]

    if char == u">":
        emitComment()
    elif char == u"-":
        # A third "-": it becomes comment text and the state is kept.
        parseError(_("Unexpected '-' after '--' found in comment."))
        self.currentToken["data"] += char
    elif char == EOF:
        parseError(_("Unexpected end of file in comment (--)."))
        emitComment()
    else:
        # XXX
        # The "--" did not end the comment; it is text like the rest.
        parseError(_("Unexpected character in comment found."))
        self.currentToken["data"] += u"--" + char
        self.state = self.states["comment"]
    return True
|
|
|
|
def doctypeState(self):
    """Doctype state: the word "DOCTYPE" has just been consumed.

    A space character is expected.  Anything else queues a ParseError
    token and is handed back to the stream.  The next state is
    "beforeDoctypeName" in both cases.  Always returns True.
    """
    char = self.stream.char()
    if char not in spaceCharacters:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("No space after literal string 'DOCTYPE'.")})
        self.stream.queue.append(char)
    self.state = self.states["beforeDoctypeName"]
    return True
|
|
|
|
def beforeDoctypeNameState(self):
    """Before doctype name state: skip spaces, then start the name.

    Always returns True.
    """
    char = self.stream.char()

    def emitWithError(message):
        # The doctype ended before a name was seen: report it, emit the
        # token as it is and go back to the data state.
        self.tokenQueue.append({"type": "ParseError", "data": message})
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]

    if char in spaceCharacters:
        pass
    elif char in asciiLowercase:
        # ASCII lowercase letters are stored uppercased.
        self.currentToken["name"] = char.upper()
        self.state = self.states["doctypeName"]
    elif char == u">":
        # The ">" stays consumed, per the original note about the
        # specification.
        emitWithError(_("Unexpected > character. Expected DOCTYPE name."))
    elif char == EOF:
        emitWithError(_("Unexpected end of file. Expected DOCTYPE name."))
    else:
        self.currentToken["name"] = char
        self.state = self.states["doctypeName"]
    return True
|
|
|
|
def doctypeNameState(self):
    """Doctype name state: collect the doctype name.

    The token's "data" entry is its error flag: the token is created
    with True, and this state sets it to False exactly when the name
    collected so far is "HTML".  Always returns True.
    """
    data = self.stream.char()
    needsDoctypeCheck = False
    if data in spaceCharacters:
        self.state = self.states["afterDoctypeName"]
        needsDoctypeCheck = True
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in DOCTYPE name.")})
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if data in asciiLowercase:
            data = data.upper()
        self.currentToken["name"] += data
        needsDoctypeCheck = True

    # After some iterations through this state the name should eventually
    # say "HTML". Otherwise there's an error.  The flag is recomputed in
    # both directions: a name that passes through "HTML" and then keeps
    # growing (e.g. "HTMLX") must be flagged as an error again.
    if needsDoctypeCheck:
        self.currentToken["data"] = self.currentToken["name"] != u"HTML"
    return True
|
|
|
|
def afterDoctypeNameState(self):
    """After doctype name state: only spaces or ">" are expected.

    Always returns True.
    """
    char = self.stream.char()

    def parseError(message):
        self.tokenQueue.append({"type": "ParseError", "data": message})

    if char in spaceCharacters:
        pass
    elif char == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif char == EOF:
        # Flag the doctype, hand the EOF back to the stream and emit the
        # token.
        self.currentToken["data"] = True
        # XXX EMIT
        self.stream.queue.append(char)
        parseError(_("Unexpected end of file in DOCTYPE."))
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    else:
        # Unexpected content: flag the doctype; the rest is skipped in
        # the bogus doctype state.
        parseError(_("Expected space or '>'. Got '" + char + "'"))
        self.currentToken["data"] = True
        self.state = self.states["bogusDoctype"]
    return True
|
|
|
|
def bogusDoctypeState(self):
    """Bogus doctype state: skip everything up to ">" or end of file.

    Always returns True.
    """
    char = self.stream.char()
    if char == u">":
        pass
    elif char == EOF:
        # XXX EMIT
        # Hand the EOF back to the stream before emitting.
        self.stream.queue.append(char)
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in bogus doctype.")})
    else:
        # Still inside the bogus doctype: the character is dropped.
        return True

    # Reached for ">" and for EOF: the doctype token is emitted.
    self.tokenQueue.append(self.currentToken)
    self.state = self.states["data"]
    return True
|