try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

import gettext
_ = gettext.gettext

from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import digits, hexDigits, EOF

from inputstream import HTMLInputStream


class HTMLTokenizer(object):
    """Tokenize HTML read from an HTMLInputStream.

    The tokenizer is a state machine. Iterating over an instance repeatedly
    calls the current state method; each state method consumes characters
    from the stream, may append token dicts to the token queue and may
    select the next state. Every token is a dict with at least a "type" key
    ("ParseError", "Characters", "SpaceCharacters", "StartTag", "EmptyTag",
    "Comment", "Doctype", ...).

    * self.currentToken
      Holds the token that is currently being built.

    * self.state
      Holds a reference to the state method that will be invoked next.

    * self.states
      Holds a mapping between state names and the bound methods that
      implement them.

    * self.stream
      Points to the HTMLInputStream object the characters are read from.

    * self.tokenQueue
      Holds the tokens that are ready to be yielded by __iter__.

    * self.contentModelFlag
      One of the contentModelFlags values; it decides whether "&" and "<"
      are treated as markup in the data state.
    """

    def __init__(self, stream, encoding=None):
        """Wrap `stream` in an HTMLInputStream and set up the initial state.

        `stream` and `encoding` are passed to HTMLInputStream unchanged.
        """
        self.stream = HTMLInputStream(stream, encoding)

        self.states = {
            "data": self.dataState,
            "entityData": self.entityDataState,
            "tagOpen": self.tagOpenState,
            "closeTagOpen": self.closeTagOpenState,
            "tagName": self.tagNameState,
            "beforeAttributeName": self.beforeAttributeNameState,
            "attributeName": self.attributeNameState,
            "afterAttributeName": self.afterAttributeNameState,
            "beforeAttributeValue": self.beforeAttributeValueState,
            "attributeValueDoubleQuoted": self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted": self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted": self.attributeValueUnQuotedState,
            "bogusComment": self.bogusCommentState,
            "markupDeclarationOpen": self.markupDeclarationOpenState,
            "comment": self.commentState,
            "commentDash": self.commentDashState,
            "commentEnd": self.commentEndState,
            "doctype": self.doctypeState,
            "beforeDoctypeName": self.beforeDoctypeNameState,
            "doctypeName": self.doctypeNameState,
            "afterDoctypeName": self.afterDoctypeNameState,
            "bogusDoctype": self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

        # Tokens to be processed.
        self.tokenQueue = []

    def __iter__(self):
        """Yield tokens one at a time.

        The stream is reset and the queue emptied, then the current state
        method is called over and over. After each call every queued token
        is yielded, which pauses processing until the next token is
        requested.
        """
        self.stream.reset()
        self.tokenQueue = []
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)

    # Below are various helper functions the tokenizer states use.

    def processSolidusInTag(self):
        """Handle a "/" seen inside a tag.

        If the next character is ">" and the current token is a start tag,
        the current token becomes an "EmptyTag"; otherwise a ParseError is
        queued. The character that was looked at is always put back.
        """
        # We need to consume another character to make sure it's a ">"
        data = self.stream.char()

        if self.currentToken["type"] == "StartTag" and data == u">":
            self.currentToken["type"] = "EmptyTag"
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Solidus (/) incorrectly placed in tag.")})

        # The character we just consumed needs to be put back on the stack
        # so it doesn't get lost...
        self.stream.queue.append(data)

    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference and return its character.

        `isHex` selects hexadecimal (true) or decimal (false) digits. The
        callers in this class only invoke this method when at least one
        valid digit is waiting in the stream; with no digit at all the
        int() conversion below raises ValueError.

        Returns the referenced character, or U+FFFD when the number is 0 or
        cannot be converted. A trailing ";" is discarded; when it is missing
        a ParseError is queued and the character that was read instead is
        put back.
        """
        # XXX More need to be done here. For instance, #13 should prolly be
        # converted to #10 so we don't get \r (#13 is \r right?) in the DOM
        # and such. Thoughts on this appreciated.
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range. EOF is tested first
        # so the sentinel is never checked for membership in `allowed`.
        c = self.stream.char()
        while c is not EOF and c in allowed:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # If the integer is between 127 and 160 (so 128 and bigger and 159
        # and smaller) we need to do the "windows trick".
        if 127 < charAsInt < 160:
            # XXX - removed parse error from windows 1252 entity for now
            # we may want to reenable this later
            #self.tokenQueue.append({"type": "ParseError", "data":
            #  _("Entity used with illegal number (windows-1252 reference).")})

            charAsInt = entitiesWindows1252[charAsInt - 128]

        # 0 is not a good number; use U+FFFD instead.
        if charAsInt == 0:
            charAsInt = 65533

        try:
            # XXX We should have a separate function that does "int" to
            # "unicodestring" conversion since this doesn't always work
            # according to hsivonen. Also, unichr has a limitation of 65535
            char = unichr(charAsInt)
        except (ValueError, OverflowError):
            # Only conversion failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed. `char` keeps
            # its U+FFFD default.
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity couldn't be converted to character.")})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # report a parse error.
        if c != u";":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity didn't end with ';'.")})
            self.stream.queue.append(c)

        return char

    def consumeEntity(self):
        """Try to consume a character reference following an "&".

        Returns the replacement for the entity, or None when no entity could
        be recognised. In the None case every consumed character (except a
        trailing EOF in the numeric case) is put back on the stream queue
        and a ParseError is queued.
        """
        char = None
        charStack = [self.stream.char()]
        if charStack[0] == u"#":
            # We might have a number entity here.
            charStack.extend([self.stream.char(), self.stream.char()])
            if EOF in charStack:
                # If we reach the end of the file put everything up to EOF
                # back in the queue
                charStack = charStack[:charStack.index(EOF)]
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected. Got end of file instead.")})
            else:
                if charStack[1].lower() == u"x" \
                  and charStack[2] in hexDigits:
                    # Hexadecimal entity detected. Only the first digit goes
                    # back; "#" and "x" are dropped.
                    self.stream.queue.append(charStack[2])
                    char = self.consumeNumberEntity(True)
                elif charStack[1] in digits:
                    # Decimal entity detected.
                    self.stream.queue.extend(charStack[1:])
                    char = self.consumeNumberEntity(False)
                else:
                    # No number entity detected.
                    self.stream.queue.extend(charStack)
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Numeric entity expected but none found.")})
        # Break out if we reach the end of the file
        elif charStack[0] == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Entity expected. Got end of file instead.")})
        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare these to a substring of the
            # entity names in the list until the substring no longer matches.
            filteredEntityList = [e for e in entities if
              e.startswith(charStack[0])]

            def entitiesStartingWith(name):
                return [e for e in filteredEntityList if e.startswith(name)]

            while charStack[-1] != EOF and \
              entitiesStartingWith("".join(charStack)):
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            entityName = None

            # Try to find the longest entity the string will match
            for entityLength in xrange(len(charStack) - 1, 1, -1):
                possibleEntityName = "".join(charStack[:entityLength])
                if possibleEntityName in entities:
                    entityName = possibleEntityName
                    break

            if entityName is not None:
                char = entities[entityName]

                # Look at the character directly after the matched name, not
                # at the last consumed one: when a shorter entity matched
                # (e.g. "noti;" matching "not") the characters after the
                # name must go back to the stream instead of being lost.
                # entityLength is at most len(charStack) - 1, so the index
                # is always valid.
                if charStack[entityLength] != u";":
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Named entity didn't end with ';'.")})
                    self.stream.queue.extend(charStack[entityLength:])
                else:
                    # The ";" is discarded; anything after it is put back.
                    self.stream.queue.extend(charStack[entityLength + 1:])
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity expected. Got none.")})
                self.stream.queue.extend(charStack)
        return char

    def processEntityInAttribute(self):
        """Append an entity (or a literal "&") to the current attribute value.

        This method replaces the need for "entityInAttributeValueState".
        """
        entity = self.consumeEntity()
        if entity:
            self.currentToken["data"][-1][1] += entity
        else:
            self.currentToken["data"][-1][1] += u"&"

    def emitCurrentToken(self):
        """Queue the current token and switch back to the data state.

        This is the generic handler for emitting tags; the state is set to
        "data" because that is what is needed after a token has been emitted.
        """
        # Add token to the queue to be yielded
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]

    # Below are the various tokenizer states worked out.

    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
    # documents to figure out what the order of the various if and elif
    # statements should be.

    def dataState(self):
        """Data state. Returns False at EOF, which ends tokenization."""
        data = self.stream.char()
        if data == u"&" and self.contentModelFlag in \
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
            self.state = self.states["entityData"]
        elif data == u"<" and self.contentModelFlag != \
          contentModelFlags["PLAINTEXT"]:
            self.state = self.states["tagOpen"]
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            # XXX need to check if we don't need a special "spaces" flag on
            # characters.
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
              data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            self.tokenQueue.append({"type": "Characters", "data":
              data + self.stream.charsUntil((u"&", u"<"))})
        return True

    def entityDataState(self):
        """Emit the entity after "&" in data, or a literal "&" if none."""
        entity = self.consumeEntity()
        if entity:
            self.tokenQueue.append({"type": "Characters", "data": entity})
        else:
            self.tokenQueue.append({"type": "Characters", "data": u"&"})
        self.state = self.states["data"]
        return True

    def tagOpenState(self):
        """Decide what a "<" introduces, depending on the content model."""
        data = self.stream.char()
        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            if data == u"!":
                self.state = self.states["markupDeclarationOpen"]
            elif data == u"/":
                self.state = self.states["closeTagOpen"]
            elif data in asciiLetters:
                self.currentToken = \
                  {"type": "StartTag", "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '>' instead.")})
                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
                self.state = self.states["data"]
            elif data == u"?":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '?' instead (HTML doesn't "
                  "support processing instructions).")})
                self.stream.queue.append(data)
                self.state = self.states["bogusComment"]
            else:
                # XXX
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got something else instead")})
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.append(data)
                self.state = self.states["data"]
        else:
            # We know the content model flag is set to either RCDATA or CDATA
            # now because this state can never be entered with the PLAINTEXT
            # flag.
            if data == u"/":
                self.state = self.states["closeTagOpen"]
            else:
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.append(data)
                self.state = self.states["data"]
        return True

    def closeTagOpenState(self):
        """Handle the characters following "</".

        NOTE(review): parts of this method were missing from the reviewed
        source (text between "</" and the next ">" had been stripped). The
        stripped parts were reconstructed from the surviving fragments and
        are marked below -- confirm against version control.
        """
        if self.contentModelFlag in (contentModelFlags["RCDATA"],
          contentModelFlags["CDATA"]):
            charStack = []

            # So far we know that "</" has been consumed. We now need to know
            # whether the next few characters match the name of the last
            # emitted start tag, which is still the currentToken. We also
            # need the character directly after those characters.
            # NOTE(review): reconstructed -- this loop and the first half of
            # the condition below were missing from the reviewed source.
            for x in xrange(len(self.currentToken["name"]) + 1):
                charStack.append(self.stream.char())
                # Make sure we don't get hit by EOF
                if charStack[-1] == EOF:
                    break

            # Since this is just for checking, put the characters back.
            self.stream.queue.extend(charStack)

            if self.currentToken["name"].lower() == \
              "".join(charStack[:-1]).lower() \
              and charStack[-1] in (spaceCharacters |
              frozenset((u">", u"/", u"<", EOF))):
                # Because the characters are correct we can safely switch to
                # PCDATA mode now. This also means we don't have to do it when
                # emitting the end tag token.
                self.contentModelFlag = contentModelFlags["PCDATA"]
            else:
                # NOTE(review): reconstructed -- the tail of this message and
                # the rest of this branch were missing from the source.
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag after seeing '</'. None found.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]

                # Need to return here since we don't want the rest of the
                # method to be walked through.
                return True

        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            # NOTE(review): reconstructed -- the start of this branch up to
            # the u">" comparison was missing. "data": [] is included because
            # the attribute states append to currentToken["data"] for any
            # tag token.
            data = self.stream.char()
            if data in asciiLetters:
                self.currentToken = \
                  {"type": "EndTag", "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
                self.state = self.states["data"]
            elif data == EOF:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected end of file.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]
            else:
                # NOTE(review): reconstructed -- this whole branch was
                # missing from the reviewed source.
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected character '%s' found.")
                  % data})
                self.stream.queue.append(data)
                self.state = self.states["bogusComment"]
        return True

    def tagNameState(self):
        """Accumulate the tag name of the current start/end tag token.

        NOTE(review): the head of this method (up to the u">" comparison)
        was missing from the reviewed source and has been reconstructed to
        mirror the sibling states -- confirm against version control.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data in asciiLetters:
            self.currentToken["name"] += data + \
              self.stream.charsUntil(asciiLetters, True)
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character when getting the tag name.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in the tag name.")})
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        else:
            self.currentToken["name"] += data
        return True

    def beforeAttributeNameState(self):
        """Skip whitespace and start a new [name, value] attribute pair."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute name instead.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute name instead.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def attributeNameState(self):
        """Accumulate the name of the attribute that was started last.

        When the name is complete, a ParseError is queued for each earlier
        attribute of the same tag that has the same name.
        """
        data = self.stream.char()
        leavingThisState = True
        if data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data + \
              self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == u">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list. The token is therefore
            # emitted after the duplicate check further down.
            pass
        elif data in spaceCharacters:
            self.state = self.states["afterAttributeName"]
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely
            # appended to attributes, but we do want to report the parse
            # error in time.
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Dropped duplicate attribute on tag.")})
            # XXX Fix for above XXX
            if data == u">":
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        """Handle what follows an attribute name that ended in whitespace."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data == u">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected = or end of tag.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected = or end of tag.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def beforeAttributeValueState(self):
        """Pick the quoted or unquoted attribute value state after "="."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"\"":
            self.state = self.states["attributeValueDoubleQuoted"]
        elif data == u"&":
            # Let the unquoted state deal with the entity.
            self.state = self.states["attributeValueUnQuoted"]
            self.stream.queue.append(data)
        elif data == u"'":
            self.state = self.states["attributeValueSingleQuoted"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        return True

    def attributeValueDoubleQuotedState(self):
        """Accumulate a double-quoted attribute value."""
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (\").")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + \
              self.stream.charsUntil(("\"", u"&"))
        return True

    def attributeValueSingleQuotedState(self):
        """Accumulate a single-quoted attribute value."""
        data = self.stream.char()
        if data == "'":
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (').")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + \
              self.stream.charsUntil(("'", u"&"))
        return True

    def attributeValueUnQuotedState(self):
        """Accumulate an unquoted attribute value."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
              frozenset(("&", ">", "<")) | spaceCharacters)
        return True

    def bogusCommentState(self):
        """Emit everything up to the next ">" (or EOF) as a Comment token."""
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        self.tokenQueue.append(
          {"type": "Comment", "data": self.stream.charsUntil(u">")})

        # Eat the character directly after the bogus comment which is either
        # a ">" or an EOF.
        self.stream.char()
        self.state = self.states["data"]
        return True

    def markupDeclarationOpenState(self):
        """Handle "<!": start a comment, a doctype or a bogus comment."""
        charStack = [self.stream.char(), self.stream.char()]
        if charStack == [u"-", u"-"]:
            self.currentToken = {"type": "Comment", "data": ""}
            self.state = self.states["comment"]
        else:
            # Read five more characters so that seven are available to
            # compare against "DOCTYPE".
            for x in xrange(5):
                charStack.append(self.stream.char())
            # Explicit EOF check: EOF cannot be joined into a string.
            if (EOF not in charStack and
              "".join(charStack).upper() == u"DOCTYPE"):
                self.currentToken = \
                  {"type": "Doctype", "name": "", "data": True}
                self.state = self.states["doctype"]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected '--' or 'DOCTYPE'. Not found.")})
                self.stream.queue.extend(charStack)
                self.state = self.states["bogusComment"]
        return True

    def commentState(self):
        """Accumulate comment data until a "-" or EOF."""
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentDash"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
        return True

    def commentDashState(self):
        """Handle the character after a single "-" inside a comment."""
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentEnd"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (-)")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += u"-" + data + \
              self.stream.charsUntil(u"-")
            # Consume the next character which is either a "-" or an EOF as
            # well so if there's a "-" directly after the "-" we go nicely to
            # the "comment end state" without emitting a ParseError() there.
            self.stream.char()
        return True

    def commentEndState(self):
        """Handle the character after "--" inside a comment."""
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == u"-":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected '-' after '--' found in comment.")})
            self.currentToken["data"] += data
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (--).")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected character in comment found.")})
            self.currentToken["data"] += u"--" + data
            self.state = self.states["comment"]
        return True

    def doctypeState(self):
        """Expect whitespace after the literal string "DOCTYPE"."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeDoctypeName"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("No space after literal string 'DOCTYPE'.")})
            self.stream.queue.append(data)
            self.state = self.states["beforeDoctypeName"]
        return True

    def beforeDoctypeNameState(self):
        """Skip whitespace and read the first character of the name."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data in asciiLowercase:
            self.currentToken["name"] = data.upper()
            self.state = self.states["doctypeName"]
        elif data == u">":
            # Character needs to be consumed per the specification so don't
            # invoke emitCurrentTokenWithParseError with "data" as argument.
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected > character. Expected DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["name"] = data
            self.state = self.states["doctypeName"]
        return True

    def doctypeNameState(self):
        """Accumulate the DOCTYPE name, upper-casing ASCII letters.

        The token's "data" flag starts out True and is set to False once the
        accumulated name equals "HTML".
        """
        data = self.stream.char()
        needsDoctypeCheck = False
        if data in spaceCharacters:
            self.state = self.states["afterDoctypeName"]
            needsDoctypeCheck = True
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # We can't just uppercase everything that arrives here. For
            # instance, non-ASCII characters.
            if data in asciiLowercase:
                data = data.upper()
            self.currentToken["name"] += data
            needsDoctypeCheck = True

        # After some iterations through this state it should eventually say
        # "HTML". Otherwise there's an error.
        if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
            self.currentToken["data"] = False
        return True

    def afterDoctypeNameState(self):
        """Expect whitespace or ">" after the DOCTYPE name."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.currentToken["data"] = True
            # XXX EMIT
            # EOF is put back so the data state sees it and ends
            # tokenization.
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # Translate the constant template first and interpolate the
            # character afterwards; a msgid concatenated at runtime can never
            # be found in a message catalog.
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected space or '>'. Got '%s'") % data})
            self.currentToken["data"] = True
            self.state = self.states["bogusDoctype"]
        return True

    def bogusDoctypeState(self):
        """Discard characters until ">" or EOF, then emit the doctype."""
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            # XXX EMIT
            # EOF is put back so the data state sees it and ends
            # tokenization.
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in bogus doctype.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            pass
        return True