planet/planet/html5lib/tokenizer.py

try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext

from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import digits, hexDigits, EOF

from inputstream import HTMLInputStream

class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.states
      Holds a mapping between states and methods that implement the state.

    * self.stream
      Points to HTMLInputStream object.
    """

    # XXX need to fix documentation

    def __init__(self, stream, encoding=None):
        self.stream = HTMLInputStream(stream, encoding)

        self.states = {
            "data":self.dataState,
            "entityData":self.entityDataState,
            "tagOpen":self.tagOpenState,
            "closeTagOpen":self.closeTagOpenState,
            "tagName":self.tagNameState,
            "beforeAttributeName":self.beforeAttributeNameState,
            "attributeName":self.attributeNameState,
            "afterAttributeName":self.afterAttributeNameState,
            "beforeAttributeValue":self.beforeAttributeValueState,
            "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
            "bogusComment":self.bogusCommentState,
            "markupDeclarationOpen":self.markupDeclarationOpenState,
            "comment":self.commentState,
            "commentDash":self.commentDashState,
            "commentEnd":self.commentEndState,
            "doctype":self.doctypeState,
            "beforeDoctypeName":self.beforeDoctypeNameState,
            "doctypeName":self.doctypeNameState,
            "afterDoctypeName":self.afterDoctypeNameState,
            "bogusDoctype":self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

        # Tokens to be processed.
        self.tokenQueue = []

    def __iter__(self):
        """ This is where the magic happens.

        We do our usually processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.
        """
        self.stream.reset()
        self.tokenQueue = []
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)

    # Below are various helper functions the tokenizer states use worked out.
    def processSolidusInTag(self):
        """If the next character is a '>', convert the currentToken into
        an EmptyTag
        """

        # We need to consume another character to make sure it's a ">"
        data = self.stream.char()

        if self.currentToken["type"] == "StartTag" and data == u">":
            self.currentToken["type"] = "EmptyTag"
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Solidus (/) incorrectly placed in tag.")})

        # The character we just consumed need to be put back on the stack so it
        # doesn't get lost...
        self.stream.queue.append(data)

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
        """

        # XXX More need to be done here. For instance, #13 should prolly be
        # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
        # such. Thoughts on this appreciated.
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # If the integer is between 127 and 160 (so 128 and bigger and 159 and
        # smaller) we need to do the "windows trick".
        if 127 < charAsInt < 160:
            #XXX - removed parse error from windows 1252 entity for now
            #we may want to reenable this later
            #self.tokenQueue.append({"type": "ParseError", "data":
            #  _("Entity used with illegal number (windows-1252 reference).")})

            charAsInt = entitiesWindows1252[charAsInt - 128]

        # 0 is not a good number.
        if charAsInt == 0:
            charAsInt = 65533

        try:
            # XXX We should have a separate function that does "int" to
            # "unicodestring" conversion since this doesn't always work
            # according to hsivonen. Also, unichr has a limitation of 65535
            char = unichr(charAsInt)
        except:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity couldn't be converted to character.")})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != u";":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity didn't end with ';'.")})
            self.stream.queue.append(c)

        return char

    def consumeEntity(self):
        char = None
        charStack = [self.stream.char()]
        if charStack[0] == u"#":
            # We might have a number entity here.
            charStack.extend([self.stream.char(), self.stream.char()])
            if EOF in charStack:
                # If we reach the end of the file put everything up to EOF
                # back in the queue
                charStack = charStack[:charStack.index(EOF)]
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected. Got end of file instead.")})
            else:
                if charStack[1].lower() == u"x" \
                  and charStack[2] in hexDigits:
                    # Hexadecimal entity detected.
                    self.stream.queue.append(charStack[2])
                    char = self.consumeNumberEntity(True)
                elif charStack[1] in digits:
                    # Decimal entity detected.
                    self.stream.queue.extend(charStack[1:])
                    char = self.consumeNumberEntity(False)
                else:
                    # No number entity detected.
                    self.stream.queue.extend(charStack)
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Numeric entity expected but none found.")})
        # Break out if we reach the end of the file
        elif charStack[0] == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Entity expected. Got end of file instead.")})
        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            filteredEntityList = [e for e in entities if \
              e.startswith(charStack[0])]

            def entitiesStartingWith(name):
                return [e for e in filteredEntityList if e.startswith(name)]

            while charStack[-1] != EOF and\
              entitiesStartingWith("".join(charStack)):
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            entityName = None

            # Try to find the longest entity the string will match
            for entityLength in xrange(len(charStack)-1,1,-1):
                possibleEntityName = "".join(charStack[:entityLength])
                if possibleEntityName in entities:
                    entityName = possibleEntityName
                    break

            if entityName is not None:
                char = entities[entityName]

                # Check whether or not the last character returned can be
                # discarded or needs to be put back.
                if not charStack[-1] == ";":
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Named entity didn't end with ';'.")})
                    self.stream.queue.extend(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity expected. Got none.")})
                self.stream.queue.extend(charStack)
        return char

    def processEntityInAttribute(self):
        """This method replaces the need for "entityInAttributeValueState".
        """
        entity = self.consumeEntity()
        if entity:
            self.currentToken["data"][-1][1] += entity
        else:
            self.currentToken["data"][-1][1] += u"&"

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """

        # Add token to the queue to be yielded
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]


    # Below are the various tokenizer states worked out.

    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
    # documents to figure out what the order of the various if and elif
    # statements should be.

    def dataState(self):
        data = self.stream.char()
        if data == u"&" and self.contentModelFlag in\
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
            self.state = self.states["entityData"]
        elif data == u"<" and self.contentModelFlag !=\
          contentModelFlags["PLAINTEXT"]:
            self.state = self.states["tagOpen"]
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            # XXX need to check if we don't need a special "spaces" flag on
            # characters.
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
              data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            self.tokenQueue.append({"type": "Characters", "data":
              data + self.stream.charsUntil((u"&", u"<"))})
        return True

    def entityDataState(self):
        entity = self.consumeEntity()
        if entity:
            self.tokenQueue.append({"type": "Characters", "data": entity})
        else:
            self.tokenQueue.append({"type": "Characters", "data": u"&"})
        self.state = self.states["data"]
        return True

    def tagOpenState(self):
        data = self.stream.char()
        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            if data == u"!":
                self.state = self.states["markupDeclarationOpen"]
            elif data == u"/":
                self.state = self.states["closeTagOpen"]
            elif data in asciiLetters:
                self.currentToken =\
                  {"type": "StartTag", "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '>' instead.")})
                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
                self.state = self.states["data"]
            elif data == u"?":
                # XXX In theory it could be something besides a tag name. But
                # do we really care?
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got '?' instead (HTML doesn't "
                  "support processing instructions).")})
                self.stream.queue.append(data)
                self.state = self.states["bogusComment"]
            else:
                # XXX
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected tag name. Got something else instead")})
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.append(data)
                self.state = self.states["data"]
        else:
            # We know the content model flag is set to either RCDATA or CDATA
            # now because this state can never be entered with the PLAINTEXT
            # flag.
            if data == u"/":
                self.state = self.states["closeTagOpen"]
            else:
                self.tokenQueue.append({"type": "Characters", "data": u"<"})
                self.stream.queue.append(data)
                self.state = self.states["data"]
        return True

    def closeTagOpenState(self):
        if self.contentModelFlag in (contentModelFlags["RCDATA"],\
          contentModelFlags["CDATA"]):
            charStack = []

            # So far we know that "</" has been consumed. We now need to know
            # whether the next few characters match the name of last emitted
            # start tag which also happens to be the currentToken. We also need
            # to have the character directly after the characters that could
            # match the start tag name.
            for x in xrange(len(self.currentToken["name"]) + 1):
                charStack.append(self.stream.char())
                # Make sure we don't get hit by EOF
                if charStack[-1] == EOF:
                    break

            # Since this is just for checking. We put the characters back on
            # the stack.
            self.stream.queue.extend(charStack)

            if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
              and charStack[-1] in (spaceCharacters |
              frozenset((u">", u"/", u"<", EOF))):
                # Because the characters are correct we can safely switch to
                # PCDATA mode now. This also means we don't have to do it when
                # emitting the end tag token.
                self.contentModelFlag = contentModelFlags["PCDATA"]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag after seeing '</'. None found.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]

                # Need to return here since we don't want the rest of the
                # method to be walked through.
                return True

        if self.contentModelFlag == contentModelFlags["PCDATA"]:
            data = self.stream.char()
            if data in asciiLetters:
                self.currentToken =\
                  {"type": "EndTag", "name": data, "data": []}
                self.state = self.states["tagName"]
            elif data == u">":
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
                self.state = self.states["data"]
            elif data == EOF:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected end of file.")})
                self.tokenQueue.append({"type": "Characters", "data": u"</"})
                self.state = self.states["data"]
            else:
                # XXX data can be _'_...
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected closing tag. Unexpected character '" + data + "' found.")})
                self.stream.queue.append(data)
                self.state = self.states["bogusComment"]
        return True

    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data in asciiLetters:
            self.currentToken["name"] += data +\
              self.stream.charsUntil(asciiLetters, True)
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character when getting the tag name.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in the tag name.")})
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        else:
            self.currentToken["name"] += data
        return True

    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"/":
            self.processSolidusInTag()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute name instead.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute name instead.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        if data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data +\
              self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == u">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            pass
        elif data in spaceCharacters:
            self.state = self.states["afterAttributeName"]
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute name.")})
            self.emitCurrentToken()
            leavingThisState = False
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": "ParseError", "data":
                      _("Dropped duplicate attribute on tag.")})
            # XXX Fix for above XXX
            if data == u">":
                self.emitCurrentToken()
        return True

    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"=":
            self.state = self.states["beforeAttributeValue"]
        elif data == u">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        elif data == u"/":
            self.processSolidusInTag()
            self.state = self.states["beforeAttributeName"]
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected = or end of tag.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected = or end of tag.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.states["attributeName"]
        return True

    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == u"\"":
            self.state = self.states["attributeValueDoubleQuoted"]
        elif data == u"&":
            self.state = self.states["attributeValueUnQuoted"]
            self.stream.queue.append(data);
        elif data == u"'":
            self.state = self.states["attributeValueSingleQuoted"]
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character. Expected attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.states["attributeValueUnQuoted"]
        return True

    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (\").")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
              self.stream.charsUntil(("\"", u"&"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value (').")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data +\
              self.stream.charsUntil(("'", u"&"))
        return True

    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == u">":
            self.emitCurrentToken()
        elif data == u"<":
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected < character in attribute value.")})
            self.emitCurrentToken()
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in attribute value.")})
            self.emitCurrentToken()
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
              frozenset(("&", ">","<")) | spaceCharacters)
        return True

    def bogusCommentState(self):
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        self.tokenQueue.append(
          {"type": "Comment", "data": self.stream.charsUntil((u">"))})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.states["data"]
        return True

    def markupDeclarationOpenState(self):
        charStack = [self.stream.char(), self.stream.char()]
        if charStack == [u"-", u"-"]:
            self.currentToken = {"type": "Comment", "data": ""}
            self.state = self.states["comment"]
        else:
            for x in xrange(5):
                charStack.append(self.stream.char())
            # Put in explicit EOF check
            if (not EOF in charStack and
                "".join(charStack).upper() == u"DOCTYPE"):
                self.currentToken =\
                  {"type": "Doctype", "name": "", "data": True}
                self.state = self.states["doctype"]
            else:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Expected '--' or 'DOCTYPE'. Not found.")})
                self.stream.queue.extend(charStack)
                self.state = self.states["bogusComment"]
        return True

    def commentState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentDash"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
        return True

    def commentDashState(self):
        data = self.stream.char()
        if data == u"-":
            self.state = self.states["commentEnd"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (-)")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["data"] += u"-" + data +\
              self.stream.charsUntil(u"-")
            # Consume the next character which is either a "-" or an EOF as
            # well so if there's a "-" directly after the "-" we go nicely to
            # the "comment end state" without emitting a ParseError() there.
            self.stream.char()
        return True

    def commentEndState(self):
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == u"-":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected '-' after '--' found in comment.")})
            self.currentToken["data"] += data
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment (--).")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected character in comment found.")})
            self.currentToken["data"] += u"--" + data
            self.state = self.states["comment"]
        return True

    def doctypeState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.states["beforeDoctypeName"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("No space after literal string 'DOCTYPE'.")})
            self.stream.queue.append(data)
            self.state = self.states["beforeDoctypeName"]
        return True

    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data in asciiLowercase:
            self.currentToken["name"] = data.upper()
            self.state = self.states["doctypeName"]
        elif data == u">":
            # Character needs to be consumed per the specification so don't
            # invoke emitCurrentTokenWithParseError with "data" as argument.
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected > character. Expected DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file. Expected DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.currentToken["name"] = data
            self.state = self.states["doctypeName"]
        return True

    def doctypeNameState(self):
        data = self.stream.char()
        needsDoctypeCheck = False
        if data in spaceCharacters:
            self.state = self.states["afterDoctypeName"]
            needsDoctypeCheck = True
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE name.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # We can't just uppercase everything that arrives here. For
            # instance, non-ASCII characters.
            if data in asciiLowercase:
                data = data.upper()
            self.currentToken["name"] += data
            needsDoctypeCheck = True

        # After some iterations through this state it should eventually say
        # "HTML". Otherwise there's an error.
        if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
            self.currentToken["data"] = False
        return True

    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.currentToken["data"] = True
            # XXX EMIT
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in DOCTYPE.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected space or '>'. Got '" + data + "'")})
            self.currentToken["data"] = True
            self.state = self.states["bogusDoctype"]
        return True

    def bogusDoctypeState(self):
        data = self.stream.char()
        if data == u">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            # XXX EMIT
            self.stream.queue.append(data)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in bogus doctype.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            pass
        return True