~bptato/chame

fead3e276208a073bf1f4ef7627bf4e7a7fa93ef — bptato a month ago 9690ba8
htmltokenizer: get rid of some redundant checks
2 files changed, 112 insertions(+), 166 deletions(-)

M chame/htmltokenizer.nim
M chame/tokstate.nim
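
The biggest single win in this commit is that the old flushCodePointsConsumedAsCharRef / appendToCurrentAttrValue pair, together with the "consumed as part of an attribute?" branch repeated at several call sites, collapses into one appendAttrOrEmit helper taking an openArray[char]. A minimal standalone sketch of that shape, with a simplified stand-in for the Tokenizer type (MiniTokenizer, inAttr and the asserts below are illustrative, not chame's actual API):

  # Standalone sketch of the appendAttrOrEmit consolidation; the real
  # Tokenizer lives in chame/htmltokenizer.nim, this type is a stub.
  type MiniTokenizer = object
    inAttr: bool      # stand-in for consumedAsAttribute()
    attrv: string     # pending attribute value
    charbuf: string   # pending character data

  proc emit(t: var MiniTokenizer, c: char) =
    t.charbuf &= c

  # One helper replaces the old "append to attribute or emit as chars" pair:
  proc appendAttrOrEmit(t: var MiniTokenizer, s: openArray[char]) =
    if t.inAttr:
      for c in s:
        t.attrv &= c
    else:
      for c in s:
        t.emit(c)

  when isMainModule:
    var t = MiniTokenizer(inAttr: true)
    t.appendAttrOrEmit("&amp")   # inside an attribute value: goes to attrv
    doAssert t.attrv == "&amp"
    t.inAttr = false
    t.appendAttrOrEmit("&amp")   # in data: goes to the character buffer
    doAssert t.charbuf == "&amp"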
M chame/htmltokenizer.nim => chame/htmltokenizer.nim +111 -164
@@ -80,11 +80,7 @@ proc strToAtom[Handle, Atom](tokenizer: Tokenizer[Handle, Atom],
 
 proc newTokenizer*[Handle, Atom](dombuilder: DOMBuilder[Handle, Atom],
     initialState = DATA): Tokenizer[Handle, Atom] =
-  var t = Tokenizer[Handle, Atom](
-    state: initialState,
-    dombuilder: dombuilder
-  )
-  return t
+  return Tokenizer[Handle, Atom](state: initialState, dombuilder: dombuilder)
 
 proc reconsume(tokenizer: var Tokenizer, s: openArray[char]) =
   for i in countdown(s.high, 0):


@@ -117,11 +113,13 @@ proc consume(tokenizer: var Tokenizer, ibuf: openArray[char]): int =
 
 proc flushChars[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]) =
   if tokenizer.charbuf.len > 0:
-    let token = if not tokenizer.isws:
-      Token[Atom](t: CHARACTER, s: tokenizer.charbuf)
+    if tokenizer.isws:
+      tokenizer.tokqueue.add(Token[Atom](
+        t: CHARACTER_WHITESPACE,
+        s: tokenizer.charbuf
+      ))
     else:
-      Token[Atom](t: CHARACTER_WHITESPACE, s: tokenizer.charbuf)
-    tokenizer.tokqueue.add(token)
+      tokenizer.tokqueue.add(Token[Atom](t: CHARACTER, s: tokenizer.charbuf))
     tokenizer.isws = false
     tokenizer.charbuf.setLen(0)



@@ -130,12 +128,13 @@ const AttributeStates = {
   ATTRIBUTE_VALUE_UNQUOTED
 }
 
-func consumedAsAnAttribute(tokenizer: Tokenizer): bool =
+func consumedAsAttribute(tokenizer: Tokenizer): bool =
   return tokenizer.rstate in AttributeStates
 
-proc appendToCurrentAttrValue(tokenizer: var Tokenizer, c: auto) =
+proc appendToAttrValue(tokenizer: var Tokenizer, s: openArray[char]) =
   if tokenizer.attr:
-    tokenizer.attrv &= c
+    for c in s:
+      tokenizer.attrv &= c
 
 proc emit(tokenizer: var Tokenizer, c: char) =
   let isws = c in AsciiWhitespace


@@ -211,6 +210,13 @@ proc findCharRef(tokenizer: var Tokenizer, c: char, ibuf: openArray[char]):
     inc ci
   return (i, ci, entry)
 
+proc appendAttrOrEmit(tokenizer: var Tokenizer, s: openArray[char]) =
+  if tokenizer.consumedAsAttribute():
+    tokenizer.appendToAttrValue(s)
+  else:
+    for c in s:
+      tokenizer.emit(c)
+
 proc numericCharacterReferenceEndState(tokenizer: var Tokenizer) =
   const ControlMap = [
     0x20ACu16, 0, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021,


@@ -239,11 +245,7 @@ proc numericCharacterReferenceEndState(tokenizer: var Tokenizer) =
       char(u shr 12 and 0x3F or 0x80) &
       char(u shr 6 and 0x3F or 0x80) &
       char(u and 0x3F or 0x80)
-  if tokenizer.consumedAsAnAttribute():
-    tokenizer.appendToCurrentAttrValue(s)
-  else:
-    for c in s:
-      tokenizer.emit(c)
+  tokenizer.appendAttrOrEmit(s)
 
 proc flushAttr(tokenizer: var Tokenizer) =
   tokenizer.tok.attrs[tokenizer.attrna] = tokenizer.attrv


@@ -289,55 +291,38 @@ proc flushTagName(tokenizer: var Tokenizer) =
   tokenizer.tok.tagname = tokenizer.strToAtom(tokenizer.tagNameBuf)
 
 proc emitTmp(tokenizer: var Tokenizer) =
-  for c in tokenizer.tmp:
-    tokenizer.emit(c)
-
-proc flushCodePointsConsumedAsCharRef(tokenizer: var Tokenizer) =
-  if tokenizer.consumedAsAnAttribute():
-    tokenizer.appendToCurrentAttrValue(tokenizer.tmp)
-  else:
-    tokenizer.emitTmp()
+  if tokenizer.isws:
+    tokenizer.flushChars()
+  tokenizer.charbuf &= "</"
+  tokenizer.charbuf &= tokenizer.tmp
 
 # if true, redo
 proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool =
-  template emit(tok: Token) =
-    tokenizer.flushChars()
-    tokenizer.tokqueue.add(tok)
-  template reconsume_in(s: TokenizerState) =
-    tokenizer.state = s
-    return true
-  template emit(ch: char) =
-    tokenizer.emit(ch)
-  template emit(s: static string) =
-    static:
-      doAssert AsciiWhitespace notin s
-    if tokenizer.isws:
-      tokenizer.flushChars()
-    tokenizer.charbuf &= s
-
   tokenizer.tokqueue.setLen(0)
 
+  if tokenizer.isws:
+    tokenizer.flushChars()
   case tokenizer.state
   of TAG_OPEN, RCDATA_LESS_THAN_SIGN, RAWTEXT_LESS_THAN_SIGN,
       SCRIPT_DATA_LESS_THAN_SIGN, SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
-    emit '<'
+    tokenizer.charbuf &= '<'
   of END_TAG_OPEN, RCDATA_END_TAG_OPEN, RAWTEXT_END_TAG_OPEN,
       SCRIPT_DATA_END_TAG_OPEN, SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
-    emit "</"
+    tokenizer.charbuf &= "</"
   of RCDATA_END_TAG_NAME, RAWTEXT_END_TAG_NAME, SCRIPT_DATA_END_TAG_NAME,
       SCRIPT_DATA_ESCAPED_END_TAG_NAME:
-    emit "</"
     tokenizer.emitTmp()
   of BOGUS_COMMENT, BOGUS_DOCTYPE, COMMENT_END_DASH,
       COMMENT_END, COMMENT_END_BANG, COMMENT_LESS_THAN_SIGN_BANG_DASH,
       COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, COMMENT_START_DASH, COMMENT,
       COMMENT_START, COMMENT_LESS_THAN_SIGN, COMMENT_LESS_THAN_SIGN_BANG:
-    emit tokenizer.tok
+    tokenizer.flushChars()
+    tokenizer.tokqueue.add(tokenizer.tok)
   of MARKUP_DECLARATION_OPEN:
     # note: was reconsume (bogus comment)
-    emit Token[Atom](t: COMMENT)
+    tokenizer.flushChars()
+    tokenizer.tokqueue.add(Token[Atom](t: COMMENT))
   of DOCTYPE, BEFORE_DOCTYPE_NAME:
-    emit Token[Atom](t: DOCTYPE, quirks: true)
+    tokenizer.flushChars()
+    tokenizer.tokqueue.add(Token[Atom](t: DOCTYPE, quirks: true))
   of DOCTYPE_NAME, AFTER_DOCTYPE_NAME, AFTER_DOCTYPE_PUBLIC_KEYWORD,
       BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
       DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,


@@ -349,33 +334,29 @@ proc tokenizeEOF[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom]): bool =
       DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED,
       AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
     tokenizer.tok.quirks = true
-    emit tokenizer.tok
+    tokenizer.flushChars()
+    tokenizer.tokqueue.add(tokenizer.tok)
   of CDATA_SECTION_BRACKET:
-    emit ']'
     # note: was reconsume (CDATA section)
+    tokenizer.charbuf &= ']'
   of CDATA_SECTION_END:
-    emit "]]"
     # note: was reconsume (CDATA section)
+    tokenizer.charbuf &= "]]"
   of CHARACTER_REFERENCE:
-    tokenizer.tmp = "&"
-    tokenizer.flushCodePointsConsumedAsCharRef()
-    reconsume_in tokenizer.rstate
-  of NAMED_CHARACTER_REFERENCE:
-    # No match for EOF
-    tokenizer.flushCodePointsConsumedAsCharRef()
-    # note: was switch state (ambiguous ampersand state)
-    reconsume_in tokenizer.rstate
+    tokenizer.appendAttrOrEmit("&")
+    tokenizer.state = tokenizer.rstate
+    return true
   of AMBIGUOUS_AMPERSAND_STATE:
-    reconsume_in tokenizer.rstate
-  of HEXADECIMAL_CHARACTER_REFERENCE_START, DECIMAL_CHARACTER_REFERENCE_START,
+    tokenizer.state = tokenizer.rstate
+    return true
+  of NAMED_CHARACTER_REFERENCE, HEXADECIMAL_CHARACTER_REFERENCE_START,
       NUMERIC_CHARACTER_REFERENCE:
-    tokenizer.flushCodePointsConsumedAsCharRef()
-    reconsume_in tokenizer.rstate
-  of HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE,
-      NUMERIC_CHARACTER_REFERENCE_END:
+    tokenizer.appendAttrOrEmit(tokenizer.tmp)
+    tokenizer.state = tokenizer.rstate
+    return true
+  of HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE:
     tokenizer.numericCharacterReferenceEndState()
-    # we unnecessarily consumed once so reconsume
-    reconsume_in tokenizer.rstate
+    tokenizer.state = tokenizer.rstate
+    return true
   else: discard
   tokenizer.flushChars()
   false


@@ -385,12 +366,6 @@ type TokenizeResult* = enum
 
 proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
     ibuf: openArray[char]): TokenizeResult =
-  template emit(tok: Token) =
-    tokenizer.flushChars()
-    if tok.t == START_TAG:
-      tokenizer.laststart = tok
-    tokenizer.tokqueue.add(tok)
-  template emit(tok: TokenType) = emit Token[Atom](t: tok)
   template emit(s: static string) =
     static:
       doAssert AsciiWhitespace notin s


@@ -401,13 +376,13 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
     tokenizer.emit(ch)
   template emit_null =
     tokenizer.flushChars()
-    emit Token[Atom](t: CHARACTER_NULL)
+    tokenizer.tokqueue.add(Token[Atom](t: CHARACTER_NULL))
   template prepare_attrs_if_start =
-    if tokenizer.tok.t == START_TAG and tokenizer.attr and
-        tokenizer.tmp != "":
+    if tokenizer.tok.t == START_TAG and tokenizer.attr:
      tokenizer.flushAttr()
   template emit_tok =
-    emit tokenizer.tok
+    tokenizer.flushChars()
+    tokenizer.tokqueue.add(tokenizer.tok)
   template emit_replacement = emit "\uFFFD"
   template switch_state(s: TokenizerState) =
     tokenizer.state = s


@@ -484,6 +459,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       of '/': switch_state END_TAG_OPEN
       of AsciiAlpha:
         new_token Token[Atom](t: START_TAG)
+        tokenizer.laststart = tokenizer.tok
         tokenizer.tagNameBuf = $c.toLowerAscii()
         # note: was reconsume
         switch_state TAG_NAME


@@ 519,9 495,8 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
        switch_state DATA
        tokenizer.flushTagName()
        emit_tok
      of AsciiUpperAlpha: tokenizer.tagNameBuf &= c.toLowerAscii()
      of '\0': tokenizer.tagNameBuf &= "\uFFFD"
      else: tokenizer.tagNameBuf &= c
      else: tokenizer.tagNameBuf &= c.toLowerAscii()

    of RCDATA_LESS_THAN_SIGN:
      case c


@@ -546,7 +521,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
 
     of RCDATA_END_TAG_NAME:
       template anything_else =
-        emit "</"
         tokenizer.emitTmp()
         reconsume_in RCDATA
       case c


@@ -598,7 +572,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
 
     of RAWTEXT_END_TAG_NAME:
       template anything_else =
-        emit "</"
         tokenizer.emitTmp()
         reconsume_in RAWTEXT
       case c


@@ -653,7 +626,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
 
     of SCRIPT_DATA_END_TAG_NAME:
       template anything_else =
-        emit "</"
         tokenizer.emitTmp()
         reconsume_in SCRIPT_DATA
       case c


@@ -753,8 +725,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
         reconsume_in SCRIPT_DATA_ESCAPED
 
     of SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
-      case c
-      of AsciiAlpha:
+      if c in AsciiAlpha:
         new_token Token[Atom](t: END_TAG)
         tokenizer.tagNameBuf = $c.toLowerAscii()
         tokenizer.tmp &= c


@@ -766,7 +737,6 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
 
     of SCRIPT_DATA_ESCAPED_END_TAG_NAME:
       template anything_else =
-        emit "</"
         tokenizer.emitTmp()
         reconsume_in SCRIPT_DATA_ESCAPED
       case c


@@ -892,12 +862,10 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       of '=':
         leave_attribute_name_state
         switch_state BEFORE_ATTRIBUTE_VALUE
-      of AsciiUpperAlpha:
-        tokenizer.tmp &= c.toLowerAscii()
       of '\0':
         tokenizer.tmp &= "\uFFFD"
       else:
-        tokenizer.tmp &= c
+        tokenizer.tmp &= c.toLowerAscii()
 
     of AFTER_ATTRIBUTE_NAME:
       case c


@@ -927,15 +895,15 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       case c
       of '"': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
       of '&': switch_state_return CHARACTER_REFERENCE
-      of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD")
-      else: tokenizer.appendToCurrentAttrValue(c)
+      of '\0': tokenizer.appendToAttrValue("\uFFFD")
+      else: tokenizer.appendToAttrValue([c])
 
     of ATTRIBUTE_VALUE_SINGLE_QUOTED:
       case c
       of '\'': switch_state AFTER_ATTRIBUTE_VALUE_QUOTED
       of '&': switch_state_return CHARACTER_REFERENCE
-      of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD")
-      else: tokenizer.appendToCurrentAttrValue(c)
+      of '\0': tokenizer.appendToAttrValue("\uFFFD")
+      else: tokenizer.appendToAttrValue([c])
 
     of ATTRIBUTE_VALUE_UNQUOTED:
       case c


@@ -945,8 +913,8 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
         switch_state DATA
         prepare_attrs_if_start
         emit_tok
-      of '\0': tokenizer.appendToCurrentAttrValue("\uFFFD")
-      else: tokenizer.appendToCurrentAttrValue(c)
+      of '\0': tokenizer.appendToAttrValue("\uFFFD")
+      else: tokenizer.appendToAttrValue([c])
 
     of AFTER_ATTRIBUTE_VALUE_QUOTED:
       case c


@@ -1094,26 +1062,22 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
         reconsume_in COMMENT
 
     of DOCTYPE:
-      case c
-      of AsciiWhitespace: switch_state BEFORE_DOCTYPE_NAME
-      of '>': reconsume_in BEFORE_DOCTYPE_NAME
-      else: reconsume_in BEFORE_DOCTYPE_NAME
+      if c notin AsciiWhitespace:
+        tokenizer.reconsume(c)
+      switch_state BEFORE_DOCTYPE_NAME
 
     of BEFORE_DOCTYPE_NAME:
       case c
       of AsciiWhitespace: discard
-      of AsciiUpperAlpha:
-        new_token Token[Atom](t: DOCTYPE, name: some($c.toLowerAscii()))
-        switch_state DOCTYPE_NAME
       of '\0':
-        new_token Token[Atom](t: DOCTYPE, name: some($"\uFFFD"))
+        new_token Token[Atom](t: DOCTYPE, name: some("\uFFFD"))
         switch_state DOCTYPE_NAME
       of '>':
         new_token Token[Atom](t: DOCTYPE, quirks: true)
         switch_state DATA
         emit_tok
       else:
-        new_token Token[Atom](t: DOCTYPE, name: some($c))
+        new_token Token[Atom](t: DOCTYPE, name: some($c.toLowerAscii()))
         switch_state DOCTYPE_NAME
 
     of DOCTYPE_NAME:


@@ -1122,9 +1086,8 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       of '>':
         switch_state DATA
         emit_tok
-      of AsciiUpperAlpha: tokenizer.tok.name.get &= c.toLowerAscii()
       of '\0': tokenizer.tok.name.get &= "\uFFFD"
-      else: tokenizer.tok.name.get &= c
+      else: tokenizer.tok.name.get &= c.toLowerAscii()
 
     of AFTER_DOCTYPE_NAME: # note: rewritten to fit case model as we consume a char anyway
       template anything_else =


@@ -1298,14 +1261,13 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       of '>':
         switch_state DATA
         emit_tok
-      else: reconsume_in BOGUS_DOCTYPE
+      else:
+        switch_state BOGUS_DOCTYPE
 
     of BOGUS_DOCTYPE:
-      case c
-      of '>':
-        switch_state DATA
+      if c == '>':
         emit_tok
-      else: discard
+        switch_state DATA
 
     of CDATA_SECTION:
       case c


@@ -1319,8 +1281,8 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
         emit c
 
     of CDATA_SECTION_BRACKET:
-      case c
-      of ']': switch_state CDATA_SECTION_END
+      if c == ']':
+        switch_state CDATA_SECTION_END
       else:
         emit ']'
         reconsume_in CDATA_SECTION


@@ -1342,8 +1304,7 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
         tokenizer.tmp = "&#"
         switch_state NUMERIC_CHARACTER_REFERENCE
       else:
-        tokenizer.tmp = "&"
-        tokenizer.flushCodePointsConsumedAsCharRef()
+        tokenizer.appendAttrOrEmit("&")
         reconsume_in tokenizer.rstate
 
     of NAMED_CHARACTER_REFERENCE:


@@ -1360,14 +1321,13 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       tokenizer.tmp.setLen(ci + 1)
       if entry != nil and entry[ci] == ':':
         let n = tokenizer.consume(ibuf)
-        let sc = tokenizer.consumedAsAnAttribute() and tokenizer.tmp[^1] != ';'
+        let sc = tokenizer.consumedAsAttribute() and tokenizer.tmp[^1] != ';'
         if sc and n != -1 and cast[char](n) in {'='} + AsciiAlphaNumeric:
           tokenizer.reconsume(cast[char](n))
-          tokenizer.flushCodePointsConsumedAsCharRef()
+          tokenizer.appendAttrOrEmit(tokenizer.tmp)
           switch_state tokenizer.rstate
         elif sc and n == -1 and not tokenizer.isend:
           # We have to redo the above check.
           #TODO it would be great to not completely lose our state here...
           tokenizer.reconsume(tokenizer.tmp.toOpenArray(1, tokenizer.tmp.high))
           tokenizer.tmp = "&"
           break


@@ -1379,21 +1339,17 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
           while (let c = entry[ci]; c != '\0'):
             tokenizer.tmp &= c
             inc ci
-          tokenizer.flushCodePointsConsumedAsCharRef()
+          tokenizer.appendAttrOrEmit(tokenizer.tmp)
           switch_state tokenizer.rstate
       else:
-        tokenizer.flushCodePointsConsumedAsCharRef()
+        tokenizer.appendAttrOrEmit(tokenizer.tmp)
         switch_state AMBIGUOUS_AMPERSAND_STATE
 
     of AMBIGUOUS_AMPERSAND_STATE:
-      case c
-      of AsciiAlpha:
-        if tokenizer.consumedAsAnAttribute():
-          tokenizer.appendToCurrentAttrValue(c)
-        else:
-          emit c
-      of ';': reconsume_in tokenizer.rstate
-      else: reconsume_in tokenizer.rstate
+      if c in AsciiAlpha:
+        tokenizer.appendAttrOrEmit([c])
+      else:
+        reconsume_in tokenizer.rstate
 
     of NUMERIC_CHARACTER_REFERENCE:
       tokenizer.code = 0


@@ -1401,65 +1357,56 @@ proc tokenize*[Handle, Atom](tokenizer: var Tokenizer[Handle, Atom],
       of 'x', 'X':
         tokenizer.tmp &= c
         switch_state HEXADECIMAL_CHARACTER_REFERENCE_START
-      else: reconsume_in DECIMAL_CHARACTER_REFERENCE_START
-
-    of HEXADECIMAL_CHARACTER_REFERENCE_START:
-      case c
       of AsciiDigit:
         tokenizer.code = uint32(c) - uint32('0')
         # note: was reconsume
-        switch_state HEXADECIMAL_CHARACTER_REFERENCE
-      of 'a'..'f':
-        tokenizer.code = uint32(c) - uint32('a') + 10
-        # note: was reconsume
-        switch_state HEXADECIMAL_CHARACTER_REFERENCE
-      of 'A'..'F':
-        tokenizer.code = uint32(c) - uint32('A') + 10
-        # note: was reconsume
-        switch_state HEXADECIMAL_CHARACTER_REFERENCE
+        switch_state DECIMAL_CHARACTER_REFERENCE
       else:
-        tokenizer.flushCodePointsConsumedAsCharRef()
+        tokenizer.appendAttrOrEmit(tokenizer.tmp)
         reconsume_in tokenizer.rstate
 
-    of DECIMAL_CHARACTER_REFERENCE_START:
-      case c
+    of HEXADECIMAL_CHARACTER_REFERENCE_START:
+      let c2 = c.toLowerAscii()
+      case c2
       of AsciiDigit:
-        tokenizer.code = uint32(c) - uint32('0')
+        tokenizer.code = uint32(c2) - uint32('0')
         # note: was reconsume
-        switch_state DECIMAL_CHARACTER_REFERENCE
+        switch_state HEXADECIMAL_CHARACTER_REFERENCE
+      of 'a'..'f':
+        tokenizer.code = uint32(c2) - uint32('a') + 10
+        # note: was reconsume
+        switch_state HEXADECIMAL_CHARACTER_REFERENCE
       else:
-        tokenizer.flushCodePointsConsumedAsCharRef()
+        tokenizer.appendAttrOrEmit(tokenizer.tmp)
         reconsume_in tokenizer.rstate
 
     of HEXADECIMAL_CHARACTER_REFERENCE:
-      case c
+      let c2 = c.toLowerAscii()
+      case c2
      of AsciiDigit:
         if tokenizer.code < 0x10FFFF:
           tokenizer.code *= 0x10
-          tokenizer.code += uint32(c) - uint32('0')
+          tokenizer.code += uint32(c2) - uint32('0')
       of 'a'..'f':
         if tokenizer.code < 0x10FFFF:
           tokenizer.code *= 0x10
-          tokenizer.code += uint32(c) - uint32('a') + 10
-      of 'A'..'F':
-        if tokenizer.code < 0x10FFFF:
-          tokenizer.code *= 0x10
-          tokenizer.code += uint32(c) - uint32('A') + 10
-      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
-      else: reconsume_in NUMERIC_CHARACTER_REFERENCE_END
+          tokenizer.code += uint32(c2) - uint32('a') + 10
+      else:
+        if c != ';':
+          tokenizer.reconsume(c)
+        tokenizer.numericCharacterReferenceEndState()
+        switch_state tokenizer.rstate
 
     of DECIMAL_CHARACTER_REFERENCE:
-      case c
-      of AsciiDigit:
+      if c in AsciiDigit:
         if tokenizer.code < 0x10FFFF:
           tokenizer.code *= 10
           tokenizer.code += uint32(c) - uint32('0')
-      of ';': switch_state NUMERIC_CHARACTER_REFERENCE_END
-      else: reconsume_in NUMERIC_CHARACTER_REFERENCE_END
-
-    of NUMERIC_CHARACTER_REFERENCE_END:
-      tokenizer.numericCharacterReferenceEndState()
-      reconsume_in tokenizer.rstate # we unnecessarily consumed once so reconsume
+      else:
+        if c != ';':
+          tokenizer.reconsume(c)
+        tokenizer.numericCharacterReferenceEndState()
+        switch_state tokenizer.rstate
 
   return trDone
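
Another pattern repeated through the hunks above: the hexadecimal character-reference states drop their duplicated 'A'..'F' branches by case-folding the input once with toLowerAscii and matching a single range. A reduced sketch of the same trick (hexDigitValue is an illustrative helper, not part of chame):

  # Case-fold once, then one branch per digit class instead of two.
  import std/strutils

  proc hexDigitValue(c: char): int =
    let c2 = c.toLowerAscii()   # folds 'A'..'F' into 'a'..'f'
    case c2
    of '0'..'9': int(c2) - int('0')
    of 'a'..'f': int(c2) - int('a') + 10
    else: -1                    # not a hex digit

  when isMainModule:
    doAssert hexDigitValue('7') == 7
    doAssert hexDigitValue('a') == 10
    doAssert hexDigitValue('F') == 15  # no separate upper-case branch needed
    doAssert hexDigitValue(';') == -1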


M chame/tokstate.nim => chame/tokstate.nim +1 -2
@@ -29,5 +29,4 @@ type TokenizerState* = enum
   AFTER_DOCTYPE_SYSTEM_IDENTIFIER, CDATA_SECTION_BRACKET, CDATA_SECTION_END,
   NAMED_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE,
   AMBIGUOUS_AMPERSAND_STATE, HEXADECIMAL_CHARACTER_REFERENCE_START,
-  DECIMAL_CHARACTER_REFERENCE_START, HEXADECIMAL_CHARACTER_REFERENCE,
-  DECIMAL_CHARACTER_REFERENCE, NUMERIC_CHARACTER_REFERENCE_END
+  HEXADECIMAL_CHARACTER_REFERENCE, DECIMAL_CHARACTER_REFERENCE