package zerojson
import (
"fmt"
"io"
)
// errorString allows a string constant to be an error.
type errorString string
func (e errorString) Error() string {
return string(e)
}
const (
// ErrIncompleteValue is the error representing an incomplete true, false or
// null token, an unclosed string, object or array, or an incomplete number.
ErrIncompleteValue errorString = "incomplete value"
// ErrInvalidCodePoint is the error representing an invalid code point. This is
// any code point disallowed in the JSON grammar (e.g. "!" outside a string value),
// as well as an unescaped code point that is only allowed in escaped form within
// a string (quotation mark U+0022, reverse solidus U+005C, and the control characters
// U+0000 to U+001F).
ErrInvalidCodePoint errorString = "invalid code point"
// ErrIncompleteHexEsc is the error representing an invalid \uXXXX hexadecimal
// escape sequence in a string value.
ErrIncompleteHexEsc errorString = "incomplete hexadecimal escape sequence"
)
// JSON supports 7 different values:
//
// - Object, starting with '{'
// - Array, starting with '['
// - String, starting with '"'
// - Number, starting with '-' or [0-9]
// - True, starting with 't'
// - False, starting with 'f'
// - Null, starting with 'n'
//
// Of those, Object and Array can nest and as such require a stack.
// Also, when processing an Object, the parser must keep track of whether
// it is processing a key or a value (both may be strings).
const (
staticStackSize = 4
trueTrail = "rue"
falseTrail = "alse"
nullTrail = "ull"
)
type stack struct {
depth int
cur byte
// each uint64 can store 64 levels deep (1 bit is sufficient per level)
static [staticStackSize]uint64
// for very deeply-nested JSON, resort to allocation
dynamic []uint64
}
// It is expected that v is either '{' or '['. Behaviour is undefined
// if a different value is provided.
func (s *stack) push(v byte) {
// depth 0 means nothing pushed on the stack, so to find the bit
// index we need to get depth *before* incrementing it, and then
// module 64 as each uint64 has 64 bits.
wordIndex := s.depth / 64
bitIndex := s.depth % 64
s.depth++
s.cur = v
var fn func(uint, uint64) uint64
if v == '{' {
fn = setBit
} else {
fn = unsetBit
}
if wordIndex < staticStackSize {
s.static[wordIndex] = fn(uint(bitIndex), s.static[wordIndex])
return
}
wordIndex -= staticStackSize
if wordIndex >= len(s.dynamic) {
if wordIndex > len(s.dynamic) {
panic("stack wordIndex jumped ahead")
}
s.dynamic = append(s.dynamic, 0)
}
s.dynamic[wordIndex] = fn(uint(bitIndex), s.dynamic[wordIndex])
}
func setBit(at uint, word uint64) uint64 {
return word | 1<<at
}
func unsetBit(at uint, word uint64) uint64 {
return word &^ (1 << at)
}
func (s *stack) peek() byte {
if s.cur > 0 || s.depth == 0 {
return s.cur
}
ix := s.depth - 1
wordIndex := ix / 64
bitIndex := ix % 64
var word uint64
if wordIndex < staticStackSize {
word = s.static[wordIndex]
} else {
wordIndex -= staticStackSize
word = s.dynamic[wordIndex]
}
if word&(1<<uint(bitIndex)) == 0 {
s.cur = '['
} else {
s.cur = '{'
}
return s.cur
}
func (s *stack) pop() byte {
b := s.peek()
s.depth--
if s.depth < 0 {
panic("negative stack depth")
}
s.cur = 0 // unset, will need to be read on peek
return b
}
type parser struct {
// immutable fields
// input is the data to parse
input []byte
// emit is called for each parsed tokens with the type of token, the
// value and an error if the token is an invalid token. If the function returns
// an error, the parsing stops, otherwise it continues with the next
// token.
//
// The type may be:
// '{' : start of an object (v is '{')
// '[' : start of an array (v is '[')
// '}' : end of an object (v is '}')
// ']' : end of an array (v is ']')
// '"' : a string (v is the full string, with quotes)
// '1' : a number (v is the full number)
// 't' : true (v is the full literal)
// 'f' : false (v is the full literal)
// 'n' : null (v is the full literal)
// other : unknown or eof, only provided with a non-nil err
//
// The value v is a slice into the original input, it should not be
// modified. If err is not nil, typ represents the type that it should
// have been had it been valid.
emit func(offset int, typ byte, v []byte, err error) error
// parser state
cur byte
allow byte // '\x00'=any value, '<'=key, '>'=value, ':'=colon, ','=comma or ']' or '}'
pos int
stack stack
debug bool
}
func (p *parser) parse() error {
finalErr := io.EOF
// bootstrap execution
p.pos = -1
p.advance()
loop:
for {
p.skipWhitespace()
var (
typ byte
err error
)
start, atEOF := p.pos, p.eof()
typ = p.cur
p.advance()
if p.debug {
peek := p.stack.peek()
fmt.Printf("%d: stack=%c [%d : %b] ; allow=%c ; atEOF=%t ; char=%c\n", start, peek, p.stack.depth, p.stack.static[0], p.allow, atEOF, typ)
}
// TODO: probably need to emit ':' and ',' to keep track of where we are (key vs value),
// or add a "role" arg that indicates key, value or array value?
switch p.allow {
case ':':
if typ == ':' {
p.allow = '>'
continue
}
if atEOF {
finalErr = io.ErrUnexpectedEOF
break loop
}
err = ErrInvalidCodePoint
case ',':
closing := p.stack.peek() + 2 // turn '[' or '{' to ']' or '}'
if typ == ',' {
// expect another value in array or another key in object
if closing == '}' {
p.allow = '<'
} else {
p.allow = '>'
}
continue
}
if typ == closing {
p.stack.pop()
peek := p.stack.peek()
switch peek {
case '{', '[':
p.allow = ','
default:
p.allow = 0
}
} else {
if atEOF {
finalErr = io.ErrUnexpectedEOF
break loop
}
err = ErrInvalidCodePoint
}
case '<':
// key expected, must be a string or a closing object
switch typ {
case '"':
err = p.scanString()
p.allow = ':'
case '}':
p.stack.pop()
peek := p.stack.peek()
switch peek {
case '{', '[':
p.allow = ','
default:
p.allow = 0
}
default:
if atEOF {
finalErr = io.ErrUnexpectedEOF
break loop
}
err = ErrInvalidCodePoint
}
default:
allow := p.allow
if allow == '>' {
p.allow = ','
// special case for '>', if inside an array, can be a ']'
if inside := p.stack.peek(); inside == '[' && typ == ']' {
p.stack.pop()
peek := p.stack.peek()
switch peek {
case '{', '[':
p.allow = ','
default:
p.allow = 0
}
break
}
}
switch typ {
case '{':
p.stack.push(typ)
p.allow = '<'
case '[':
p.stack.push(typ)
p.allow = '>'
case '"':
err = p.scanString()
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
first := typ
typ = '1'
err = p.scanNumber(first)
case 't':
err = p.scanToken(trueTrail)
case 'f':
err = p.scanToken(falseTrail)
case 'n':
err = p.scanToken(nullTrail)
default:
if atEOF {
if allow == '>' {
finalErr = io.ErrUnexpectedEOF
}
break loop
}
err = ErrInvalidCodePoint
}
}
if e := p.emit(start, typ, p.input[start:p.pos], err); e != nil {
return e
}
}
return p.emit(p.pos, p.cur, nil, finalErr)
}
func (p *parser) scanNumber(first byte) error {
// if first == '-', the next char must be 0-9
if first == '-' {
if '0' <= p.cur && p.cur <= '9' {
first = p.cur
p.advance()
} else {
return ErrIncompleteValue
}
}
// if first digit is not '0', scan any subsequent digits
if first != '0' {
for '0' <= p.cur && p.cur <= '9' {
p.advance()
}
}
// scan any fractional digit
if p.cur == '.' {
var digitSeen bool
p.advance()
for '0' <= p.cur && p.cur <= '9' {
digitSeen = true
p.advance()
}
if !digitSeen {
return ErrIncompleteValue
}
}
// scan any exponent
if p.cur == 'e' || p.cur == 'E' {
p.advance()
if p.cur == '+' || p.cur == '-' {
p.advance()
}
var digitSeen bool
for '0' <= p.cur && p.cur <= '9' {
digitSeen = true
p.advance()
}
if !digitSeen {
return ErrIncompleteValue
}
}
return nil
}
func (p *parser) scanString() error {
for p.cur != '"' {
switch {
case p.cur == '\\':
p.advance()
if err := p.scanEscape(); err != nil {
return err
}
case p.eof():
return ErrIncompleteValue
case p.cur < 0x20:
// do not advance, the control character will be considered outside
// the string and will generate a distinct ErrInvalidCodePoint.
return ErrIncompleteValue
default:
p.advance()
}
}
// advance past the closing '"'
p.advance()
return nil
}
func (p *parser) scanEscape() error {
switch p.cur {
case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
p.advance()
return nil
case 'u', 'U':
p.advance()
return p.scanFourHex()
default:
// invalid escape, move back to previous byte and report unclosed
// string, then treat the single backslash as an invalid code point.
p.back()
return ErrIncompleteValue
}
}
func (p *parser) scanFourHex() error {
for i := 0; i < 4; i++ {
switch {
case '0' <= p.cur && p.cur <= '9':
case 'a' <= p.cur && p.cur <= 'f':
case 'A' <= p.cur && p.cur <= 'F':
default:
return ErrIncompleteHexEsc
}
p.advance()
}
return nil
}
func (p *parser) scanToken(trail string) error {
for _, b := range []byte(trail) {
if p.cur != b {
return ErrIncompleteValue
}
p.advance()
}
return nil
}
func (p *parser) eof() bool {
return p.cur == 0 && p.pos >= len(p.input)
}
func (p *parser) advance() {
if p.pos >= len(p.input)-1 {
p.pos = len(p.input)
p.cur = 0
return
}
p.pos++
p.cur = p.input[p.pos]
}
func (p *parser) back() {
p.pos--
p.cur = p.input[p.pos]
}
func (p *parser) skipWhitespace() {
for p.cur == '\t' || p.cur == '\r' || p.cur == '\n' || p.cur == ' ' {
p.advance()
}
}