780b8ea647d6b91e344b6dd20401a7f05b7f6317 — Martin Angers 6 months ago 2f03901
scan strings, start escapes
3 files changed, 118 insertions(+), 20 deletions(-)

M zerojson.go
A zerojson_bench_test.go
M zerojson_test.go
M zerojson.go => zerojson.go +75 -20
@@ 1,7 1,6 @@
 package zerojson
 
 import (
-	"fmt"
 	"io"
 )
 


@@ 13,9 12,21 @@
 }
 
 const (
-	// ErrIncompleteLiteral is the error representing an incomplete true, false or
-	// null literal.
-	ErrIncompleteLiteral errorString = "incomplete literal"
+	// ErrIncompleteToken is the error representing an incomplete true, false or
+	// null token.
+	ErrIncompleteToken errorString = "incomplete token"
+
+	// ErrUnclosedString is the error representing an unclosed string value, i.e.
+	// when the opening quote was encountered, but EOF was reached before the closing
+	// quote, or an unescaped code point that must be escaped is found.
+	ErrUnclosedString errorString = "unclosed string"
+
+	// ErrInvalidCodePoint is the error representing an invalid code point. This is
+	// any code point disallowed in the JSON grammar (e.g. "!" outside a string value),
+	// as well as an unescaped code point that is only allowed in escaped form within
+	// a string (quotation mark U+0022, reverse solidus U+005C, and the control characters
+	// U+0000 to U+001F).
+	ErrInvalidCodePoint errorString = "invalid code point"
 )
 
 // JSON supports 7 different values:


@@ 42,9 53,6 @@
 	trueTrail  = "rue"
 	falseTrail = "alse"
 	nullTrail  = "ull"
-
-	// Code point 00 (NUL) is invalid in JSON, use it as eof marker
-	eof = '\x00'
 )
 
 type stack struct {


@@ 134,7 142,7 @@
 	// 't' : true (v is the full literal)
 	// 'f' : false (v is the full literal)
 	// 'n' : null (v is the full literal)
-	// '\x00' : unknown or eof, only provided with a non-nil err
+	// other : unknown or eof, only provided with a non-nil err
 	//
 	// The value v is a slice into the original input, it should not be
 	// modified. If err is not nil, typ represents the type that it should


@@ 153,7 161,7 @@
 	p.advance()
 
 loop:
-	for p.cur != eof {
+	for {
 		p.skipWhitespace()
 
 		var (


@@ 171,23 179,24 @@
 		case '[':
 			p.stack.push(typ)
 		case '"':
-			//err = p.scanString()
+			err = p.scanString()
 		case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 			//first := typ
 			typ = '1'
 			//err = p.scanNumber(first)
 		case 't':
-			err = p.scanLiteral(trueTrail)
+			err = p.scanToken(trueTrail)
 		case 'f':
-			err = p.scanLiteral(falseTrail)
+			err = p.scanToken(falseTrail)
 		case 'n':
-			err = p.scanLiteral(nullTrail)
-		case eof:
-			// TODO: if a token was required, error
-			break loop
+			err = p.scanToken(nullTrail)
 
 		default:
-			err = fmt.Errorf("invalid character: %#U", typ)
+			if p.eof() {
+				// TODO: if a token was required, error
+				break loop
+			}
+			err = ErrInvalidCodePoint
 		}
 
 		if e := p.emit(start, typ, p.input[start:p.pos], err); e != nil {


@@ 197,26 206,72 @@
 	return p.emit(p.pos, p.cur, nil, io.EOF)
 }
 
-func (p *parser) scanLiteral(trail string) error {
+// scanString scans the content of a string value, advancing until p.cur is an
+// unescaped quotation mark. It is called from parse once an opening quote has
+// been seen (presumably with p.cur already on the first byte after that quote,
+// as scanToken is for its trail - TODO confirm against parse).
+//
+// It returns nil when the closing quote is reached. ErrUnclosedString is
+// returned when EOF is reached first, when a control character (U+0000 to
+// U+001F) is found unescaped, or when scanEscape rejects an escape sequence.
+//
+// NOTE(review): on success p.cur is left on the closing quote, which is not
+// consumed here - confirm that parse accounts for it.
+func (p *parser) scanString() error {
+	for p.cur != '"' {
+		switch cur := p.cur; {
+		case cur == '\\':
+			// step onto the byte following the backslash and validate it
+			p.advance()
+			if err := p.scanEscape(); err != nil {
+				return err
+			}
+			// scanEscape has already consumed what belongs to the escape, so
+			// p.cur must be re-examined as-is; falling through to the advance
+			// below would silently skip a byte (possibly the closing quote).
+			continue
+		case p.eof():
+			return ErrUnclosedString
+		case cur <= 0x1F:
+			// do not advance, the control character will be considered outside
+			// the string and will generate a distinct ErrInvalidCodePoint.
+			return ErrUnclosedString
+		}
+		p.advance()
+	}
+	return nil
+}
+
+// scanEscape validates the byte that follows a backslash inside a string
+// value; p.cur must be that byte.
+//
+// For the single-character escapes (\" \\ \/ \b \f \n \r \t) it advances past
+// the escape character and returns nil. For \u it returns nil without
+// advancing (the 4 hex digits are not scanned yet). Only lowercase 'u' is
+// accepted: the JSON grammar has no \U escape.
+//
+// For any other byte it moves back onto the backslash and returns
+// ErrUnclosedString.
+func (p *parser) scanEscape() error {
+	switch p.cur {
+	case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
+		p.advance()
+		return nil
+	case 'u':
+		// TODO: scan 4 hex digits
+		return nil
+	default:
+		// invalid escape, move back to previous byte and report unclosed
+		// string, then treat the single backslash as an invalid code point.
+		p.back()
+		return ErrUnclosedString
+	}
+}
+
+func (p *parser) scanToken(trail string) error {
 	for _, b := range []byte(trail) {
 		if p.cur != b {
-			return ErrIncompleteLiteral
+			return ErrIncompleteToken
 		}
 		p.advance()
 	}
 	return nil
 }
 
+// eof reports whether the parser is past the end of its input: the position
+// is at or beyond len(p.input) and the current byte is the zero value that
+// advance stores once the input is exhausted. Checking the position as well
+// distinguishes that state from a literal NUL byte inside the input.
+func (p *parser) eof() bool {
+	if p.pos < len(p.input) {
+		return false
+	}
+	return p.cur == 0
+}
+
 func (p *parser) advance() {
 	if p.pos >= len(p.input)-1 {
 		p.pos++
-		p.cur = eof
+		p.cur = 0
 		return
 	}
 	p.pos++
 	p.cur = p.input[p.pos]
 }
 
+// back moves the parser one byte backwards and reloads p.cur from the new
+// position. It performs no bounds checking: the resulting position must be a
+// valid index into p.input.
+func (p *parser) back() {
+	prev := p.pos - 1
+	p.pos = prev
+	p.cur = p.input[prev]
+}
+
 func (p *parser) skipWhitespace() {
 	for p.cur == '\t' || p.cur == '\r' || p.cur == '\n' || p.cur == ' ' {
 		p.advance()

A zerojson_bench_test.go => zerojson_bench_test.go +38 -0
@@ 0,0 1,38 @@
+package zerojson
+
+import (
+	"fmt"
+	"io"
+	"testing"
+)
+
+func BenchmarkTrue(b *testing.B) {
+	benchmarkInput(b, "true")
+}
+
+func BenchmarkFalse(b *testing.B) {
+	benchmarkInput(b, "false")
+}
+
+func BenchmarkNull(b *testing.B) {
+	benchmarkInput(b, "null")
+}
+
+// benchmarkInput runs the parser b.N times over input wrapped in a single
+// leading and trailing space. The emit callback turns io.EOF into nil and
+// passes every other error through; the benchmark fails with the error that
+// parse returns, if any. Parser construction is excluded from the timing.
+func benchmarkInput(b *testing.B, input string) {
+	// io.EOF becomes nil; anything else (including nil) is returned as-is.
+	onToken := func(offset int, typ byte, v []byte, err error) error {
+		if err != io.EOF {
+			return err
+		}
+		return nil
+	}
+	p := parser{
+		input: []byte(fmt.Sprintf(" %s ", input)),
+		emit:  onToken,
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// start each run with an empty stack
+		p.stack = stack{}
+		if err := p.parse(); err != nil {
+			b.Fatal(err)
+		}
+	}
+}

M zerojson_test.go => zerojson_test.go +5 -0
@@ 19,6 19,11 @@
 	}{
 		{"", "", nil},
 		{" \t\n ", "", nil},
+
+		{" nul ", "1: n: nul", ErrIncompleteToken},
+		{"t", "0: t: t", ErrIncompleteToken},
+		{"\t\n fa", "3: f: fa", ErrIncompleteToken},
+
 		{"null", "0: n: null", nil},
 		{"true", "0: t: true", nil},
 		{"false", "0: f: false", nil},