internal/encoding/json: improve decoding speed and memory allocation

Replace the regexp used for matching the literals true, false, and null with a simple bytes comparison. This yields a small gain.

Stop computing the position eagerly in Value, as it is only needed in error messages. To preserve the ability to compute the position later, store the original input in Value instead of just the slice containing the value; this also requires storing the start index and size of the parsed value.

The benchmark in encoding/bench_test.go now shows faster time and less memory usage than V1.

name          old time/op    new time/op    delta
JSONEncode-4  30.3ms ± 1%    10.3ms ± 1%    -66.02%  (p=0.000 n=9+8)
JSONDecode-4  54.4ms ± 3%    18.9ms ± 2%    -65.33%  (p=0.000 n=10+10)

name          old alloc/op   new alloc/op   delta
JSONEncode-4  10.3MB ± 0%    3.9MB ± 0%     -61.74%  (p=0.000 n=10+9)
JSONDecode-4  19.0MB ± 0%    3.6MB ± 0%     -81.29%  (p=0.000 n=10+9)

name          old allocs/op  new allocs/op  delta
JSONEncode-4  465k ± 0%      64k ± 0%       -86.30%  (p=0.000 n=10+8)
JSONDecode-4  289k ± 0%      163k ± 0%      -43.69%  (p=0.000 n=10+9)

Change-Id: I0a3108d675d6442674facb065aaebd14051f6c5d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/172662
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>

commit: decef41dccee58c474ca03c1100a1abaa6f30549 [log] [tgz]
author: Herbie Ong <herbie@google.com> Wed Apr 17 15:47:43 2019 -0700
committer: Herbie Ong <herbie@google.com> Mon Apr 29 22:13:25 2019 +0000
tree: 6d3dbad9b01cdbf29f04a4dcd8e1dfe9d2976252
parent: 4f0be71f91df46e6b018cde46fcef69ca915042b [diff] [blame]
diff --git a/internal/encoding/json/decode.go b/internal/encoding/json/decode.go
index 0ee6c85..545fb42 100644
--- a/internal/encoding/json/decode.go
+++ b/internal/encoding/json/decode.go

@@ -61,8 +61,8 @@
 }
 
 // Read returns the next JSON value. It will return an error if there is no
-// valid value.  For String types containing invalid UTF8 characters, a
-// non-fatal error is returned and caller can call Read for the next value.
+// valid value. For String types containing invalid UTF8 characters, a non-fatal
+// error is returned and caller can call Read for the next value.
 func (d *Decoder) Read() (Value, error) {
 	defer func() { d.lastCall = readCall }()
 	if d.lastCall == peekCall {
@@ -70,10 +70,11 @@
 	}
 
 	var nerr errors.NonFatal
-	value, n, err := d.parseNext()
+	value, err := d.parseNext()
 	if !nerr.Merge(err) {
 		return Value{}, err
 	}
+	n := value.size
 
 	switch value.typ {
 	case EOF:
@@ -137,9 +138,8 @@
 		}
 	}
 
-	// Update lastType only after validating value to be in the right
-	// sequence.
-	d.value.typ = value.typ
+	// Update d.value only after validating value to be in the right sequence.
+	d.value = value
 	d.in = d.in[n:]
 
 	if d.value.typ == comma {
@@ -148,77 +148,79 @@
 	return value, nerr.E
 }
 
-var (
-	literalRegexp = regexp.MustCompile(`^(null|true|false)`)
-	// Any sequence that looks like a non-delimiter (for error reporting).
-	errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`)
-)
+// Any sequence that looks like a non-delimiter (for error reporting).
+var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9]{1,32}|.)`)
 
 // parseNext parses for the next JSON value. It returns a Value object for
-// different types, except for Name. It also returns the size that was parsed.
-// It does not handle whether the next value is in a valid sequence or not, it
-// only ensures that the value is a valid one.
-func (d *Decoder) parseNext() (value Value, n int, err error) {
+// different types, except for Name. It does not handle whether the next value
+// is in a valid sequence or not.
+func (d *Decoder) parseNext() (value Value, err error) {
 	// Trim leading spaces.
 	d.consume(0)
 
 	in := d.in
 	if len(in) == 0 {
-		return d.newValue(nil, EOF), 0, nil
+		return d.newValue(EOF, nil, 0), nil
 	}
 
 	switch in[0] {
-	case 'n', 't', 'f':
-		n := matchWithDelim(literalRegexp, in)
+	case 'n':
+		n := matchWithDelim("null", in)
 		if n == 0 {
-			return Value{}, 0, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
+			return Value{}, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
 		}
-		switch in[0] {
-		case 'n':
-			return d.newValue(in[:n], Null), n, nil
-		case 't':
-			return d.newBoolValue(in[:n], true), n, nil
-		case 'f':
-			return d.newBoolValue(in[:n], false), n, nil
+		return d.newValue(Null, in, n), nil
+
+	case 't':
+		n := matchWithDelim("true", in)
+		if n == 0 {
+			return Value{}, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
 		}
+		return d.newBoolValue(in, n, true), nil
+
+	case 'f':
+		n := matchWithDelim("false", in)
+		if n == 0 {
+			return Value{}, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
+		}
+		return d.newBoolValue(in, n, false), nil
 
 	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 		n, ok := consumeNumber(in)
 		if !ok {
-			return Value{}, 0, d.newSyntaxError("invalid number %s", errRegexp.Find(in))
+			return Value{}, d.newSyntaxError("invalid number %s", errRegexp.Find(in))
 		}
-		return d.newValue(in[:n], Number), n, nil
+		return d.newValue(Number, in, n), nil
 
 	case '"':
 		var nerr errors.NonFatal
 		s, n, err := d.parseString(in)
 		if !nerr.Merge(err) {
-			return Value{}, 0, err
+			return Value{}, err
 		}
-		return d.newStringValue(in[:n], s), n, nerr.E
+		return d.newStringValue(in, n, s), nerr.E
 
 	case '{':
-		return d.newValue(in[:1], StartObject), 1, nil
+		return d.newValue(StartObject, in, 1), nil
 
 	case '}':
-		return d.newValue(in[:1], EndObject), 1, nil
+		return d.newValue(EndObject, in, 1), nil
 
 	case '[':
-		return d.newValue(in[:1], StartArray), 1, nil
+		return d.newValue(StartArray, in, 1), nil
 
 	case ']':
-		return d.newValue(in[:1], EndArray), 1, nil
+		return d.newValue(EndArray, in, 1), nil
 
 	case ',':
-		return d.newValue(in[:1], comma), 1, nil
+		return d.newValue(comma, in, 1), nil
 	}
-	return Value{}, 0, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
+	return Value{}, d.newSyntaxError("invalid value %s", errRegexp.Find(in))
 }
 
-// position returns line and column number of parsed bytes.
-func (d *Decoder) position() (int, int) {
-	// Calculate line and column of consumed input.
-	b := d.orig[:len(d.orig)-len(d.in)]
+// position returns line and column number of index in given orig slice.
+func position(orig []byte, idx int) (int, int) {
+	b := orig[:idx]
 	line := bytes.Count(b, []byte("\n")) + 1
 	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
 		b = b[i+1:]
@@ -231,20 +233,22 @@
 // syntax errors.
 func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
 	e := errors.New(f, x...)
-	line, column := d.position()
+	line, column := position(d.orig, len(d.orig)-len(d.in))
 	return errors.New("syntax error (line %d:%d): %v", line, column, e)
 }
 
-// matchWithDelim matches r with the input b and verifies that the match
+// matchWithDelim matches s with the input b and verifies that the match
 // terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
-// As a special case, EOF is considered a delimiter.
-func matchWithDelim(r *regexp.Regexp, b []byte) int {
-	n := len(r.Find(b))
-	if n < len(b) {
-		// Check that the next character is a delimiter.
-		if isNotDelim(b[n]) {
-			return 0
-		}
+// As a special case, EOF is considered a delimiter. It returns the length of s
+// if there is a match, else 0.
+func matchWithDelim(s string, b []byte) int {
+	if !bytes.HasPrefix(b, []byte(s)) {
+		return 0
+	}
+
+	n := len(s)
+	if n < len(b) && isNotDelim(b[n]) {
+		return 0
 	}
 	return n
 }
@@ -290,37 +294,34 @@
 }
 
 // newValue constructs a Value for given Type.
-func (d *Decoder) newValue(input []byte, typ Type) Value {
-	line, column := d.position()
+func (d *Decoder) newValue(typ Type, input []byte, size int) Value {
 	return Value{
-		input:  input,
-		line:   line,
-		column: column,
-		typ:    typ,
+		typ:   typ,
+		input: d.orig,
+		start: len(d.orig) - len(input),
+		size:  size,
 	}
 }
 
 // newBoolValue constructs a Value for a JSON boolean.
-func (d *Decoder) newBoolValue(input []byte, b bool) Value {
-	line, column := d.position()
+func (d *Decoder) newBoolValue(input []byte, size int, b bool) Value {
 	return Value{
-		input:  input,
-		line:   line,
-		column: column,
-		typ:    Bool,
-		boo:    b,
+		typ:   Bool,
+		input: d.orig,
+		start: len(d.orig) - len(input),
+		size:  size,
+		boo:   b,
 	}
 }
 
 // newStringValue constructs a Value for a JSON string.
-func (d *Decoder) newStringValue(input []byte, s string) Value {
-	line, column := d.position()
+func (d *Decoder) newStringValue(input []byte, size int, s string) Value {
 	return Value{
-		input:  input,
-		line:   line,
-		column: column,
-		typ:    String,
-		str:    s,
+		typ:   String,
+		input: d.orig,
+		start: len(d.orig) - len(input),
+		size:  size,
+		str:   s,
 	}
 }
 
@@ -332,23 +333,29 @@
 	return &ret
 }
 
-// Value contains a JSON type and value parsed from calling Decoder.Read.
+// Value provides a parsed JSON type and value.
+//
+// The original input slice is stored in this struct in order to compute for
+// position as needed. The raw JSON value is derived from the original input
+// slice given start and size.
+//
 // For JSON boolean and string, it holds the converted value in boo and str
-// fields respectively. For JSON number, input field holds a valid number which
-// is converted only in Int or Float. Other JSON types do not require any
+// fields respectively. For JSON number, the raw JSON value holds a valid number
+// which is converted only in Int or Float. Other JSON types do not require any
 // additional data.
 type Value struct {
-	input  []byte
-	line   int
-	column int
-	typ    Type
-	boo    bool
-	str    string
+	typ   Type
+	input []byte
+	start int
+	size  int
+	boo   bool
+	str   string
 }
 
 func (v Value) newError(f string, x ...interface{}) error {
 	e := errors.New(f, x...)
-	return errors.New("error (line %d:%d): %v", v.line, v.column, e)
+	line, col := v.Position()
+	return errors.New("error (line %d:%d): %v", line, col, e)
 }
 
 // Type returns the JSON type.
@@ -358,13 +365,13 @@
 
 // Position returns the line and column of the value.
 func (v Value) Position() (int, int) {
-	return v.line, v.column
+	return position(v.input, v.start)
 }
 
 // Bool returns the bool value if token is Bool, else it will return an error.
 func (v Value) Bool() (bool, error) {
 	if v.typ != Bool {
-		return false, v.newError("%s is not a bool", v.input)
+		return false, v.newError("%s is not a bool", v.Raw())
 	}
 	return v.boo, nil
 }
@@ -373,7 +380,7 @@
 // string if token is not a string.
 func (v Value) String() string {
 	if v.typ != String {
-		return string(v.input)
+		return v.Raw()
 	}
 	return v.str
 }
@@ -381,14 +388,14 @@
 // Name returns the object name if token is Name, else it will return an error.
 func (v Value) Name() (string, error) {
 	if v.typ != Name {
-		return "", v.newError("%s is not an object name", v.input)
+		return "", v.newError("%s is not an object name", v.Raw())
 	}
 	return v.str, nil
 }
 
 // Raw returns the read value in string.
 func (v Value) Raw() string {
-	return string(v.input)
+	return string(v.input[v.start : v.start+v.size])
 }
 
 // Float returns the floating-point number if token is Number, else it will
@@ -401,9 +408,9 @@
 // bitSize.
 func (v Value) Float(bitSize int) (float64, error) {
 	if v.typ != Number {
-		return 0, v.newError("%s is not a number", v.input)
+		return 0, v.newError("%s is not a number", v.Raw())
 	}
-	f, err := strconv.ParseFloat(string(v.input), bitSize)
+	f, err := strconv.ParseFloat(v.Raw(), bitSize)
 	if err != nil {
 		return 0, v.newError("%v", err)
 	}
@@ -450,7 +457,7 @@
 	if v.typ != Number {
 		return "", v.newError("%s is not a number", v.input)
 	}
-	parts, ok := parseNumber(v.input)
+	parts, ok := parseNumber(v.input[v.start : v.start+v.size])
 	if !ok {
 		return "", v.newError("%s is not a number", v.input)
 	}
commit	decef41dccee58c474ca03c1100a1abaa6f30549	[log] [tgz]
author	Herbie Ong <herbie@google.com>	Wed Apr 17 15:47:43 2019 -0700
committer	Herbie Ong <herbie@google.com>	Mon Apr 29 22:13:25 2019 +0000
tree	6d3dbad9b01cdbf29f04a4dcd8e1dfe9d2976252
parent	4f0be71f91df46e6b018cde46fcef69ca915042b [diff] [blame]