internal/encoding/json: initial commit of JSON parser/serializer

Package json provides a parser and serializer for the JSON format.
This focuses on the grammar of the format and is agnostic towards specific
semantics of protobuf types.

High-level API:
	func Marshal(v Value, indent string) ([]byte, error)
	func Unmarshal(b []byte) (Value, error)
	type Type uint8
	    const Null Type ...
	type Value struct{ ... }
	    func ValueOf(v interface{}) Value
		func (v Value) Type() Type
		func (v Value) Bool() bool
		func (v Value) Number() float64
		func (v Value) String() string
		func (v Value) Array() []Value
		func (v Value) Object() [][2]Value
		func (v Value) Raw() []byte

Change-Id: I26422f6b3881ef1a11b8aa95160645b1384b27b8
Reviewed-on: https://go-review.googlesource.com/127824
Reviewed-by: Herbie Ong <herbie@google.com>
diff --git a/internal/encoding/json/decode.go b/internal/encoding/json/decode.go
new file mode 100644
index 0000000..4f7ddb6
--- /dev/null
+++ b/internal/encoding/json/decode.go
@@ -0,0 +1,194 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package json
+
+import (
+	"bytes"
+	"io"
+	"regexp"
+	"unicode/utf8"
+
+	"google.golang.org/proto/internal/errors"
+)
+
+// syntaxError wraps an error to mark it as a JSON syntax error.
+// Unmarshal type-asserts for it so that it can prefix the message with the
+// line and column at which parsing stopped.
+type syntaxError struct{ error }
+
+// newSyntaxError builds an error from the format string f and arguments x
+// using errors.New, and wraps it in a syntaxError so that callers can tell
+// it apart from other failures.
+func newSyntaxError(f string, x ...interface{}) error {
+	inner := errors.New(f, x...)
+	return syntaxError{error: inner}
+}
+
+// Unmarshal parses b as the JSON format.
+// On success it returns a Value representing the input as an AST, together
+// with whatever error the decoder's NonFatal accumulator holds.
+//
+// A syntaxError is rewritten to carry the 1-based line and column of the
+// first unconsumed byte; any other fatal error is returned unchanged.
+// Input that remains after the top-level value is reported as an error.
+func Unmarshal(b []byte) (Value, error) {
+	p := decoder{in: b}
+	p.consume(0) // skip whitespace in front of the top-level value
+	v, err := p.unmarshalValue()
+	if p.nerr.Merge(err) {
+		if rest := len(p.in); rest > 0 {
+			return Value{}, errors.New("%d bytes of unconsumed input", rest)
+		}
+		return v, p.nerr.E
+	}
+
+	serr, isSyntax := err.(syntaxError)
+	if !isSyntax {
+		return Value{}, err
+	}
+
+	// Everything in front of the decoder's remaining input was consumed
+	// before the error; derive the position from that prefix.
+	consumed := b[:len(b)-len(p.in)]
+	line := bytes.Count(consumed, []byte("\n")) + 1
+	// LastIndexByte yields -1 when there is no newline, which makes the
+	// slice below start at 0, i.e. the whole prefix is the current line.
+	lastLine := consumed[bytes.LastIndexByte(consumed, '\n')+1:]
+	column := utf8.RuneCount(lastLine) + 1 // counted in runes, not bytes
+	return Value{}, errors.New("syntax error (line %d:%d): %v", line, column, serr.error)
+}
+
+// decoder holds the parsing state for a single Unmarshal call.
+type decoder struct {
+	// nerr receives every error produced while parsing (via Merge); its E
+	// field is what Unmarshal returns on success.
+	// NOTE(review): errors.NonFatal is defined outside this file; presumably
+	// Merge reports whether parsing may continue — confirm.
+	nerr errors.NonFatal
+	// in is the input that has not been consumed yet.
+	in   []byte
+}
+
+// literalRegexp matches one of the JSON literals null, true, or false at the
+// start of the input.
+var literalRegexp = regexp.MustCompile("^(null|true|false)")
+
+// unmarshalValue parses a single JSON value from the front of p.in,
+// dispatching on its first byte to the literal, number, string, array, or
+// object parser. It returns io.ErrUnexpectedEOF when no input is left and a
+// syntax error when the first byte cannot start any value.
+func (p *decoder) unmarshalValue() (Value, error) {
+	if len(p.in) == 0 {
+		return Value{}, io.ErrUnexpectedEOF
+	}
+	switch first := p.in[0]; first {
+	case '{':
+		return p.unmarshalObject()
+	case '[':
+		return p.unmarshalArray()
+	case '"':
+		return p.unmarshalString()
+	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+		return p.unmarshalNumber()
+	case 'n', 't', 'f':
+		// The literal must be followed by a delimiter (or EOF), so that
+		// input such as "nullx" is rejected rather than split.
+		n := matchWithDelim(literalRegexp, p.in)
+		if n <= 0 {
+			return Value{}, newSyntaxError("invalid %q as literal", errRegexp.Find(p.in))
+		}
+		// Cap the capacity so the raw slice cannot grow into later input.
+		raw := p.in[:n:n]
+		var lit Value
+		switch first {
+		case 'n':
+			lit = rawValueOf(nil, raw)
+		case 't':
+			lit = rawValueOf(true, raw)
+		case 'f':
+			lit = rawValueOf(false, raw)
+		}
+		p.consume(n)
+		return lit, nil
+	}
+	return Value{}, newSyntaxError("invalid %q as value", errRegexp.Find(p.in))
+}
+
+// unmarshalArray parses a JSON array: '[', zero or more comma-separated
+// values, then ']'. The returned Value is built from the parsed elements
+// and the raw bytes spanned by the array.
+func (p *decoder) unmarshalArray() (Value, error) {
+	start := p.in // remembered so the raw span can be sliced out afterwards
+	if err := p.consumeChar('[', "at start of array"); err != nil {
+		return Value{}, err
+	}
+
+	var elems []Value
+	// Elements are only parsed when the array is not immediately closed;
+	// after each element, a comma decides whether another one follows.
+	more := len(p.in) > 0 && p.in[0] != ']'
+	for more && len(p.in) > 0 {
+		elem, err := p.unmarshalValue()
+		if !p.nerr.Merge(err) {
+			return Value{}, err
+		}
+		elems = append(elems, elem)
+		more = p.tryConsumeChar(',')
+	}
+
+	if err := p.consumeChar(']', "at end of array"); err != nil {
+		return Value{}, err
+	}
+	n := len(start) - len(p.in) // number of bytes consumed for this array
+	return rawValueOf(elems, start[:n:n]), nil
+}
+
+// unmarshalObject parses a JSON object: '{', zero or more comma-separated
+// "key": value members, then '}'. Members are collected as key/value pairs
+// in input order; the returned Value is built from those pairs and the raw
+// bytes spanned by the object.
+func (p *decoder) unmarshalObject() (Value, error) {
+	start := p.in // remembered so the raw span can be sliced out afterwards
+	if err := p.consumeChar('{', "at start of object"); err != nil {
+		return Value{}, err
+	}
+
+	var items [][2]Value
+	// Members are only parsed when the object is not immediately closed;
+	// after each member, a comma decides whether another one follows.
+	more := len(p.in) > 0 && p.in[0] != '}'
+	for more && len(p.in) > 0 {
+		key, err := p.unmarshalString()
+		if !p.nerr.Merge(err) {
+			return Value{}, err
+		}
+		if err := p.consumeChar(':', "in object"); err != nil {
+			return Value{}, err
+		}
+		val, err := p.unmarshalValue()
+		if !p.nerr.Merge(err) {
+			return Value{}, err
+		}
+		items = append(items, [2]Value{key, val})
+		more = p.tryConsumeChar(',')
+	}
+
+	if err := p.consumeChar('}', "at end of object"); err != nil {
+		return Value{}, err
+	}
+	n := len(start) - len(p.in) // number of bytes consumed for this object
+	return rawValueOf(items, start[:n:n]), nil
+}
+
+// consumeChar consumes the byte c, along with any whitespace after it, from
+// the front of the input. If the input is exhausted it returns
+// io.ErrUnexpectedEOF; if a different byte is found it returns a syntax
+// error whose message ends with msg.
+func (p *decoder) consumeChar(c byte, msg string) error {
+	switch {
+	case p.tryConsumeChar(c):
+		return nil
+	case len(p.in) == 0:
+		return io.ErrUnexpectedEOF
+	default:
+		return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
+	}
+}
+
+// tryConsumeChar consumes c and any whitespace after it when c is the next
+// input byte, and reports whether it did so. The input is left untouched
+// otherwise.
+func (p *decoder) tryConsumeChar(c byte) bool {
+	if len(p.in) == 0 || p.in[0] != c {
+		return false
+	}
+	p.consume(1)
+	return true
+}
+
+// consume drops the first n bytes of input and then every space, newline,
+// carriage return, or tab that immediately follows.
+func (p *decoder) consume(n int) {
+	rest := p.in[n:]
+	skip := 0
+	for skip < len(rest) {
+		c := rest[skip]
+		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
+			break
+		}
+		skip++
+	}
+	p.in = rest[skip:]
+}
+
+// errRegexp extracts the offending token quoted in error messages: a run of
+// up to 32 characters that look like non-delimiters (letters, digits, and
+// the characters - + . _) or, failing that, any single character other than
+// a newline.
+var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")
+
+// matchWithDelim returns the length of the match of r in b, provided the
+// match is followed by a delimiter, i.e. any byte outside of
+// [-+_.a-zA-Z0-9]. Reaching the end of b also counts as a delimiter.
+// It returns 0 when r does not match or when the match is followed by a
+// non-delimiter.
+//
+// r is expected to be anchored at the start of b, since the byte at index
+// len(match) is treated as the one following the match.
+func matchWithDelim(r *regexp.Regexp, b []byte) int {
+	n := len(r.Find(b))
+	if n >= len(b) {
+		return n // the match runs up to EOF
+	}
+	// Reject the match when the next byte would extend the token.
+	switch next := b[n]; {
+	case next == '-', next == '+', next == '.', next == '_':
+		return 0
+	case 'a' <= next && next <= 'z':
+		return 0
+	case 'A' <= next && next <= 'Z':
+		return 0
+	case '0' <= next && next <= '9':
+		return 0
+	}
+	return n
+}