encoding/protojson: refactor to follow prototext pattern All unmarshaling error messages now contain line number and column information, except for the following errors: - `unexpected EOF` - `no support for proto1 MessageSets` - `required fields X not set` Changes to internal/encoding/json: - Moved encoding funcs in string.go and number.go into encode.go. - Separated out encoding kind constants from decoding ones. - Renamed file string.go to decode_string.go. - Renamed file number.go to decode_number.go. - Renamed Type struct to Kind. - Renamed Value struct to Token. - Token accessor methods no longer return an error. Name, Bool, and ParsedString will panic if called on the wrong kind. Float, Int, and Uint have an ok bool result to check against. - Changed Peek to return Token and error. Changes to encoding/protojson: - Updated internal/encoding/json API calls. - Added line info to most unmarshaling error messages and kept descriptions simple and consistent. Change-Id: Ie50456694f2214c5c4fafd2c9b9239680da0deec Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/218978 Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>

commit: d2ece139c61886cffb2ed8812e44ad0886661ff6 [log] [tgz]
author: Herbie Ong <herbie@google.com> Tue Jan 07 16:45:24 2020 -0800
committer: Herbie Ong <herbie@google.com> Tue Feb 11 22:59:08 2020 +0000
tree: e44ced66b5b94344deff56b634b32e58d5ed917b
parent: 3b512245dc29bb790d2d75d3199ea653d5ef2c07 [diff] [blame]
diff --git a/internal/encoding/json/encode.go b/internal/encoding/json/encode.go
index 741f34f..17dec31 100644
--- a/internal/encoding/json/encode.go
+++ b/internal/encoding/json/encode.go

@@ -5,18 +5,34 @@
 package json
 
 import (
+	"math"
+	"math/bits"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 
 	"google.golang.org/protobuf/internal/detrand"
 	"google.golang.org/protobuf/internal/errors"
 )
 
+// kind represents an encoding type.
+type kind uint8
+
+const (
+	_ kind = (1 << iota) / 2
+	name
+	scalar
+	objectOpen
+	objectClose
+	arrayOpen
+	arrayClose
+)
+
 // Encoder provides methods to write out JSON constructs and values. The user is
 // responsible for producing valid sequences of JSON constructs and values.
 type Encoder struct {
 	indent   string
-	lastType Type
+	lastKind kind
 	indents  []byte
 	out      []byte
 }
@@ -43,13 +59,13 @@
 
 // WriteNull writes out the null value.
 func (e *Encoder) WriteNull() {
-	e.prepareNext(Null)
+	e.prepareNext(scalar)
 	e.out = append(e.out, "null"...)
 }
 
 // WriteBool writes out the given boolean value.
 func (e *Encoder) WriteBool(b bool) {
-	e.prepareNext(Bool)
+	e.prepareNext(scalar)
 	if b {
 		e.out = append(e.out, "true"...)
 	} else {
@@ -57,9 +73,10 @@
 	}
 }
 
-// WriteString writes out the given string in JSON string value.
+// WriteString writes out the given string in JSON string value. Returns error
+// if input string contains invalid UTF-8.
 func (e *Encoder) WriteString(s string) error {
-	e.prepareNext(String)
+	e.prepareNext(scalar)
 	var err error
 	if e.out, err = appendString(e.out, s); err != nil {
 		return err
@@ -67,42 +84,126 @@
 	return nil
 }
 
+// Sentinel error used for indicating invalid UTF-8.
+var invalidUTF8Err = errors.New("invalid UTF-8")
+
+func appendString(out []byte, in string) ([]byte, error) {
+	out = append(out, '"')
+	i := indexNeedEscapeInString(in)
+	in, out = in[i:], append(out, in[:i]...)
+	for len(in) > 0 {
+		switch r, n := utf8.DecodeRuneInString(in); {
+		case r == utf8.RuneError && n == 1:
+			return out, invalidUTF8Err
+		case r < ' ' || r == '"' || r == '\\':
+			out = append(out, '\\')
+			switch r {
+			case '"', '\\':
+				out = append(out, byte(r))
+			case '\b':
+				out = append(out, 'b')
+			case '\f':
+				out = append(out, 'f')
+			case '\n':
+				out = append(out, 'n')
+			case '\r':
+				out = append(out, 'r')
+			case '\t':
+				out = append(out, 't')
+			default:
+				out = append(out, 'u')
+				out = append(out, "0000"[1+(bits.Len32(uint32(r))-1)/4:]...)
+				out = strconv.AppendUint(out, uint64(r), 16)
+			}
+			in = in[n:]
+		default:
+			i := indexNeedEscapeInString(in[n:])
+			in, out = in[n+i:], append(out, in[:n+i]...)
+		}
+	}
+	out = append(out, '"')
+	return out, nil
+}
+
+// indexNeedEscapeInString returns the index of the character that needs
+// escaping. If no characters need escaping, this returns the input length.
+func indexNeedEscapeInString(s string) int {
+	for i, r := range s {
+		if r < ' ' || r == '\\' || r == '"' || r == utf8.RuneError {
+			return i
+		}
+	}
+	return len(s)
+}
+
 // WriteFloat writes out the given float and bitSize in JSON number value.
 func (e *Encoder) WriteFloat(n float64, bitSize int) {
-	e.prepareNext(Number)
+	e.prepareNext(scalar)
 	e.out = appendFloat(e.out, n, bitSize)
 }
 
+// appendFloat formats given float in bitSize, and appends to the given []byte.
+func appendFloat(out []byte, n float64, bitSize int) []byte {
+	switch {
+	case math.IsNaN(n):
+		return append(out, `"NaN"`...)
+	case math.IsInf(n, +1):
+		return append(out, `"Infinity"`...)
+	case math.IsInf(n, -1):
+		return append(out, `"-Infinity"`...)
+	}
+
+	// JSON number formatting logic based on encoding/json.
+	// See floatEncoder.encode for reference.
+	fmt := byte('f')
+	if abs := math.Abs(n); abs != 0 {
+		if bitSize == 64 && (abs < 1e-6 || abs >= 1e21) ||
+			bitSize == 32 && (float32(abs) < 1e-6 || float32(abs) >= 1e21) {
+			fmt = 'e'
+		}
+	}
+	out = strconv.AppendFloat(out, n, fmt, -1, bitSize)
+	if fmt == 'e' {
+		n := len(out)
+		if n >= 4 && out[n-4] == 'e' && out[n-3] == '-' && out[n-2] == '0' {
+			out[n-2] = out[n-1]
+			out = out[:n-1]
+		}
+	}
+	return out
+}
+
 // WriteInt writes out the given signed integer in JSON number value.
 func (e *Encoder) WriteInt(n int64) {
-	e.prepareNext(Number)
+	e.prepareNext(scalar)
 	e.out = append(e.out, strconv.FormatInt(n, 10)...)
 }
 
 // WriteUint writes out the given unsigned integer in JSON number value.
 func (e *Encoder) WriteUint(n uint64) {
-	e.prepareNext(Number)
+	e.prepareNext(scalar)
 	e.out = append(e.out, strconv.FormatUint(n, 10)...)
 }
 
 // StartObject writes out the '{' symbol.
 func (e *Encoder) StartObject() {
-	e.prepareNext(StartObject)
+	e.prepareNext(objectOpen)
 	e.out = append(e.out, '{')
 }
 
 // EndObject writes out the '}' symbol.
 func (e *Encoder) EndObject() {
-	e.prepareNext(EndObject)
+	e.prepareNext(objectClose)
 	e.out = append(e.out, '}')
 }
 
 // WriteName writes out the given string in JSON string value and the name
-// separator ':'.
+// separator ':'. Returns error if input string contains invalid UTF-8, which
+// should not be likely as protobuf field names should be valid.
 func (e *Encoder) WriteName(s string) error {
-	e.prepareNext(Name)
-	// Errors returned by appendString() are non-fatal.
+	e.prepareNext(name)
 	var err error
+	// Append to output regardless of error.
 	e.out, err = appendString(e.out, s)
 	e.out = append(e.out, ':')
 	return err
@@ -110,28 +211,28 @@
 
 // StartArray writes out the '[' symbol.
 func (e *Encoder) StartArray() {
-	e.prepareNext(StartArray)
+	e.prepareNext(arrayOpen)
 	e.out = append(e.out, '[')
 }
 
 // EndArray writes out the ']' symbol.
 func (e *Encoder) EndArray() {
-	e.prepareNext(EndArray)
+	e.prepareNext(arrayClose)
 	e.out = append(e.out, ']')
 }
 
 // prepareNext adds possible comma and indentation for the next value based
-// on last type and indent option. It also updates lastType to next.
-func (e *Encoder) prepareNext(next Type) {
+// on last kind and indent option. It also updates lastKind to next.
+func (e *Encoder) prepareNext(next kind) {
 	defer func() {
-		// Set lastType to next.
-		e.lastType = next
+		// Set lastKind to next.
+		e.lastKind = next
 	}()
 
 	if len(e.indent) == 0 {
 		// Need to add comma on the following condition.
-		if e.lastType&(Null|Bool|Number|String|EndObject|EndArray) != 0 &&
-			next&(Name|Null|Bool|Number|String|StartObject|StartArray) != 0 {
+		if e.lastKind&(scalar|objectClose|arrayClose) != 0 &&
+			next&(name|scalar|objectOpen|arrayOpen) != 0 {
 			e.out = append(e.out, ',')
 			// For single-line output, add a random extra space after each
 			// comma to make output unstable.
@@ -143,28 +244,28 @@
 	}
 
 	switch {
-	case e.lastType&(StartObject|StartArray) != 0:
+	case e.lastKind&(objectOpen|arrayOpen) != 0:
 		// If next type is NOT closing, add indent and newline.
-		if next&(EndObject|EndArray) == 0 {
+		if next&(objectClose|arrayClose) == 0 {
 			e.indents = append(e.indents, e.indent...)
 			e.out = append(e.out, '\n')
 			e.out = append(e.out, e.indents...)
 		}
 
-	case e.lastType&(Null|Bool|Number|String|EndObject|EndArray) != 0:
+	case e.lastKind&(scalar|objectClose|arrayClose) != 0:
 		switch {
 		// If next type is either a value or name, add comma and newline.
-		case next&(Name|Null|Bool|Number|String|StartObject|StartArray) != 0:
+		case next&(name|scalar|objectOpen|arrayOpen) != 0:
 			e.out = append(e.out, ',', '\n')
 
 		// If next type is a closing object or array, adjust indentation.
-		case next&(EndObject|EndArray) != 0:
+		case next&(objectClose|arrayClose) != 0:
 			e.indents = e.indents[:len(e.indents)-len(e.indent)]
 			e.out = append(e.out, '\n')
 		}
 		e.out = append(e.out, e.indents...)
 
-	case e.lastType&Name != 0:
+	case e.lastKind&name != 0:
 		e.out = append(e.out, ' ')
 		// For multi-line output, add a random extra space after key: to make
 		// output unstable.
commit	d2ece139c61886cffb2ed8812e44ad0886661ff6	[log] [tgz]
author	Herbie Ong <herbie@google.com>	Tue Jan 07 16:45:24 2020 -0800
committer	Herbie Ong <herbie@google.com>	Tue Feb 11 22:59:08 2020 +0000
tree	e44ced66b5b94344deff56b634b32e58d5ed917b
parent	3b512245dc29bb790d2d75d3199ea653d5ef2c07 [diff] [blame]