internal/encoding/text: replace use of regular expression in decoding

Improve performance by replacing use of regular expressions with direct
parsing code.

Compared to latest version:

name                                     old time/op    new time/op    delta
Text/Unmarshal/google_message1_proto2-4    21.8µs ± 5%    14.0µs ± 9%  -35.69%  (p=0.000 n=10+9)
Text/Unmarshal/google_message1_proto3-4    19.6µs ± 4%    13.8µs ±10%  -29.47%  (p=0.000 n=10+10)
Text/Unmarshal/google_message2-4           13.4ms ± 4%     4.9ms ± 4%  -63.44%  (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4      13.8µs ± 2%    14.1µs ± 4%   +2.42%  (p=0.011 n=9+10)
Text/Marshal/google_message1_proto3-4      11.6µs ± 2%    11.8µs ± 8%     ~     (p=0.573 n=8+10)
Text/Marshal/google_message2-4             8.01ms ±48%    5.97ms ± 5%  -25.44%  (p=0.000 n=10+10)

name                                     old alloc/op   new alloc/op   delta
Text/Unmarshal/google_message1_proto2-4    13.0kB ± 0%    12.6kB ± 0%   -3.40%  (p=0.000 n=10+10)
Text/Unmarshal/google_message1_proto3-4    13.0kB ± 0%    12.5kB ± 0%   -3.50%  (p=0.000 n=10+10)
Text/Unmarshal/google_message2-4           5.67MB ± 0%    5.50MB ± 0%   -3.13%  (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4      12.0kB ± 0%    12.1kB ± 0%   +0.02%  (p=0.000 n=10+10)
Text/Marshal/google_message1_proto3-4      11.7kB ± 0%    11.7kB ± 0%   +0.01%  (p=0.000 n=10+10)
Text/Marshal/google_message2-4             5.68MB ± 0%    5.68MB ± 0%   +0.01%  (p=0.000 n=10+10)

name                                     old allocs/op  new allocs/op  delta
Text/Unmarshal/google_message1_proto2-4       142 ± 0%       142 ± 0%     ~     (all equal)
Text/Unmarshal/google_message1_proto3-4       156 ± 0%       156 ± 0%     ~     (all equal)
Text/Unmarshal/google_message2-4            70.1k ± 0%     65.4k ± 0%   -6.76%  (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4        91.0 ± 0%      91.0 ± 0%     ~     (all equal)
Text/Marshal/google_message1_proto3-4        80.0 ± 0%      80.0 ± 0%     ~     (all equal)
Text/Marshal/google_message2-4              36.4k ± 0%     36.4k ± 0%     ~     (all equal)

Change-Id: Ia5d3c16e9e33961aae03bac0d53fcfc5b1943d2a
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/173360
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
index 2b32ed9..c0513a8 100644
--- a/internal/encoding/text/decode.go
+++ b/internal/encoding/text/decode.go
@@ -8,6 +8,7 @@
 	"bytes"
 	"io"
 	"regexp"
+	"strconv"
 	"unicode/utf8"
 
 	"google.golang.org/protobuf/internal/errors"
@@ -114,12 +115,6 @@
 	return rawValueOf(items, b[:len(b):len(b)]), nil
 }
 
-// This expression is more liberal than ConsumeAnyTypeUrl in C++.
-// However, the C++ parser does not handle many legal URL strings.
-// The Go implementation is more liberal to be backwards compatible with
-// the historical Go implementation which was overly liberal (and buggy).
-var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
-
 // unmarshalKey parses the key, which may be a Name, String, or Uint.
 func (p *decoder) unmarshalKey() (v Value, err error) {
 	if p.tryConsumeChar('[') {
@@ -134,21 +129,70 @@
 			if err != nil {
 				return Value{}, err
 			}
-		} else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
-			v = rawValueOf(string(p.in[:n]), p.in[:n:n])
-			p.consume(n)
 		} else {
-			return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+			v, err = p.unmarshalURL()
+			if err != nil {
+				return Value{}, err
+			}
 		}
 		if err := p.consumeChar(']', "at end of extension name"); err != nil {
 			return Value{}, err
 		}
 		return v, nil
 	}
-	if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
-		return p.unmarshalNumber()
+	v, err = p.unmarshalName()
+	if err == nil {
+		return v, nil
 	}
-	return p.unmarshalName()
+	v, err = p.unmarshalNumberKey()
+	if err == nil {
+		return v, nil
+	}
+	return Value{}, err
+}
+
+// unmarshalURL parses an Any type URL string. The C++ parser does not handle
+// many legal URL strings. This implementation is more liberal and allows for
+// the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
+func (p *decoder) unmarshalURL() (Value, error) {
+	s := p.in
+	var size int
+	for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
+		('0' <= s[0] && s[0] <= '9') ||
+		('a' <= s[0] && s[0] <= 'z') ||
+		('A' <= s[0] && s[0] <= 'Z')) {
+		s = s[1:]
+		size++
+		if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
+			s = s[1:]
+			size++
+		}
+	}
+
+	// Last character cannot be '.' or '/'.
+	// Next byte should either be a delimiter or it is at the end.
+	if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
+		(len(s) > 0 && !isDelim(s[0])) {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	v := rawValueOf(string(p.in[:size]), p.in[:size:size])
+	p.consume(size)
+	return v, nil
+}
+
+// unmarshalNumberKey parses field number as key. Field numbers are non-negative
+// integers.
+func (p *decoder) unmarshalNumberKey() (Value, error) {
+	num, ok := parseNumber(p.in)
+	if !ok || num.neg || num.typ == numFloat {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	v, err := strconv.ParseUint(string(num.value), 0, 64)
+	if err != nil {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	p.consume(num.size)
+	return rawValueOf(v, num.value), nil
 }
 
 func (p *decoder) unmarshalValue() (Value, error) {
@@ -163,27 +207,62 @@
 	case '{', '<':
 		return p.unmarshalMessage(true)
 	default:
-		n := matchWithDelim(nameRegexp, p.in) // zero if no match
-		if n > 0 && literals[string(p.in[:n])] == nil {
-			return p.unmarshalName()
+		n, ok := consumeName(p.in)
+		if ok && literals[string(p.in[:n])] == nil {
+			v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+			p.consume(n)
+			return v, nil
 		}
 		return p.unmarshalNumber()
 	}
 }
 
-// This expression matches all valid proto identifiers.
-var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
-
-// unmarshalName unmarshals an unquoted identifier.
+// unmarshalName unmarshals an unquoted proto identifier.
+// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
 //
 // E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
 func (p *decoder) unmarshalName() (Value, error) {
-	if n := matchWithDelim(nameRegexp, p.in); n > 0 {
-		v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
-		p.consume(n)
-		return v, nil
+	n, ok := consumeName(p.in)
+	if !ok {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
 	}
-	return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+
+	v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+	p.consume(n)
+	return v, nil
+}
+
+func consumeName(input []byte) (int, bool) {
+	var n int
+
+	s := input
+	if len(s) == 0 {
+		return 0, false
+	}
+
+	switch {
+	case s[0] == '_',
+		'a' <= s[0] && s[0] <= 'z',
+		'A' <= s[0] && s[0] <= 'Z':
+		s = s[1:]
+		n++
+	default:
+		return 0, false
+	}
+
+	for len(s) > 0 && (s[0] == '_' ||
+		'a' <= s[0] && s[0] <= 'z' ||
+		'A' <= s[0] && s[0] <= 'Z' ||
+		'0' <= s[0] && s[0] <= '9') {
+		s = s[1:]
+		n++
+	}
+
+	if len(s) > 0 && !isDelim(s[0]) {
+		return 0, false
+	}
+
+	return n, true
 }
 
 func (p *decoder) consumeChar(c byte, msg string) error {
@@ -224,23 +303,12 @@
 }
 
 // Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")
+var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
 
-// matchWithDelim matches r with the input b and verifies that the match
-// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
-// As a special case, EOF is considered a delimiter.
-func matchWithDelim(r *regexp.Regexp, b []byte) int {
-	n := len(r.Find(b))
-	if n < len(b) {
-		// Check that that the next character is a delimiter.
-		c := b[n]
-		notDelim := (c == '-' || c == '+' || c == '.' || c == '_' ||
-			('a' <= c && c <= 'z') ||
-			('A' <= c && c <= 'Z') ||
-			('0' <= c && c <= '9'))
-		if notDelim {
-			return 0
-		}
-	}
-	return n
+// isDelim returns true if given byte is a delimiter character.
+func isDelim(c byte) bool {
+	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
+		('a' <= c && c <= 'z') ||
+		('A' <= c && c <= 'Z') ||
+		('0' <= c && c <= '9'))
 }
diff --git a/internal/encoding/text/encode.go b/internal/encoding/text/encode.go
index 982037b..1f49761 100644
--- a/internal/encoding/text/encode.go
+++ b/internal/encoding/text/encode.go
@@ -6,6 +6,7 @@
 
 import (
 	"bytes"
+	"regexp"
 	"strings"
 
 	"google.golang.org/protobuf/internal/detrand"
@@ -142,6 +143,12 @@
 	return nil
 }
 
+// This expression is more liberal than ConsumeAnyTypeUrl in C++.
+// However, the C++ parser does not handle many legal URL strings.
+// The Go implementation is more liberal to be backwards compatible with
+// the historical Go implementation which was overly liberal (and buggy).
+var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
+
 func (p *encoder) marshalKey(v Value) error {
 	switch v.Type() {
 	case String:
diff --git a/internal/encoding/text/number.go b/internal/encoding/text/number.go
index a2e08f9..4e31ee1 100644
--- a/internal/encoding/text/number.go
+++ b/internal/encoding/text/number.go
@@ -8,9 +8,7 @@
 	"bytes"
 	"io"
 	"math"
-	"regexp"
 	"strconv"
-	"strings"
 
 	"google.golang.org/protobuf/internal/errors"
 )
@@ -81,9 +79,6 @@
 		"inf":  math.Inf(+1),
 		"-inf": math.Inf(-1),
 	}
-	literalRegexp = regexp.MustCompile("^-?[a-zA-Z]+")
-	intRegexp     = regexp.MustCompile("^-?([1-9][0-9]*|0[xX][0-9a-fA-F]+|0[0-7]*)")
-	floatRegexp   = regexp.MustCompile("^-?((0|[1-9][0-9]*)?([.][0-9]*)?([eE][+-]?[0-9]+)?[fF]?)")
 )
 
 // unmarshalNumber decodes a Bool, Int, Uint, or Float64 from the input.
@@ -92,40 +87,251 @@
 	p.consume(n)
 	return v, err
 }
+
 func consumeNumber(in []byte) (Value, int, error) {
 	if len(in) == 0 {
 		return Value{}, 0, io.ErrUnexpectedEOF
 	}
-	if n := matchWithDelim(literalRegexp, in); n > 0 {
-		if v, ok := literals[string(in[:n])]; ok {
-			return rawValueOf(v, in[:n:n]), n, nil
+	if v, n := matchLiteral(in); n > 0 {
+		return rawValueOf(v, in[:n]), n, nil
+	}
+
+	num, ok := parseNumber(in)
+	if !ok {
+		return Value{}, 0, newSyntaxError("invalid %q as number or bool", errRegexp.Find(in))
+	}
+
+	if num.typ == numFloat {
+		f, err := strconv.ParseFloat(string(num.value), 64)
+		if err != nil {
+			return Value{}, 0, err
+		}
+		return rawValueOf(f, in[:num.size]), num.size, nil
+	}
+
+	if num.neg {
+		v, err := strconv.ParseInt(string(num.value), 0, 64)
+		if err != nil {
+			return Value{}, 0, err
+		}
+		return rawValueOf(v, num.value), num.size, nil
+	}
+	v, err := strconv.ParseUint(string(num.value), 0, 64)
+	if err != nil {
+		return Value{}, 0, err
+	}
+	return rawValueOf(v, num.value), num.size, nil
+}
+
+func matchLiteral(in []byte) (interface{}, int) {
+	switch in[0] {
+	case 't', 'T':
+		rest := in[1:]
+		if len(rest) == 0 || isDelim(rest[0]) {
+			return true, 1
+		}
+		if n := matchStringWithDelim("rue", rest); n > 0 {
+			return true, 4
+		}
+	case 'f', 'F':
+		rest := in[1:]
+		if len(rest) == 0 || isDelim(rest[0]) {
+			return false, 1
+		}
+		if n := matchStringWithDelim("alse", rest); n > 0 {
+			return false, 5
+		}
+	case 'n':
+		if n := matchStringWithDelim("nan", in); n > 0 {
+			return math.NaN(), 3
+		}
+	case 'i':
+		if n := matchStringWithDelim("inf", in); n > 0 {
+			return math.Inf(1), 3
+		}
+	case '-':
+		if n := matchStringWithDelim("-inf", in); n > 0 {
+			return math.Inf(-1), 4
 		}
 	}
-	if n := matchWithDelim(floatRegexp, in); n > 0 {
-		if bytes.ContainsAny(in[:n], ".eEfF") {
-			s := strings.TrimRight(string(in[:n]), "fF")
-			// Always decode float as 64-bit.
-			f, err := strconv.ParseFloat(s, 64)
-			if err != nil {
-				return Value{}, 0, err
-			}
-			return rawValueOf(f, in[:n:n]), n, nil
+	return nil, 0
+}
+
+func matchStringWithDelim(s string, b []byte) int {
+	if !bytes.HasPrefix(b, []byte(s)) {
+		return 0
+	}
+
+	n := len(s)
+	if n < len(b) && !isDelim(b[n]) {
+		return 0
+	}
+	return n
+}
+
+type numType uint8
+
+const (
+	numDec numType = (1 << iota) / 2
+	numHex
+	numOct
+	numFloat
+)
+
+// number is the result of parsing out a valid number from parseNumber. It
+// contains data for doing float or integer conversion via the strconv package.
+type number struct {
+	typ numType
+	neg bool
+	// Size of input taken up by the number. This may not be the same as
+	// len(number.value).
+	size int
+	// Bytes for doing strconv.Parse{Float,Int,Uint} conversion.
+	value []byte
+}
+
+// parseNumber constructs a number object from given input. It allows for the
+// following patterns:
+//   integer: ^-?([1-9][0-9]*|0[xX][0-9a-fA-F]+|0[0-7]*)
+//   float: ^-?((0|[1-9][0-9]*)?([.][0-9]*)?([eE][+-]?[0-9]+)?[fF]?)
+func parseNumber(input []byte) (number, bool) {
+	var size int
+	var neg bool
+	typ := numDec
+
+	s := input
+	if len(s) == 0 {
+		return number{}, false
+	}
+
+	// Optional -
+	if s[0] == '-' {
+		neg = true
+		s = s[1:]
+		size++
+		if len(s) == 0 {
+			return number{}, false
 		}
 	}
-	if n := matchWithDelim(intRegexp, in); n > 0 {
-		if in[0] == '-' {
-			v, err := strconv.ParseInt(string(in[:n]), 0, 64)
-			if err != nil {
-				return Value{}, 0, err
+
+	// C++ allows for whitespace and comments in between the negative sign and
+	// the rest of the number. This logic currently does not but is consistent
+	// with v1.
+
+	switch {
+	case s[0] == '0':
+		if len(s) > 1 {
+			switch {
+			case s[1] == 'x' || s[1] == 'X':
+				// Parse as hex number.
+				typ = numHex
+				n := 2
+				s = s[2:]
+				for len(s) > 0 && (('0' <= s[0] && s[0] <= '9') ||
+					('a' <= s[0] && s[0] <= 'f') ||
+					('A' <= s[0] && s[0] <= 'F')) {
+					s = s[1:]
+					n++
+				}
+				if n == 2 {
+					return number{}, false
+				}
+				size += n
+
+			case '0' <= s[1] && s[1] <= '7':
+				// Parse as octal number.
+				typ = numOct
+				n := 2
+				s = s[2:]
+				for len(s) > 0 && '0' <= s[0] && s[0] <= '7' {
+					s = s[1:]
+					n++
+				}
+				size += n
 			}
-			return rawValueOf(v, in[:n:n]), n, nil
-		} else {
-			v, err := strconv.ParseUint(string(in[:n]), 0, 64)
-			if err != nil {
-				return Value{}, 0, err
+
+			if typ&(numHex|numOct) > 0 {
+				if len(s) > 0 && !isDelim(s[0]) {
+					return number{}, false
+				}
+				return number{
+					typ:   typ,
+					size:  size,
+					neg:   neg,
+					value: input[:size],
+				}, true
 			}
-			return rawValueOf(v, in[:n:n]), n, nil
 		}
+		s = s[1:]
+		size++
+
+	case '1' <= s[0] && s[0] <= '9':
+		n := 1
+		s = s[1:]
+		for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+			s = s[1:]
+			n++
+		}
+		size += n
+
+	case s[0] == '.':
+		// Handled below.
+
+	default:
+		return number{}, false
 	}
-	return Value{}, 0, newSyntaxError("invalid %q as number or bool", errRegexp.Find(in))
+
+	// . followed by 0 or more digits.
+	if len(s) > 0 && s[0] == '.' {
+		typ = numFloat
+		n := 1
+		s = s[1:]
+		for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+			s = s[1:]
+			n++
+		}
+		size += n
+	}
+
+	// e or E followed by an optional - or + and 1 or more digits.
+	if len(s) >= 2 && (s[0] == 'e' || s[0] == 'E') {
+		typ = numFloat
+		s = s[1:]
+		n := 1
+		if s[0] == '+' || s[0] == '-' {
+			s = s[1:]
+			n++
+			if len(s) == 0 {
+				return number{}, false
+			}
+		}
+		for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+			s = s[1:]
+			n++
+		}
+		size += n
+	}
+
+	// At this point, input[:size] contains a valid number that can be converted
+	// via strconv.Parse{Float,Int,Uint}.
+	value := input[:size]
+
+	// Optional suffix f or F for floats.
+	if len(s) > 0 && (s[0] == 'f' || s[0] == 'F') {
+		typ = numFloat
+		s = s[1:]
+		size++
+	}
+
+	// Check that next byte is a delimiter or it is at the end.
+	if len(s) > 0 && !isDelim(s[0]) {
+		return number{}, false
+	}
+
+	return number{
+		typ:   typ,
+		size:  size,
+		neg:   neg,
+		value: value,
+	}, true
 }
diff --git a/internal/encoding/text/text_test.go b/internal/encoding/text/text_test.go
index ee43190..0e40fa8 100644
--- a/internal/encoding/text/text_test.go
+++ b/internal/encoding/text/text_test.go
@@ -120,6 +120,15 @@
 		in:      "[$]",
 		wantErr: `invalid "$" as identifier`,
 	}, {
+		in:      `[proto.package.]:0`,
+		wantErr: `invalid "proto.package." as identifier`,
+	}, {
+		in:      `[/proto.package]:0`,
+		wantErr: `invalid "/proto.package" as identifier`,
+	}, {
+		in:      `[proto.package/]:0`,
+		wantErr: `invalid "proto.package/" as identifier`,
+	}, {
 		// This parses fine, but should result in a error later since no
 		// type name in proto will ever be just a number.
 		in:      "[20]:0",