internal/encoding/text: replace use of regular expression in decoding Improve performance by replacing use of regular expressions with direct parsing code. Compared to latest version: name old time/op new time/op delta Text/Unmarshal/google_message1_proto2-4 21.8µs ± 5% 14.0µs ± 9% -35.69% (p=0.000 n=10+9) Text/Unmarshal/google_message1_proto3-4 19.6µs ± 4% 13.8µs ±10% -29.47% (p=0.000 n=10+10) Text/Unmarshal/google_message2-4 13.4ms ± 4% 4.9ms ± 4% -63.44% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 13.8µs ± 2% 14.1µs ± 4% +2.42% (p=0.011 n=9+10) Text/Marshal/google_message1_proto3-4 11.6µs ± 2% 11.8µs ± 8% ~ (p=0.573 n=8+10) Text/Marshal/google_message2-4 8.01ms ±48% 5.97ms ± 5% -25.44% (p=0.000 n=10+10) name old alloc/op new alloc/op delta Text/Unmarshal/google_message1_proto2-4 13.0kB ± 0% 12.6kB ± 0% -3.40% (p=0.000 n=10+10) Text/Unmarshal/google_message1_proto3-4 13.0kB ± 0% 12.5kB ± 0% -3.50% (p=0.000 n=10+10) Text/Unmarshal/google_message2-4 5.67MB ± 0% 5.50MB ± 0% -3.13% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 12.0kB ± 0% 12.1kB ± 0% +0.02% (p=0.000 n=10+10) Text/Marshal/google_message1_proto3-4 11.7kB ± 0% 11.7kB ± 0% +0.01% (p=0.000 n=10+10) Text/Marshal/google_message2-4 5.68MB ± 0% 5.68MB ± 0% +0.01% (p=0.000 n=10+10) name old allocs/op new allocs/op delta Text/Unmarshal/google_message1_proto2-4 142 ± 0% 142 ± 0% ~ (all equal) Text/Unmarshal/google_message1_proto3-4 156 ± 0% 156 ± 0% ~ (all equal) Text/Unmarshal/google_message2-4 70.1k ± 0% 65.4k ± 0% -6.76% (p=0.000 n=10+10) Text/Marshal/google_message1_proto2-4 91.0 ± 0% 91.0 ± 0% ~ (all equal) Text/Marshal/google_message1_proto3-4 80.0 ± 0% 80.0 ± 0% ~ (all equal) Text/Marshal/google_message2-4 36.4k ± 0% 36.4k ± 0% ~ (all equal) Change-Id: Ia5d3c16e9e33961aae03bac0d53fcfc5b1943d2a Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/173360 Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>

commit: a3369c5dc28ac197f8ec8de40a7291be4dea85b5 [log] [tgz]
author: Herbie Ong <herbie@google.com> Tue Apr 23 00:24:46 2019 -0700
committer: Herbie Ong <herbie@google.com> Tue Jul 23 22:08:16 2019 +0000
tree: 0b0896e0fbda0f087745cba5e5749684670cd2d9
parent: 57b4ff63e07d16a27f5e535e815b4d8e52d97ef9 [diff] [blame]
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
index 2b32ed9..c0513a8 100644
--- a/internal/encoding/text/decode.go
+++ b/internal/encoding/text/decode.go

@@ -8,6 +8,7 @@
 	"bytes"
 	"io"
 	"regexp"
+	"strconv"
 	"unicode/utf8"
 
 	"google.golang.org/protobuf/internal/errors"
@@ -114,12 +115,6 @@
 	return rawValueOf(items, b[:len(b):len(b)]), nil
 }
 
-// This expression is more liberal than ConsumeAnyTypeUrl in C++.
-// However, the C++ parser does not handle many legal URL strings.
-// The Go implementation is more liberal to be backwards compatible with
-// the historical Go implementation which was overly liberal (and buggy).
-var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
-
 // unmarshalKey parses the key, which may be a Name, String, or Uint.
 func (p *decoder) unmarshalKey() (v Value, err error) {
 	if p.tryConsumeChar('[') {
@@ -134,21 +129,70 @@
 			if err != nil {
 				return Value{}, err
 			}
-		} else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
-			v = rawValueOf(string(p.in[:n]), p.in[:n:n])
-			p.consume(n)
 		} else {
-			return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+			v, err = p.unmarshalURL()
+			if err != nil {
+				return Value{}, err
+			}
 		}
 		if err := p.consumeChar(']', "at end of extension name"); err != nil {
 			return Value{}, err
 		}
 		return v, nil
 	}
-	if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
-		return p.unmarshalNumber()
+	v, err = p.unmarshalName()
+	if err == nil {
+		return v, nil
 	}
-	return p.unmarshalName()
+	v, err = p.unmarshalNumberKey()
+	if err == nil {
+		return v, nil
+	}
+	return Value{}, err
+}
+
+// unmarshalURL parses an Any type URL string. The C++ parser does not handle
+// many legal URL strings. This implementation is more liberal and allows for
+// the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
+func (p *decoder) unmarshalURL() (Value, error) {
+	s := p.in
+	var size int
+	for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
+		('0' <= s[0] && s[0] <= '9') ||
+		('a' <= s[0] && s[0] <= 'z') ||
+		('A' <= s[0] && s[0] <= 'Z')) {
+		s = s[1:]
+		size++
+		if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
+			s = s[1:]
+			size++
+		}
+	}
+
+	// Last character cannot be '.' or '/'.
+	// Next byte should either be a delimiter or it is at the end.
+	if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
+		(len(s) > 0 && !isDelim(s[0])) {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	v := rawValueOf(string(p.in[:size]), p.in[:size:size])
+	p.consume(size)
+	return v, nil
+}
+
+// unmarshalNumberKey parses field number as key. Field numbers are non-negative
+// integers.
+func (p *decoder) unmarshalNumberKey() (Value, error) {
+	num, ok := parseNumber(p.in)
+	if !ok || num.neg || num.typ == numFloat {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	v, err := strconv.ParseUint(string(num.value), 0, 64)
+	if err != nil {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+	}
+	p.consume(num.size)
+	return rawValueOf(v, num.value), nil
 }
 
 func (p *decoder) unmarshalValue() (Value, error) {
@@ -163,27 +207,62 @@
 	case '{', '<':
 		return p.unmarshalMessage(true)
 	default:
-		n := matchWithDelim(nameRegexp, p.in) // zero if no match
-		if n > 0 && literals[string(p.in[:n])] == nil {
-			return p.unmarshalName()
+		n, ok := consumeName(p.in)
+		if ok && literals[string(p.in[:n])] == nil {
+			v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+			p.consume(n)
+			return v, nil
 		}
 		return p.unmarshalNumber()
 	}
 }
 
-// This expression matches all valid proto identifiers.
-var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
-
-// unmarshalName unmarshals an unquoted identifier.
+// unmarshalName unmarshals an unquoted proto identifier.
+// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
 //
 // E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
 func (p *decoder) unmarshalName() (Value, error) {
-	if n := matchWithDelim(nameRegexp, p.in); n > 0 {
-		v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
-		p.consume(n)
-		return v, nil
+	n, ok := consumeName(p.in)
+	if !ok {
+		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
 	}
-	return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+
+	v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+	p.consume(n)
+	return v, nil
+}
+
+func consumeName(input []byte) (int, bool) {
+	var n int
+
+	s := input
+	if len(s) == 0 {
+		return 0, false
+	}
+
+	switch {
+	case s[0] == '_',
+		'a' <= s[0] && s[0] <= 'z',
+		'A' <= s[0] && s[0] <= 'Z':
+		s = s[1:]
+		n++
+	default:
+		return 0, false
+	}
+
+	for len(s) > 0 && (s[0] == '_' ||
+		'a' <= s[0] && s[0] <= 'z' ||
+		'A' <= s[0] && s[0] <= 'Z' ||
+		'0' <= s[0] && s[0] <= '9') {
+		s = s[1:]
+		n++
+	}
+
+	if len(s) > 0 && !isDelim(s[0]) {
+		return 0, false
+	}
+
+	return n, true
 }
 
 func (p *decoder) consumeChar(c byte, msg string) error {
@@ -224,23 +303,12 @@
 }
 
 // Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")
+var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
 
-// matchWithDelim matches r with the input b and verifies that the match
-// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
-// As a special case, EOF is considered a delimiter.
-func matchWithDelim(r *regexp.Regexp, b []byte) int {
-	n := len(r.Find(b))
-	if n < len(b) {
-		// Check that that the next character is a delimiter.
-		c := b[n]
-		notDelim := (c == '-' || c == '+' || c == '.' || c == '_' ||
-			('a' <= c && c <= 'z') ||
-			('A' <= c && c <= 'Z') ||
-			('0' <= c && c <= '9'))
-		if notDelim {
-			return 0
-		}
-	}
-	return n
+// isDelim returns true if given byte is a delimiter character.
+func isDelim(c byte) bool {
+	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
+		('a' <= c && c <= 'z') ||
+		('A' <= c && c <= 'Z') ||
+		('0' <= c && c <= '9'))
 }
commit	a3369c5dc28ac197f8ec8de40a7291be4dea85b5	[log] [tgz]
author	Herbie Ong <herbie@google.com>	Tue Apr 23 00:24:46 2019 -0700
committer	Herbie Ong <herbie@google.com>	Tue Jul 23 22:08:16 2019 +0000
tree	0b0896e0fbda0f087745cba5e5749684670cd2d9
parent	57b4ff63e07d16a27f5e535e815b4d8e52d97ef9 [diff] [blame]