internal/encoding/text: replace use of regular expression in decoding
Improve performance by replacing use of regular expressions with direct
parsing code.
Compared to latest version:
name old time/op new time/op delta
Text/Unmarshal/google_message1_proto2-4 21.8µs ± 5% 14.0µs ± 9% -35.69% (p=0.000 n=10+9)
Text/Unmarshal/google_message1_proto3-4 19.6µs ± 4% 13.8µs ±10% -29.47% (p=0.000 n=10+10)
Text/Unmarshal/google_message2-4 13.4ms ± 4% 4.9ms ± 4% -63.44% (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4 13.8µs ± 2% 14.1µs ± 4% +2.42% (p=0.011 n=9+10)
Text/Marshal/google_message1_proto3-4 11.6µs ± 2% 11.8µs ± 8% ~ (p=0.573 n=8+10)
Text/Marshal/google_message2-4 8.01ms ±48% 5.97ms ± 5% -25.44% (p=0.000 n=10+10)
name old alloc/op new alloc/op delta
Text/Unmarshal/google_message1_proto2-4 13.0kB ± 0% 12.6kB ± 0% -3.40% (p=0.000 n=10+10)
Text/Unmarshal/google_message1_proto3-4 13.0kB ± 0% 12.5kB ± 0% -3.50% (p=0.000 n=10+10)
Text/Unmarshal/google_message2-4 5.67MB ± 0% 5.50MB ± 0% -3.13% (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4 12.0kB ± 0% 12.1kB ± 0% +0.02% (p=0.000 n=10+10)
Text/Marshal/google_message1_proto3-4 11.7kB ± 0% 11.7kB ± 0% +0.01% (p=0.000 n=10+10)
Text/Marshal/google_message2-4 5.68MB ± 0% 5.68MB ± 0% +0.01% (p=0.000 n=10+10)
name old allocs/op new allocs/op delta
Text/Unmarshal/google_message1_proto2-4 142 ± 0% 142 ± 0% ~ (all equal)
Text/Unmarshal/google_message1_proto3-4 156 ± 0% 156 ± 0% ~ (all equal)
Text/Unmarshal/google_message2-4 70.1k ± 0% 65.4k ± 0% -6.76% (p=0.000 n=10+10)
Text/Marshal/google_message1_proto2-4 91.0 ± 0% 91.0 ± 0% ~ (all equal)
Text/Marshal/google_message1_proto3-4 80.0 ± 0% 80.0 ± 0% ~ (all equal)
Text/Marshal/google_message2-4 36.4k ± 0% 36.4k ± 0% ~ (all equal)
Change-Id: Ia5d3c16e9e33961aae03bac0d53fcfc5b1943d2a
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/173360
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/encoding/text/decode.go b/internal/encoding/text/decode.go
index 2b32ed9..c0513a8 100644
--- a/internal/encoding/text/decode.go
+++ b/internal/encoding/text/decode.go
@@ -8,6 +8,7 @@
"bytes"
"io"
"regexp"
+ "strconv"
"unicode/utf8"
"google.golang.org/protobuf/internal/errors"
@@ -114,12 +115,6 @@
return rawValueOf(items, b[:len(b):len(b)]), nil
}
-// This expression is more liberal than ConsumeAnyTypeUrl in C++.
-// However, the C++ parser does not handle many legal URL strings.
-// The Go implementation is more liberal to be backwards compatible with
-// the historical Go implementation which was overly liberal (and buggy).
-var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
-
// unmarshalKey parses the key, which may be a Name, String, or Uint.
func (p *decoder) unmarshalKey() (v Value, err error) {
if p.tryConsumeChar('[') {
@@ -134,21 +129,70 @@
if err != nil {
return Value{}, err
}
- } else if n := matchWithDelim(urlRegexp, p.in); n > 0 {
- v = rawValueOf(string(p.in[:n]), p.in[:n:n])
- p.consume(n)
} else {
- return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+ v, err = p.unmarshalURL()
+ if err != nil {
+ return Value{}, err
+ }
}
if err := p.consumeChar(']', "at end of extension name"); err != nil {
return Value{}, err
}
return v, nil
}
- if matchWithDelim(intRegexp, p.in) > 0 && p.in[0] != '-' {
- return p.unmarshalNumber()
+ v, err = p.unmarshalName()
+ if err == nil {
+ return v, nil
}
- return p.unmarshalName()
+ v, err = p.unmarshalNumberKey()
+ if err == nil {
+ return v, nil
+ }
+ return Value{}, err
+}
+
+// unmarshalURL parses an Any type URL string. The C++ parser does not handle
+// many legal URL strings. This implementation is more liberal and allows for
+// the pattern ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`).
+func (p *decoder) unmarshalURL() (Value, error) {
+ s := p.in
+ var size int
+ for len(s) > 0 && (s[0] == '-' || s[0] == '_' ||
+ ('0' <= s[0] && s[0] <= '9') ||
+ ('a' <= s[0] && s[0] <= 'z') ||
+ ('A' <= s[0] && s[0] <= 'Z')) {
+ s = s[1:]
+ size++
+ if len(s) > 0 && (s[0] == '/' || s[0] == '.') {
+ s = s[1:]
+ size++
+ }
+ }
+
+ // Last character cannot be '.' or '/'.
+ // Next byte should either be a delimiter or it is at the end.
+ if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
+ (len(s) > 0 && !isDelim(s[0])) {
+ return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+ }
+ v := rawValueOf(string(p.in[:size]), p.in[:size:size])
+ p.consume(size)
+ return v, nil
+}
+
+// unmarshalNumberKey parses field number as key. Field numbers are non-negative
+// integers.
+func (p *decoder) unmarshalNumberKey() (Value, error) {
+ num, ok := parseNumber(p.in)
+ if !ok || num.neg || num.typ == numFloat {
+ return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+ }
+ v, err := strconv.ParseUint(string(num.value), 0, 64)
+ if err != nil {
+ return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+ }
+ p.consume(num.size)
+ return rawValueOf(v, num.value), nil
}
func (p *decoder) unmarshalValue() (Value, error) {
@@ -163,27 +207,62 @@
case '{', '<':
return p.unmarshalMessage(true)
default:
- n := matchWithDelim(nameRegexp, p.in) // zero if no match
- if n > 0 && literals[string(p.in[:n])] == nil {
- return p.unmarshalName()
+ n, ok := consumeName(p.in)
+ if ok && literals[string(p.in[:n])] == nil {
+ v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+ p.consume(n)
+ return v, nil
}
return p.unmarshalNumber()
}
}
-// This expression matches all valid proto identifiers.
-var nameRegexp = regexp.MustCompile(`^[_a-zA-Z][_a-zA-Z0-9]*`)
-
-// unmarshalName unmarshals an unquoted identifier.
+// unmarshalName unmarshals an unquoted proto identifier.
+// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
//
// E.g., `field_name` => ValueOf(protoreflect.Name("field_name"))
func (p *decoder) unmarshalName() (Value, error) {
- if n := matchWithDelim(nameRegexp, p.in); n > 0 {
- v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
- p.consume(n)
- return v, nil
+ n, ok := consumeName(p.in)
+ if !ok {
+ return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
}
- return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
+
+ v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
+ p.consume(n)
+ return v, nil
+}
+
+func consumeName(input []byte) (int, bool) {
+ var n int
+
+ s := input
+ if len(s) == 0 {
+ return 0, false
+ }
+
+ switch {
+ case s[0] == '_',
+ 'a' <= s[0] && s[0] <= 'z',
+ 'A' <= s[0] && s[0] <= 'Z':
+ s = s[1:]
+ n++
+ default:
+ return 0, false
+ }
+
+ for len(s) > 0 && (s[0] == '_' ||
+ 'a' <= s[0] && s[0] <= 'z' ||
+ 'A' <= s[0] && s[0] <= 'Z' ||
+ '0' <= s[0] && s[0] <= '9') {
+ s = s[1:]
+ n++
+ }
+
+ if len(s) > 0 && !isDelim(s[0]) {
+ return 0, false
+ }
+
+ return n, true
}
func (p *decoder) consumeChar(c byte, msg string) error {
@@ -224,23 +303,12 @@
}
// Any sequence that looks like a non-delimiter (for error reporting).
-var errRegexp = regexp.MustCompile("^([-+._a-zA-Z0-9]{1,32}|.)")
+var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
-// matchWithDelim matches r with the input b and verifies that the match
-// terminates with a delimiter of some form (e.g., r"[^-+_.a-zA-Z0-9]").
-// As a special case, EOF is considered a delimiter.
-func matchWithDelim(r *regexp.Regexp, b []byte) int {
- n := len(r.Find(b))
- if n < len(b) {
- // Check that that the next character is a delimiter.
- c := b[n]
- notDelim := (c == '-' || c == '+' || c == '.' || c == '_' ||
- ('a' <= c && c <= 'z') ||
- ('A' <= c && c <= 'Z') ||
- ('0' <= c && c <= '9'))
- if notDelim {
- return 0
- }
- }
- return n
+// isDelim returns true if given byte is a delimiter character.
+func isDelim(c byte) bool {
+ return !(c == '-' || c == '+' || c == '.' || c == '_' ||
+ ('a' <= c && c <= 'z') ||
+ ('A' <= c && c <= 'Z') ||
+ ('0' <= c && c <= '9'))
}
diff --git a/internal/encoding/text/encode.go b/internal/encoding/text/encode.go
index 982037b..1f49761 100644
--- a/internal/encoding/text/encode.go
+++ b/internal/encoding/text/encode.go
@@ -6,6 +6,7 @@
import (
"bytes"
+ "regexp"
"strings"
"google.golang.org/protobuf/internal/detrand"
@@ -142,6 +143,12 @@
return nil
}
+// This expression is more liberal than ConsumeAnyTypeUrl in C++.
+// However, the C++ parser does not handle many legal URL strings.
+// The Go implementation is more liberal to be backwards compatible with
+// the historical Go implementation which was overly liberal (and buggy).
+var urlRegexp = regexp.MustCompile(`^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`)
+
func (p *encoder) marshalKey(v Value) error {
switch v.Type() {
case String:
diff --git a/internal/encoding/text/number.go b/internal/encoding/text/number.go
index a2e08f9..4e31ee1 100644
--- a/internal/encoding/text/number.go
+++ b/internal/encoding/text/number.go
@@ -8,9 +8,7 @@
"bytes"
"io"
"math"
- "regexp"
"strconv"
- "strings"
"google.golang.org/protobuf/internal/errors"
)
@@ -81,9 +79,6 @@
"inf": math.Inf(+1),
"-inf": math.Inf(-1),
}
- literalRegexp = regexp.MustCompile("^-?[a-zA-Z]+")
- intRegexp = regexp.MustCompile("^-?([1-9][0-9]*|0[xX][0-9a-fA-F]+|0[0-7]*)")
- floatRegexp = regexp.MustCompile("^-?((0|[1-9][0-9]*)?([.][0-9]*)?([eE][+-]?[0-9]+)?[fF]?)")
)
// unmarshalNumber decodes a Bool, Int, Uint, or Float64 from the input.
@@ -92,40 +87,251 @@
p.consume(n)
return v, err
}
+
func consumeNumber(in []byte) (Value, int, error) {
if len(in) == 0 {
return Value{}, 0, io.ErrUnexpectedEOF
}
- if n := matchWithDelim(literalRegexp, in); n > 0 {
- if v, ok := literals[string(in[:n])]; ok {
- return rawValueOf(v, in[:n:n]), n, nil
+ if v, n := matchLiteral(in); n > 0 {
+ return rawValueOf(v, in[:n]), n, nil
+ }
+
+ num, ok := parseNumber(in)
+ if !ok {
+ return Value{}, 0, newSyntaxError("invalid %q as number or bool", errRegexp.Find(in))
+ }
+
+ if num.typ == numFloat {
+ f, err := strconv.ParseFloat(string(num.value), 64)
+ if err != nil {
+ return Value{}, 0, err
+ }
+ return rawValueOf(f, in[:num.size]), num.size, nil
+ }
+
+ if num.neg {
+ v, err := strconv.ParseInt(string(num.value), 0, 64)
+ if err != nil {
+ return Value{}, 0, err
+ }
+ return rawValueOf(v, num.value), num.size, nil
+ }
+ v, err := strconv.ParseUint(string(num.value), 0, 64)
+ if err != nil {
+ return Value{}, 0, err
+ }
+ return rawValueOf(v, num.value), num.size, nil
+}
+
+func matchLiteral(in []byte) (interface{}, int) {
+ switch in[0] {
+ case 't', 'T':
+ rest := in[1:]
+ if len(rest) == 0 || isDelim(rest[0]) {
+ return true, 1
+ }
+ if n := matchStringWithDelim("rue", rest); n > 0 {
+ return true, 4
+ }
+ case 'f', 'F':
+ rest := in[1:]
+ if len(rest) == 0 || isDelim(rest[0]) {
+ return false, 1
+ }
+ if n := matchStringWithDelim("alse", rest); n > 0 {
+ return false, 5
+ }
+ case 'n':
+ if n := matchStringWithDelim("nan", in); n > 0 {
+ return math.NaN(), 3
+ }
+ case 'i':
+ if n := matchStringWithDelim("inf", in); n > 0 {
+ return math.Inf(1), 3
+ }
+ case '-':
+ if n := matchStringWithDelim("-inf", in); n > 0 {
+ return math.Inf(-1), 4
}
}
- if n := matchWithDelim(floatRegexp, in); n > 0 {
- if bytes.ContainsAny(in[:n], ".eEfF") {
- s := strings.TrimRight(string(in[:n]), "fF")
- // Always decode float as 64-bit.
- f, err := strconv.ParseFloat(s, 64)
- if err != nil {
- return Value{}, 0, err
- }
- return rawValueOf(f, in[:n:n]), n, nil
+ return nil, 0
+}
+
+func matchStringWithDelim(s string, b []byte) int {
+ if !bytes.HasPrefix(b, []byte(s)) {
+ return 0
+ }
+
+ n := len(s)
+ if n < len(b) && !isDelim(b[n]) {
+ return 0
+ }
+ return n
+}
+
+type numType uint8
+
+const (
+ numDec numType = (1 << iota) / 2
+ numHex
+ numOct
+ numFloat
+)
+
+// number is the result of parsing out a valid number from parseNumber. It
+// contains data for doing float or integer conversion via the strconv package.
+type number struct {
+ typ numType
+ neg bool
+ // Size of input taken up by the number. This may not be the same as
+ // len(number.value).
+ size int
+ // Bytes for doing strconv.Parse{Float,Int,Uint} conversion.
+ value []byte
+}
+
+// parseNumber constructs a number object from given input. It allows for the
+// following patterns:
+// integer: ^-?([1-9][0-9]*|0[xX][0-9a-fA-F]+|0[0-7]*)
+// float: ^-?((0|[1-9][0-9]*)?([.][0-9]*)?([eE][+-]?[0-9]+)?[fF]?)
+func parseNumber(input []byte) (number, bool) {
+ var size int
+ var neg bool
+ typ := numDec
+
+ s := input
+ if len(s) == 0 {
+ return number{}, false
+ }
+
+ // Optional -
+ if s[0] == '-' {
+ neg = true
+ s = s[1:]
+ size++
+ if len(s) == 0 {
+ return number{}, false
}
}
- if n := matchWithDelim(intRegexp, in); n > 0 {
- if in[0] == '-' {
- v, err := strconv.ParseInt(string(in[:n]), 0, 64)
- if err != nil {
- return Value{}, 0, err
+
+ // C++ allows for whitespace and comments in between the negative sign and
+ // the rest of the number. This logic currently does not but is consistent
+ // with v1.
+
+ switch {
+ case s[0] == '0':
+ if len(s) > 1 {
+ switch {
+ case s[1] == 'x' || s[1] == 'X':
+ // Parse as hex number.
+ typ = numHex
+ n := 2
+ s = s[2:]
+ for len(s) > 0 && (('0' <= s[0] && s[0] <= '9') ||
+ ('a' <= s[0] && s[0] <= 'f') ||
+ ('A' <= s[0] && s[0] <= 'F')) {
+ s = s[1:]
+ n++
+ }
+ if n == 2 {
+ return number{}, false
+ }
+ size += n
+
+ case '0' <= s[1] && s[1] <= '7':
+ // Parse as octal number.
+ typ = numOct
+ n := 2
+ s = s[2:]
+ for len(s) > 0 && '0' <= s[0] && s[0] <= '7' {
+ s = s[1:]
+ n++
+ }
+ size += n
}
- return rawValueOf(v, in[:n:n]), n, nil
- } else {
- v, err := strconv.ParseUint(string(in[:n]), 0, 64)
- if err != nil {
- return Value{}, 0, err
+
+ if typ&(numHex|numOct) > 0 {
+ if len(s) > 0 && !isDelim(s[0]) {
+ return number{}, false
+ }
+ return number{
+ typ: typ,
+ size: size,
+ neg: neg,
+ value: input[:size],
+ }, true
}
- return rawValueOf(v, in[:n:n]), n, nil
}
+ s = s[1:]
+ size++
+
+ case '1' <= s[0] && s[0] <= '9':
+ n := 1
+ s = s[1:]
+ for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+ s = s[1:]
+ n++
+ }
+ size += n
+
+ case s[0] == '.':
+ // Handled below.
+
+ default:
+ return number{}, false
}
- return Value{}, 0, newSyntaxError("invalid %q as number or bool", errRegexp.Find(in))
+
+ // . followed by 0 or more digits.
+ if len(s) > 0 && s[0] == '.' {
+ typ = numFloat
+ n := 1
+ s = s[1:]
+ for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+ s = s[1:]
+ n++
+ }
+ size += n
+ }
+
+ // e or E followed by an optional - or + and 1 or more digits.
+ if len(s) >= 2 && (s[0] == 'e' || s[0] == 'E') {
+ typ = numFloat
+ s = s[1:]
+ n := 1
+ if s[0] == '+' || s[0] == '-' {
+ s = s[1:]
+ n++
+ if len(s) == 0 {
+ return number{}, false
+ }
+ }
+ for len(s) > 0 && '0' <= s[0] && s[0] <= '9' {
+ s = s[1:]
+ n++
+ }
+ size += n
+ }
+
+ // At this point, input[:size] contains a valid number that can be converted
+ // via strconv.Parse{Float,Int,Uint}.
+ value := input[:size]
+
+ // Optional suffix f or F for floats.
+ if len(s) > 0 && (s[0] == 'f' || s[0] == 'F') {
+ typ = numFloat
+ s = s[1:]
+ size++
+ }
+
+ // Check that next byte is a delimiter or it is at the end.
+ if len(s) > 0 && !isDelim(s[0]) {
+ return number{}, false
+ }
+
+ return number{
+ typ: typ,
+ size: size,
+ neg: neg,
+ value: value,
+ }, true
}
diff --git a/internal/encoding/text/text_test.go b/internal/encoding/text/text_test.go
index ee43190..0e40fa8 100644
--- a/internal/encoding/text/text_test.go
+++ b/internal/encoding/text/text_test.go
@@ -120,6 +120,15 @@
in: "[$]",
wantErr: `invalid "$" as identifier`,
}, {
+ in: `[proto.package.]:0`,
+ wantErr: `invalid "proto.package." as identifier`,
+ }, {
+ in: `[/proto.package]:0`,
+ wantErr: `invalid "/proto.package" as identifier`,
+ }, {
+ in: `[proto.package/]:0`,
+ wantErr: `invalid "proto.package/" as identifier`,
+ }, {
// This parses fine, but should result in a error later since no
// type name in proto will ever be just a number.
in: "[20]:0",