internal/strs: unify string manipulation functionality
Create a new internal/strs package that unifies common functionality:
* Since protobuf itself pseudo-specifies at least 4 different camel-case
and snake-case conversion functions, we define all variants in one place.
* We move the internal/filedesc.nameBuilder function to this package.
We simplify its implementation to not depend on a strings.Builder fork
under the hood since the semantics we desire is simpler than what
strings.Builder provides.
* We use strs.Builder in reflect/protodesc in its construction of all
the full names. This is perfect use case of strs.Builder since all
full names within a file descriptor share the same lifetime.
* Add an UnsafeString and UnsafeBytes cast function that will be useful
in the near future for optimizing encoding/prototext and encoding/protojson.
Change-Id: I2cf07cbaf6f72e5f9fd6ae3d37b0d46f6af2ad59
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/185198
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/internal/strs/strings.go b/internal/strs/strings.go
new file mode 100644
index 0000000..295bd29
--- /dev/null
+++ b/internal/strs/strings.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package strs provides string manipulation functionality specific to protobuf.
+package strs
+
+import (
+ "strings"
+ "unicode"
+)
+
+// JSONCamelCase converts a snake_case identifier to a camelCase identifier,
+// according to the protobuf JSON specification.
+func JSONCamelCase(s string) string {
+ var b []byte
+ var wasUnderscore bool
+ for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
+ c := s[i]
+ if c != '_' {
+ isLower := 'a' <= c && c <= 'z'
+ if wasUnderscore && isLower {
+ c -= 'a' - 'A' // convert to uppercase
+ }
+ b = append(b, c)
+ }
+ wasUnderscore = c == '_'
+ }
+ return string(b)
+}
+
+// JSONSnakeCase converts a camelCase identifier to a snake_case identifier,
+// according to the protobuf JSON specification.
+func JSONSnakeCase(s string) string {
+ var b []byte
+ for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
+ c := s[i]
+ isUpper := 'A' <= c && c <= 'Z'
+ if isUpper {
+ b = append(b, '_')
+ c += 'a' - 'A' // convert to lowercase
+ }
+ b = append(b, c)
+ }
+ return string(b)
+}
+
+// MapEntryName derives the name of the map entry message given the field name.
+// See protoc v3.8.0: src/google/protobuf/descriptor.cc:254-276,6057
+func MapEntryName(s string) string {
+ var b []byte
+ upperNext := true
+ for _, c := range s {
+ switch {
+ case c == '_':
+ upperNext = true
+ case upperNext:
+ b = append(b, byte(unicode.ToUpper(c)))
+ upperNext = false
+ default:
+ b = append(b, byte(c))
+ }
+ }
+ b = append(b, "Entry"...)
+ return string(b)
+}
+
+// EnumValueName derives the camel-cased enum value name.
+// See protoc v3.8.0: src/google/protobuf/descriptor.cc:297-313
+func EnumValueName(s string) string {
+ var b []byte
+ upperNext := true
+ for _, c := range s {
+ switch {
+ case c == '_':
+ upperNext = true
+ case upperNext:
+ b = append(b, byte(unicode.ToUpper(c)))
+ upperNext = false
+ default:
+ b = append(b, byte(unicode.ToLower(c)))
+ upperNext = false
+ }
+ }
+ return string(b)
+}
+
+// TrimEnumPrefix trims the enum name prefix from an enum value name,
+// where the prefix is all lowercase without underscores.
+// See protoc v3.8.0: src/google/protobuf/descriptor.cc:330-375
+func TrimEnumPrefix(s, prefix string) string {
+ s0 := s // original input
+ for len(s) > 0 && len(prefix) > 0 {
+ if s[0] == '_' {
+ s = s[1:]
+ continue
+ }
+ if unicode.ToLower(rune(s[0])) != rune(prefix[0]) {
+ return s0 // no prefix match
+ }
+ s, prefix = s[1:], prefix[1:]
+ }
+ if len(prefix) > 0 {
+ return s0 // no prefix match
+ }
+ s = strings.TrimLeft(s, "_")
+ if len(s) == 0 {
+ return s0 // avoid returning empty string
+ }
+ return s
+}
diff --git a/internal/strs/strings_pure.go b/internal/strs/strings_pure.go
new file mode 100644
index 0000000..85e074c
--- /dev/null
+++ b/internal/strs/strings_pure.go
@@ -0,0 +1,27 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build purego appengine
+
+package strs
+
+import pref "google.golang.org/protobuf/reflect/protoreflect"
+
+func UnsafeString(b []byte) string {
+ return string(b)
+}
+
+func UnsafeBytes(s string) []byte {
+ return []byte(s)
+}
+
+type Builder struct{}
+
+func (*Builder) AppendFullName(prefix pref.FullName, name pref.Name) pref.FullName {
+ return prefix.Append(name)
+}
+
+func (*Builder) MakeString(b []byte) string {
+ return string(b)
+}
diff --git a/internal/strs/strings_test.go b/internal/strs/strings_test.go
new file mode 100644
index 0000000..2c4c2ad
--- /dev/null
+++ b/internal/strs/strings_test.go
@@ -0,0 +1,108 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strs
+
+import (
+ "strconv"
+ "testing"
+)
+
+func TestName(t *testing.T) {
+ tests := []struct {
+ in string
+ inEnumPrefix string
+ wantMapEntry string
+ wantEnumValue string
+ wantTrimValue string
+ wantJSONCamelCase string
+ wantJSONSnakeCase string
+ }{{
+ in: "abc",
+ inEnumPrefix: "",
+ wantMapEntry: "AbcEntry",
+ wantEnumValue: "Abc",
+ wantTrimValue: "abc",
+ wantJSONCamelCase: "abc",
+ wantJSONSnakeCase: "abc",
+ }, {
+ in: "foo_baR_",
+ inEnumPrefix: "foo_bar",
+ wantMapEntry: "FooBaREntry",
+ wantEnumValue: "FooBar",
+ wantTrimValue: "foo_baR_",
+ wantJSONCamelCase: "fooBaR",
+ wantJSONSnakeCase: "foo_ba_r_",
+ }, {
+ in: "snake_caseCamelCase",
+ inEnumPrefix: "snakecasecamel",
+ wantMapEntry: "SnakeCaseCamelCaseEntry",
+ wantEnumValue: "SnakeCasecamelcase",
+ wantTrimValue: "Case",
+ wantJSONCamelCase: "snakeCaseCamelCase",
+ wantJSONSnakeCase: "snake_case_camel_case",
+ }, {
+ in: "FiZz_BuZz",
+ inEnumPrefix: "fizz",
+ wantMapEntry: "FiZzBuZzEntry",
+ wantEnumValue: "FizzBuzz",
+ wantTrimValue: "BuZz",
+ wantJSONCamelCase: "FiZzBuZz",
+ wantJSONSnakeCase: "_fi_zz__bu_zz",
+ }}
+
+ for _, tt := range tests {
+ if got := MapEntryName(tt.in); got != tt.wantMapEntry {
+ t.Errorf("MapEntryName(%q) = %q, want %q", tt.in, got, tt.wantMapEntry)
+ }
+ if got := EnumValueName(tt.in); got != tt.wantEnumValue {
+ t.Errorf("EnumValueName(%q) = %q, want %q", tt.in, got, tt.wantEnumValue)
+ }
+ if got := TrimEnumPrefix(tt.in, tt.inEnumPrefix); got != tt.wantTrimValue {
+ t.Errorf("ErimEnumPrefix(%q, %q) = %q, want %q", tt.in, tt.inEnumPrefix, got, tt.wantTrimValue)
+ }
+ if got := JSONCamelCase(tt.in); got != tt.wantJSONCamelCase {
+ t.Errorf("JSONCamelCase(%q) = %q, want %q", tt.in, got, tt.wantJSONCamelCase)
+ }
+ if got := JSONSnakeCase(tt.in); got != tt.wantJSONSnakeCase {
+ t.Errorf("JSONSnakeCase(%q) = %q, want %q", tt.in, got, tt.wantJSONSnakeCase)
+ }
+ }
+}
+
+var (
+ srcString = "1234"
+ srcBytes = []byte(srcString)
+ dst uint64
+)
+
+func BenchmarkCast(b *testing.B) {
+ b.Run("Ideal", func(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ dst, _ = strconv.ParseUint(srcString, 0, 64)
+ }
+ if dst != 1234 {
+ b.Errorf("got %d, want %s", dst, srcString)
+ }
+ })
+ b.Run("Copy", func(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ dst, _ = strconv.ParseUint(string(srcBytes), 0, 64)
+ }
+ if dst != 1234 {
+ b.Errorf("got %d, want %s", dst, srcString)
+ }
+ })
+ b.Run("Cast", func(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ dst, _ = strconv.ParseUint(UnsafeString(srcBytes), 0, 64)
+ }
+ if dst != 1234 {
+ b.Errorf("got %d, want %s", dst, srcString)
+ }
+ })
+}
diff --git a/internal/strs/strings_unsafe.go b/internal/strs/strings_unsafe.go
new file mode 100644
index 0000000..2160c70
--- /dev/null
+++ b/internal/strs/strings_unsafe.go
@@ -0,0 +1,94 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !purego,!appengine
+
+package strs
+
+import (
+ "unsafe"
+
+ pref "google.golang.org/protobuf/reflect/protoreflect"
+)
+
+type (
+ stringHeader struct {
+ Data unsafe.Pointer
+ Len int
+ }
+ sliceHeader struct {
+ Data unsafe.Pointer
+ Len int
+ Cap int
+ }
+)
+
+// UnsafeString returns an unsafe string reference of b.
+// The caller must treat the input slice as immutable.
+//
+// WARNING: Use carefully. The returned result must not leak to the end user
+// unless the input slice is provably immutable.
+func UnsafeString(b []byte) (s string) {
+ src := (*sliceHeader)(unsafe.Pointer(&b))
+ dst := (*stringHeader)(unsafe.Pointer(&s))
+ dst.Data = src.Data
+ dst.Len = src.Len
+ return s
+}
+
+// UnsafeBytes returns an unsafe bytes slice reference of s.
+// The caller must treat returned slice as immutable.
+//
+// WARNING: Use carefully. The returned result must not leak to the end user.
+func UnsafeBytes(s string) (b []byte) {
+ src := (*stringHeader)(unsafe.Pointer(&s))
+ dst := (*sliceHeader)(unsafe.Pointer(&b))
+ dst.Data = src.Data
+ dst.Len = src.Len
+ dst.Cap = src.Len
+ return b
+}
+
+// Builder builds a set of strings with shared lifetime.
+// This differs from strings.Builder, which is for building a single string.
+type Builder struct {
+ buf []byte
+}
+
+// AppendFullName is equivalent to protoreflect.FullName.Append,
+// but optimized for large batches where each name has a shared lifetime.
+func (sb *Builder) AppendFullName(prefix pref.FullName, name pref.Name) pref.FullName {
+ n := len(prefix) + len(".") + len(name)
+ if len(prefix) == 0 {
+ n -= len(".")
+ }
+ sb.grow(n)
+ sb.buf = append(sb.buf, prefix...)
+ sb.buf = append(sb.buf, '.')
+ sb.buf = append(sb.buf, name...)
+ return pref.FullName(sb.last(n))
+}
+
+// MakeString is equivalent to string(b), but optimized for large batches
+// with a shared lifetime.
+func (sb *Builder) MakeString(b []byte) string {
+ sb.grow(len(b))
+ sb.buf = append(sb.buf, b...)
+ return sb.last(len(b))
+}
+
+func (sb *Builder) grow(n int) {
+ if cap(sb.buf)-len(sb.buf) >= n {
+ return
+ }
+
+ // Unlike strings.Builder, we do not need to copy over the contents
+ // of the old buffer since our builder provides no API for
+ // retrieving previously created strings.
+ sb.buf = make([]byte, 2*(cap(sb.buf)+n))
+}
+
+func (sb *Builder) last(n int) string {
+ return UnsafeString(sb.buf[len(sb.buf)-n:])
+}