compiler/protogen, internal/strs, internal/impl: expose enum Go name derivation
In order to migrate v1 to wrap v2, we need a way to reproduce
the awful enum "names" that v1 used, each of which was the concatenation
of the proto package with the Go identifier used for the enum.
To support this:
* Move the camel case logic from compiler/protogen to internal/strs
* Add a small stub in internal/impl to expose this functionality
Change-Id: I8ff31daa9ae541e5788dc04d2e89eae1574877e4
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/191637
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/internal/strs/strings.go b/internal/strs/strings.go
index 2208ff2..0b74e76 100644
--- a/internal/strs/strings.go
+++ b/internal/strs/strings.go
@@ -6,8 +6,10 @@
package strs
import (
+ "go/token"
"strings"
"unicode"
+ "unicode/utf8"
"google.golang.org/protobuf/internal/flags"
"google.golang.org/protobuf/reflect/protoreflect"
@@ -23,6 +25,68 @@
return fd.Syntax() == protoreflect.Proto3
}
+// GoCamelCase camel-cases a protobuf name for use as a Go identifier.
+//
+// A '_' or '.' directly before an ASCII lower case letter is dropped and that letter is upper-cased;
+// any other '.' becomes '_', and a '_' that starts the name or follows a '.' always becomes 'X'.
+func GoCamelCase(s string) string {
+ // Invariant: if the next letter is lower case, it must be converted
+ // to upper case.
+ // That is, we process a word at a time, where words are marked by _ or
+ // upper case letter. Digits are treated as words.
+ var b []byte
+ for i := 0; i < len(s); i++ {
+ c := s[i]
+ switch {
+ case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]):
+ // Skip over '.' in ".{{lowercase}}"; the default case upper-cases the letter on the next iteration.
+ case c == '.':
+ b = append(b, '_') // convert '.' to '_'
+ case c == '_' && (i == 0 || s[i-1] == '.'):
+ // Convert initial '_' to ensure we start with a capital letter.
+ // Do the same for '_' after '.' to match historic behavior.
+ b = append(b, 'X') // convert '_' to 'X'
+ case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]):
+ // Skip over '_' in "_{{lowercase}}"; the default case upper-cases the letter on the next iteration.
+ case isASCIIDigit(c):
+ b = append(b, c) // digits are copied as-is and end the current word
+ default:
+ // Assume we have a letter now - if not, it's a bogus identifier; such bytes (non-ASCII, or a '_' not matched above) are copied unchanged.
+ // The next word is a sequence of characters that must start upper case.
+ if isASCIILower(c) {
+ c -= 'a' - 'A' // convert lowercase to uppercase
+ }
+ b = append(b, c)
+
+ // Accept lower case sequence that follows.
+ for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ {
+ b = append(b, s[i+1])
+ }
+ }
+ }
+ return string(b)
+}
+
+// GoSanitized converts a string to a valid Go identifier: disallowed runes become '_' and a '_' prefix is added when needed.
+func GoSanitized(s string) string {
+ // Sanitize the input to the set of valid characters,
+ // which must be '_', a Unicode letter (category L), or a Unicode decimal digit (category Nd).
+ s = strings.Map(func(r rune) rune {
+ if unicode.IsLetter(r) || unicode.IsDigit(r) {
+ return r
+ }
+ return '_'
+ }, s)
+
+ // Prepend '_' in the event of a Go keyword conflict or if
+ // the identifier is invalid (does not start in the Unicode L category); this includes the empty string.
+ r, _ := utf8.DecodeRuneInString(s) // only the first rune is inspected; its size is not needed
+ if token.Lookup(s).IsKeyword() || !unicode.IsLetter(r) {
+ return "_" + s
+ }
+ return s
+}
+
// JSONCamelCase converts a snake_case identifier to a camelCase identifier,
// according to the protobuf JSON specification.
func JSONCamelCase(s string) string {
@@ -31,8 +95,7 @@
for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
c := s[i]
if c != '_' {
- isLower := 'a' <= c && c <= 'z'
- if wasUnderscore && isLower {
+ if wasUnderscore && isASCIILower(c) {
c -= 'a' - 'A' // convert to uppercase
}
b = append(b, c)
@@ -48,8 +111,7 @@
var b []byte
for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
c := s[i]
- isUpper := 'A' <= c && c <= 'Z'
- if isUpper {
+ if isASCIIUpper(c) {
b = append(b, '_')
c += 'a' - 'A' // convert to lowercase
}
@@ -122,3 +184,13 @@
}
return s
}
+
+func isASCIILower(c byte) bool {
+ return 'a' <= c && c <= 'z' // reports whether c is an ASCII lower case letter
+}
+func isASCIIUpper(c byte) bool {
+ return 'A' <= c && c <= 'Z' // reports whether c is an ASCII upper case letter
+}
+func isASCIIDigit(c byte) bool {
+ return '0' <= c && c <= '9' // reports whether c is an ASCII decimal digit
+}
diff --git a/internal/strs/strings_test.go b/internal/strs/strings_test.go
index 2c4c2ad..0bb894a 100644
--- a/internal/strs/strings_test.go
+++ b/internal/strs/strings_test.go
@@ -9,6 +9,61 @@
"testing"
)
+func TestGoCamelCase(t *testing.T) { // table-driven check of GoCamelCase input/output pairs
+ tests := []struct {
+ in, want string
+ }{
+ {"", ""},
+ {"one", "One"},
+ {"one_two", "OneTwo"},
+ {"_my_field_name_2", "XMyFieldName_2"}, // leading '_' becomes 'X'; '_' before a digit is kept
+ {"Something_Capped", "Something_Capped"}, // '_' before an upper case letter is kept
+ {"my_Name", "My_Name"},
+ {"OneTwo", "OneTwo"},
+ {"_", "X"},
+ {"_a_", "XA_"}, // trailing '_' is kept
+ {"one.two", "OneTwo"}, // '.' before a lower case letter is dropped
+ {"one.Two", "One_Two"}, // any other '.' becomes '_'
+ {"one_two.three_four", "OneTwoThreeFour"},
+ {"one_two.Three_four", "OneTwo_ThreeFour"},
+ {"_one._two", "XOne_XTwo"}, // '_' directly after '.' also becomes 'X'
+ {"SCREAMING_SNAKE_CASE", "SCREAMING_SNAKE_CASE"},
+ {"double__underscore", "Double_Underscore"}, // only the '_' directly before a lower case letter is dropped
+ {"camelCase", "CamelCase"},
+ {"go2proto", "Go2Proto"}, // a digit ends the word, so the following letter is upper-cased
+ {"世界", "世界"}, // non-ASCII bytes pass through unchanged
+ {"x世界", "X世界"},
+ {"foo_bar世界", "FooBar世界"},
+ }
+ for _, tc := range tests {
+ if got := GoCamelCase(tc.in); got != tc.want {
+ t.Errorf("GoCamelCase(%q) = %q, want %q", tc.in, got, tc.want)
+ }
+ }
+}
+
+func TestGoSanitized(t *testing.T) { // table-driven check of GoSanitized input/output pairs
+ tests := []struct {
+ in, want string
+ }{
+ {"", "_"}, // empty input has no leading letter, so '_' is prepended
+ {"boo", "boo"},
+ {"Boo", "Boo"},
+ {"ßoo", "ßoo"}, // non-ASCII letters are valid identifier characters
+ {"default", "_default"}, // Go keyword gets a '_' prefix
+ {"hello", "hello"},
+ {"hello-world!!", "hello_world__"}, // each disallowed rune becomes one '_'
+ {"hello-\xde\xad\xbe\xef\x00", "hello_____"}, // invalid UTF-8 bytes and NUL also become '_'
+ {"hello 世界", "hello_世界"},
+ {"世界", "世界"},
+ }
+ for _, tc := range tests {
+ if got := GoSanitized(tc.in); got != tc.want {
+ t.Errorf("GoSanitized(%q) = %q, want %q", tc.in, got, tc.want)
+ }
+ }
+}
+
func TestName(t *testing.T) {
tests := []struct {
in string