compiler/protogen, internal/strs, internal/impl: expose enum Go name derivation

In order to migrate v1 to wrap v2, we need a way to reproduce
the awful enum "names" that v1 used, each of which was the concatenation
of the proto package name with the Go identifier used for the enum.

To support this:
* Move the camel case logic from compiler/protogen to internal/strs
* Add a small stub in internal/impl to expose this functionality

Change-Id: I8ff31daa9ae541e5788dc04d2e89eae1574877e4
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/191637
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/internal/strs/strings.go b/internal/strs/strings.go
index 2208ff2..0b74e76 100644
--- a/internal/strs/strings.go
+++ b/internal/strs/strings.go
@@ -6,8 +6,10 @@
 package strs
 
 import (
+	"go/token"
 	"strings"
 	"unicode"
+	"unicode/utf8"
 
 	"google.golang.org/protobuf/internal/flags"
 	"google.golang.org/protobuf/reflect/protoreflect"
@@ -23,6 +25,68 @@
 	return fd.Syntax() == protoreflect.Proto3
 }
 
+// GoCamelCase camel-cases a protobuf name for use as a Go identifier.
+//
+// A '.' or interior '_' directly before an ASCII lower case letter is dropped and the
+// letter upper-cased. Other '.' become '_'; '_' at the start or right after '.' becomes 'X'.
+func GoCamelCase(s string) string {
+	// Invariant: if the next letter is lower case, it must be converted
+	// to upper case.
+	// That is, we process a word at a time, where words are marked by _ or
+	// upper case letter. Digits are treated as words.
+	var b []byte
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		switch {
+		case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]):
+			// Skip over '.' in ".{{lowercase}}"; the default case upper-cases the letter next.
+		case c == '.':
+			b = append(b, '_') // convert '.' to '_'
+		case c == '_' && (i == 0 || s[i-1] == '.'):
+			// Convert initial '_' to ensure we start with a capital letter.
+			// Do the same for '_' after '.' to match historic behavior.
+			b = append(b, 'X') // convert '_' to 'X'
+		case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]):
+			// Skip over '_' in "_{{lowercase}}"; the default case upper-cases the letter next.
+		case isASCIIDigit(c):
+			b = append(b, c) // digits are copied unchanged
+		default:
+			// Assume we have a letter now - if not, it's a bogus identifier.
+			// The word must start upper case; bytes that are not ASCII lower case are copied as-is.
+			if isASCIILower(c) {
+				c -= 'a' - 'A' // convert lowercase to uppercase
+			}
+			b = append(b, c)
+
+			// Consume the run of ASCII lower case letters that follows, unchanged.
+			for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ {
+				b = append(b, s[i+1])
+			}
+		}
+	}
+	return string(b)
+}
+
+// GoSanitized converts a string to a valid Go identifier; the empty string yields "_".
+func GoSanitized(s string) string {
+	// Replace every rune that is neither a letter (Unicode category L)
+	// nor a decimal digit (Unicode category Nd) with '_'.
+	s = strings.Map(func(r rune) rune {
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			return r
+		}
+		return '_'
+	}, s)
+
+	// Prepend '_' in the event of a Go keyword conflict or if the first rune is not
+	// a letter (this also covers a leading digit, a leading '_', and the empty string).
+	r, _ := utf8.DecodeRuneInString(s)
+	if token.Lookup(s).IsKeyword() || !unicode.IsLetter(r) {
+		return "_" + s
+	}
+	return s
+}
+
 // JSONCamelCase converts a snake_case identifier to a camelCase identifier,
 // according to the protobuf JSON specification.
 func JSONCamelCase(s string) string {
@@ -31,8 +95,7 @@
 	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
 		c := s[i]
 		if c != '_' {
-			isLower := 'a' <= c && c <= 'z'
-			if wasUnderscore && isLower {
+			if wasUnderscore && isASCIILower(c) {
 				c -= 'a' - 'A' // convert to uppercase
 			}
 			b = append(b, c)
@@ -48,8 +111,7 @@
 	var b []byte
 	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
 		c := s[i]
-		isUpper := 'A' <= c && c <= 'Z'
-		if isUpper {
+		if isASCIIUpper(c) {
 			b = append(b, '_')
 			c += 'a' - 'A' // convert to lowercase
 		}
@@ -122,3 +184,13 @@
 	}
 	return s
 }
+
+func isASCIILower(c byte) bool {
+	return 'a' <= c && c <= 'z' // false for any other byte, including bytes >= 0x80
+}
+func isASCIIUpper(c byte) bool {
+	return 'A' <= c && c <= 'Z' // false for any other byte, including bytes >= 0x80
+}
+func isASCIIDigit(c byte) bool {
+	return '0' <= c && c <= '9' // ASCII decimal digits only
+}
diff --git a/internal/strs/strings_test.go b/internal/strs/strings_test.go
index 2c4c2ad..0bb894a 100644
--- a/internal/strs/strings_test.go
+++ b/internal/strs/strings_test.go
@@ -9,6 +9,61 @@
 	"testing"
 )
 
+func TestGoCamelCase(t *testing.T) {
+	tests := []struct {
+		in, want string
+	}{
+		{"", ""},
+		{"one", "One"},
+		{"one_two", "OneTwo"},
+		{"_my_field_name_2", "XMyFieldName_2"}, // leading '_' becomes 'X'; '_' before a digit is kept
+		{"Something_Capped", "Something_Capped"},
+		{"my_Name", "My_Name"},
+		{"OneTwo", "OneTwo"},
+		{"_", "X"},
+		{"_a_", "XA_"},
+		{"one.two", "OneTwo"},
+		{"one.Two", "One_Two"}, // '.' not followed by a lower case letter becomes '_'
+		{"one_two.three_four", "OneTwoThreeFour"},
+		{"one_two.Three_four", "OneTwo_ThreeFour"},
+		{"_one._two", "XOne_XTwo"}, // '_' right after '.' becomes 'X' as well
+		{"SCREAMING_SNAKE_CASE", "SCREAMING_SNAKE_CASE"},
+		{"double__underscore", "Double_Underscore"}, // only the '_' directly before 'u' is dropped
+		{"camelCase", "CamelCase"},
+		{"go2proto", "Go2Proto"},
+		{"世界", "世界"}, // non-ASCII bytes pass through unchanged
+		{"x世界", "X世界"},
+		{"foo_bar世界", "FooBar世界"},
+	}
+	for _, tc := range tests {
+		if got := GoCamelCase(tc.in); got != tc.want {
+			t.Errorf("GoCamelCase(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
+func TestGoSanitized(t *testing.T) {
+	tests := []struct {
+		in, want string
+	}{
+		{"", "_"}, // empty input still yields an identifier
+		{"boo", "boo"},
+		{"Boo", "Boo"},
+		{"ßoo", "ßoo"},
+		{"default", "_default"}, // Go keywords get a '_' prefix
+		{"hello", "hello"},
+		{"hello-world!!", "hello_world__"},
+		{"hello-\xde\xad\xbe\xef\x00", "hello_____"}, // invalid UTF-8 bytes are replaced too
+		{"hello 世界", "hello_世界"},
+		{"世界", "世界"},
+	}
+	for _, tc := range tests {
+		if got := GoSanitized(tc.in); got != tc.want {
+			t.Errorf("GoSanitized(%q) = %q, want %q", tc.in, got, tc.want)
+		}
+	}
+}
+
 func TestName(t *testing.T) {
 	tests := []struct {
 		in                string