all: support enforce_utf8 override

In 2014, when proto3 was being developed, there were a number of early
adopters of the new syntax. Before proto3 was finalized and released as
open source in July 2016, a decision was made to strictly validate
strings in proto3. However, some of the early adopters were already
using invalid UTF-8 in string fields.
The google.protobuf.FieldOptions.enforce_utf8 option exists only to support
those grandfathered users, allowing them to opt out of the validation logic.
Practical use of that option in open source is impossible even if a user
specifies the proto1_legacy build tag since it requires a hacked
variant of descriptor.proto that is not externally available.

This CL supports enforce_utf8 by modifying internal/filedesc to
expose the flag if it detects it in the raw descriptor.
We add an strs.EnforceUTF8 function as a centralized place to determine
whether to perform validation. Validation opt-out is supported
only in builds with legacy support.

We implement support for validating UTF-8 in all proto3 string fields,
even if they are backed by a Go []byte.

Change-Id: I9c0628b84909bc7181125f09db730c80d490e485
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/186002
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/proto/decode_test.go b/proto/decode_test.go
index 5fa3a0f..ce2e1af 100644
--- a/proto/decode_test.go
+++ b/proto/decode_test.go
@@ -12,13 +12,20 @@
 	protoV1 "github.com/golang/protobuf/proto"
 	"google.golang.org/protobuf/encoding/prototext"
 	"google.golang.org/protobuf/internal/encoding/pack"
+	"google.golang.org/protobuf/internal/filedesc"
+	"google.golang.org/protobuf/internal/flags"
 	"google.golang.org/protobuf/proto"
+	"google.golang.org/protobuf/reflect/protodesc"
+	"google.golang.org/protobuf/reflect/protoreflect"
 	pref "google.golang.org/protobuf/reflect/protoreflect"
+	"google.golang.org/protobuf/reflect/prototype"
+	"google.golang.org/protobuf/runtime/protoimpl"
 
 	legacypb "google.golang.org/protobuf/internal/testprotos/legacy"
 	legacy1pb "google.golang.org/protobuf/internal/testprotos/legacy/proto2.v0.0.0-20160225-2fc053c5"
 	testpb "google.golang.org/protobuf/internal/testprotos/test"
 	test3pb "google.golang.org/protobuf/internal/testprotos/test3"
+	"google.golang.org/protobuf/types/descriptorpb"
 )
 
 type testProto struct {
@@ -85,6 +92,23 @@
 	}
 }
 
+func TestDecodeNoEnforceUTF8(t *testing.T) {
+	for _, test := range noEnforceUTF8TestProtos {
+		for _, want := range test.decodeTo {
+			t.Run(fmt.Sprintf("%s (%T)", test.desc, want), func(t *testing.T) {
+				got := reflect.New(reflect.TypeOf(want).Elem()).Interface().(proto.Message)
+				err := proto.Unmarshal(test.wire, got)
+				switch {
+				case flags.Proto1Legacy && err != nil:
+					t.Errorf("Unmarshal returned unexpected error: %v\nMessage:\n%v", err, marshalText(want))
+				case !flags.Proto1Legacy && err == nil:
+					t.Errorf("Unmarshal did not return expected error for invalid UTF8: %v\nMessage:\n%v", err, marshalText(want))
+				}
+			})
+		}
+	}
+}
+
 var testProtos = []testProto{
 	{
 		desc: "basic scalar types",
@@ -1442,6 +1466,129 @@
 	},
 }
 
+var noEnforceUTF8TestProtos = []testProto{
+	{
+		desc: "invalid UTF-8 in optional string field",
+		decodeTo: []proto.Message{&TestNoEnforceUTF8{
+			OptionalString: string("abc\xff"),
+		}},
+		wire: pack.Message{
+			pack.Tag{1, pack.BytesType}, pack.String("abc\xff"),
+		}.Marshal(),
+	},
+	{
+		desc: "invalid UTF-8 in optional string field of Go bytes",
+		decodeTo: []proto.Message{&TestNoEnforceUTF8{
+			OptionalBytes: []byte("abc\xff"),
+		}},
+		wire: pack.Message{
+			pack.Tag{2, pack.BytesType}, pack.String("abc\xff"),
+		}.Marshal(),
+	},
+	{
+		desc: "invalid UTF-8 in repeated string field",
+		decodeTo: []proto.Message{&TestNoEnforceUTF8{
+			RepeatedString: []string{string("foo"), string("abc\xff")},
+		}},
+		wire: pack.Message{
+			pack.Tag{3, pack.BytesType}, pack.String("foo"),
+			pack.Tag{3, pack.BytesType}, pack.String("abc\xff"),
+		}.Marshal(),
+	},
+	{
+		desc: "invalid UTF-8 in repeated string field of Go bytes",
+		decodeTo: []proto.Message{&TestNoEnforceUTF8{
+			RepeatedBytes: [][]byte{[]byte("foo"), []byte("abc\xff")},
+		}},
+		wire: pack.Message{
+			pack.Tag{4, pack.BytesType}, pack.String("foo"),
+			pack.Tag{4, pack.BytesType}, pack.String("abc\xff"),
+		}.Marshal(),
+	},
+	{
+		desc: "invalid UTF-8 in oneof string field",
+		decodeTo: []proto.Message{
+			&TestNoEnforceUTF8{OneofField: &TestNoEnforceUTF8_OneofString{string("abc\xff")}},
+		},
+		wire: pack.Message{pack.Tag{5, pack.BytesType}, pack.String("abc\xff")}.Marshal(),
+	},
+	{
+		desc: "invalid UTF-8 in oneof string field of Go bytes",
+		decodeTo: []proto.Message{
+			&TestNoEnforceUTF8{OneofField: &TestNoEnforceUTF8_OneofBytes{[]byte("abc\xff")}},
+		},
+		wire: pack.Message{pack.Tag{6, pack.BytesType}, pack.String("abc\xff")}.Marshal(),
+	},
+}
+
+type TestNoEnforceUTF8 struct {
+	OptionalString string       `protobuf:"bytes,1,opt,name=optional_string"`
+	OptionalBytes  []byte       `protobuf:"bytes,2,opt,name=optional_bytes"`
+	RepeatedString []string     `protobuf:"bytes,3,rep,name=repeated_string"`
+	RepeatedBytes  [][]byte     `protobuf:"bytes,4,rep,name=repeated_bytes"`
+	OneofField     isOneofField `protobuf_oneof:"oneof_field"`
+}
+
+type isOneofField interface{ isOneofField() }
+
+type TestNoEnforceUTF8_OneofString struct {
+	OneofString string `protobuf:"bytes,5,opt,name=oneof_string,oneof"`
+}
+type TestNoEnforceUTF8_OneofBytes struct {
+	OneofBytes []byte `protobuf:"bytes,6,opt,name=oneof_bytes,oneof"`
+}
+
+func (*TestNoEnforceUTF8_OneofString) isOneofField() {}
+func (*TestNoEnforceUTF8_OneofBytes) isOneofField()  {}
+
+func (m *TestNoEnforceUTF8) ProtoReflect() pref.Message {
+	return messageInfo_TestNoEnforceUTF8.MessageOf(m)
+}
+
+var messageInfo_TestNoEnforceUTF8 = protoimpl.MessageInfo{
+	GoType: reflect.TypeOf((*TestNoEnforceUTF8)(nil)),
+	PBType: &prototype.Message{
+		MessageDescriptor: func() protoreflect.MessageDescriptor {
+			pb := new(descriptorpb.FileDescriptorProto)
+			if err := prototext.Unmarshal([]byte(`
+				syntax:  "proto3"
+				name:    "test.proto"
+				message_type: [{
+					name: "TestNoEnforceUTF8"
+					field: [
+						{name:"optional_string" number:1 label:LABEL_OPTIONAL type:TYPE_STRING},
+						{name:"optional_bytes"  number:2 label:LABEL_OPTIONAL type:TYPE_STRING},
+						{name:"repeated_string" number:3 label:LABEL_REPEATED type:TYPE_STRING},
+						{name:"repeated_bytes"  number:4 label:LABEL_REPEATED type:TYPE_STRING},
+						{name:"oneof_string"    number:5 label:LABEL_OPTIONAL type:TYPE_STRING, oneof_index:0},
+						{name:"oneof_bytes"     number:6 label:LABEL_OPTIONAL type:TYPE_STRING, oneof_index:0}
+					]
+					oneof_decl: [{name:"oneof_field"}]
+				}]
+			`), pb); err != nil {
+				panic(err)
+			}
+			fd, err := protodesc.NewFile(pb, nil)
+			if err != nil {
+				panic(err)
+			}
+			md := fd.Messages().Get(0)
+			for i := 0; i < md.Fields().Len(); i++ {
+				md.Fields().Get(i).(*filedesc.Field).L1.HasEnforceUTF8 = true
+				md.Fields().Get(i).(*filedesc.Field).L1.EnforceUTF8 = false
+			}
+			return md
+		}(),
+		NewMessage: func() pref.Message {
+			return pref.ProtoMessage(new(TestNoEnforceUTF8)).ProtoReflect()
+		},
+	},
+	OneofWrappers: []interface{}{
+		(*TestNoEnforceUTF8_OneofString)(nil),
+		(*TestNoEnforceUTF8_OneofBytes)(nil),
+	},
+}
+
 func build(m proto.Message, opts ...buildOpt) proto.Message {
 	for _, opt := range opts {
 		opt(m)