proto, internal/impl: store unknown MessageSet items in non-mset format

In the v1 implementation, unknown MessageSet items are stored in a
message's unknown fields section in non-MessageSet format. For example,
consider a MessageSet containing an item with type_id T and value V.
If the type_id is not resolvable, the item will be placed in the unknown
fields as a bytes-valued field with number T and contents V. This
conversion is then reversed when marshaling a MessageSet containing
unknown fields.

Preserve this behavior in v2.

One consequence of this change is that actual unknown fields in a
MessageSet (any field other than 1) are now discarded. This matches
the previous behavior.

Change-Id: I3d913613f84e0ae82481078dbc91cb25628651cc
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/205697
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/codec_message.go b/internal/impl/codec_message.go
index 4694718..ab1d4ec 100644
--- a/internal/impl/codec_message.go
+++ b/internal/impl/codec_message.go
@@ -29,6 +29,7 @@
 	unknownOffset      offset
 	extensionOffset    offset
 	needsInitCheck     bool
+	isMessageSet       bool
 
 	extensionFieldInfosMu sync.RWMutex
 	extensionFieldInfos   map[pref.ExtensionType]*extensionFieldInfo
@@ -97,16 +98,10 @@
 		if !mi.extensionOffset.IsValid() {
 			panic(fmt.Sprintf("%v: MessageSet with no extensions field", mi.Desc.FullName()))
 		}
-		cf := &coderFieldInfo{
-			num:       messageset.FieldItem,
-			offset:    si.extensionOffset,
-			isPointer: true,
-			funcs:     makeMessageSetFieldCoder(mi),
+		if !mi.unknownOffset.IsValid() {
+			panic(fmt.Sprintf("%v: MessageSet with no unknown field", mi.Desc.FullName()))
 		}
-		mi.orderedCoderFields = append(mi.orderedCoderFields, cf)
-		mi.coderFields[cf.num] = cf
-		// Invalidate the extension offset, since the field codec handles extensions.
-		mi.extensionOffset = invalidOffset
+		mi.isMessageSet = true
 	}
 	sort.Slice(mi.orderedCoderFields, func(i, j int) bool {
 		return mi.orderedCoderFields[i].num < mi.orderedCoderFields[j].num
diff --git a/internal/impl/codec_messageset.go b/internal/impl/codec_messageset.go
index 0b3746d..d78afeb 100644
--- a/internal/impl/codec_messageset.go
+++ b/internal/impl/codec_messageset.go
@@ -13,48 +13,36 @@
 	"google.golang.org/protobuf/internal/flags"
 )
 
-func makeMessageSetFieldCoder(mi *MessageInfo) pointerCoderFuncs {
-	return pointerCoderFuncs{
-		size: func(p pointer, tagsize int, opts marshalOptions) int {
-			return sizeMessageSet(mi, p, tagsize, opts)
-		},
-		marshal: func(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
-			return marshalMessageSet(mi, b, p, wiretag, opts)
-		},
-		unmarshal: func(b []byte, p pointer, wtyp wire.Type, opts unmarshalOptions) (int, error) {
-			return unmarshalMessageSet(mi, b, p, wtyp, opts)
-		},
-	}
-}
-
-func sizeMessageSet(mi *MessageInfo, p pointer, tagsize int, opts marshalOptions) (n int) {
-	ext := *p.Extensions()
-	if ext == nil {
+func sizeMessageSet(mi *MessageInfo, p pointer, opts marshalOptions) (size int) {
+	if !flags.ProtoLegacy {
 		return 0
 	}
+
+	ext := *p.Apply(mi.extensionOffset).Extensions()
 	for _, x := range ext {
 		xi := mi.extensionFieldInfo(x.Type())
 		if xi.funcs.size == nil {
 			continue
 		}
 		num, _ := wire.DecodeTag(xi.wiretag)
-		n += messageset.SizeField(num)
-		n += xi.funcs.size(x.Value(), wire.SizeTag(messageset.FieldMessage), opts)
+		size += messageset.SizeField(num)
+		size += xi.funcs.size(x.Value(), wire.SizeTag(messageset.FieldMessage), opts)
 	}
-	return n
+
+	unknown := *p.Apply(mi.unknownOffset).Bytes()
+	size += messageset.SizeUnknown(unknown)
+
+	return size
 }
 
-func marshalMessageSet(mi *MessageInfo, b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+func marshalMessageSet(mi *MessageInfo, b []byte, p pointer, opts marshalOptions) ([]byte, error) {
 	if !flags.ProtoLegacy {
 		return b, errors.New("no support for message_set_wire_format")
 	}
-	ext := *p.Extensions()
-	if ext == nil {
-		return b, nil
-	}
+
+	ext := *p.Apply(mi.extensionOffset).Extensions()
 	switch len(ext) {
 	case 0:
-		return b, nil
 	case 1:
 		// Fast-path for one extension: Don't bother sorting the keys.
 		for _, x := range ext {
@@ -64,7 +52,6 @@
 				return b, err
 			}
 		}
-		return b, nil
 	default:
 		// Sort the keys to provide a deterministic encoding.
 		// Not sure this is required, but the old code does it.
@@ -80,8 +67,15 @@
 				return b, err
 			}
 		}
-		return b, nil
 	}
+
+	unknown := *p.Apply(mi.unknownOffset).Bytes()
+	b, err := messageset.AppendUnknown(b, unknown)
+	if err != nil {
+		return b, err
+	}
+
+	return b, nil
 }
 
 func marshalMessageSetField(mi *MessageInfo, b []byte, x ExtensionField, opts marshalOptions) ([]byte, error) {
@@ -96,24 +90,25 @@
 	return b, nil
 }
 
-func unmarshalMessageSet(mi *MessageInfo, b []byte, p pointer, wtyp wire.Type, opts unmarshalOptions) (int, error) {
+func unmarshalMessageSet(mi *MessageInfo, b []byte, p pointer, opts unmarshalOptions) (int, error) {
 	if !flags.ProtoLegacy {
 		return 0, errors.New("no support for message_set_wire_format")
 	}
-	if wtyp != wire.StartGroupType {
-		return 0, errUnknown
-	}
-	ep := p.Extensions()
+
+	ep := p.Apply(mi.extensionOffset).Extensions()
 	if *ep == nil {
 		*ep = make(map[int32]ExtensionField)
 	}
 	ext := *ep
-	num, v, n, err := messageset.ConsumeFieldValue(b, true)
-	if err != nil {
-		return 0, err
-	}
-	if _, err := mi.unmarshalExtension(v, num, wire.BytesType, ext, opts); err != nil {
-		return 0, err
-	}
-	return n, nil
+	unknown := p.Apply(mi.unknownOffset).Bytes()
+	err := messageset.Unmarshal(b, true, func(num wire.Number, v []byte) error {
+		_, err := mi.unmarshalExtension(v, num, wire.BytesType, ext, opts)
+		if err == errUnknown {
+			*unknown = wire.AppendTag(*unknown, num, wire.BytesType)
+			*unknown = append(*unknown, v...)
+			return nil
+		}
+		return err
+	})
+	return len(b), err
 }
diff --git a/internal/impl/decode.go b/internal/impl/decode.go
index 757bbde..85cc6b0 100644
--- a/internal/impl/decode.go
+++ b/internal/impl/decode.go
@@ -7,6 +7,7 @@
 import (
 	"google.golang.org/protobuf/internal/encoding/wire"
 	"google.golang.org/protobuf/internal/errors"
+	"google.golang.org/protobuf/internal/flags"
 	"google.golang.org/protobuf/proto"
 	pref "google.golang.org/protobuf/reflect/protoreflect"
 	preg "google.golang.org/protobuf/reflect/protoregistry"
@@ -72,6 +73,9 @@
 
 func (mi *MessageInfo) unmarshalPointer(b []byte, p pointer, groupTag wire.Number, opts unmarshalOptions) (int, error) {
 	mi.init()
+	if flags.ProtoLegacy && mi.isMessageSet {
+		return unmarshalMessageSet(mi, b, p, opts)
+	}
 	var exts *map[int32]ExtensionField
 	start := len(b)
 	for len(b) > 0 {
diff --git a/internal/impl/encode.go b/internal/impl/encode.go
index d7dc892..cd57998 100644
--- a/internal/impl/encode.go
+++ b/internal/impl/encode.go
@@ -8,6 +8,7 @@
 	"sort"
 	"sync/atomic"
 
+	"google.golang.org/protobuf/internal/flags"
 	proto "google.golang.org/protobuf/proto"
 	pref "google.golang.org/protobuf/reflect/protoreflect"
 	piface "google.golang.org/protobuf/runtime/protoiface"
@@ -69,6 +70,13 @@
 }
 
 func (mi *MessageInfo) sizePointerSlow(p pointer, opts marshalOptions) (size int) {
+	if flags.ProtoLegacy && mi.isMessageSet {
+		size = sizeMessageSet(mi, p, opts)
+		if mi.sizecacheOffset.IsValid() {
+			atomic.StoreInt32(p.Apply(mi.sizecacheOffset).Int32(), int32(size))
+		}
+		return size
+	}
 	if mi.extensionOffset.IsValid() {
 		e := p.Apply(mi.extensionOffset).Extensions()
 		size += mi.sizeExtensions(e, opts)
@@ -109,6 +117,9 @@
 	if p.IsNil() {
 		return b, nil
 	}
+	if flags.ProtoLegacy && mi.isMessageSet {
+		return marshalMessageSet(mi, b, p, opts)
+	}
 	var err error
 	// The old marshaler encodes extensions at beginning.
 	if mi.extensionOffset.IsValid() {
@@ -132,7 +143,7 @@
 			return b, err
 		}
 	}
-	if mi.unknownOffset.IsValid() {
+	if mi.unknownOffset.IsValid() && !mi.isMessageSet {
 		u := *p.Apply(mi.unknownOffset).Bytes()
 		b = append(b, u...)
 	}