internal/impl: add fast-path marshal implementation

This is a port of the v1 table marshaler, with some substantial
cleanup and refactoring.

The benchstat results below come from the protobuf reference benchmark
data and compare the v1 package with v2, with AllowPartial:true set for
the new package. This is not an apples-to-apples comparison, since v1
doesn't have a way to disable required field checks. Required field
checks in the v2 package currently go through reflection, which performs
terribly. My initial experimentation indicates that fast-path required
field checks will not add a large amount of cost, so these results are
incomplete but not wholly inaccurate.

name                                           old time/op  new time/op  delta
/dataset.google_message3_1.pb/Marshal-12        219ms ± 1%   232ms ± 1%   +5.85%  (p=0.004 n=6+5)
/dataset.google_message2.pb/Marshal-12          261µs ± 3%   248µs ± 1%   -5.14%  (p=0.002 n=6+6)
/dataset.google_message1_proto2.pb/Marshal-12   681ns ± 2%   637ns ± 3%   -6.53%  (p=0.002 n=6+6)
/dataset.google_message1_proto3.pb/Marshal-12  1.10µs ± 8%  0.99µs ± 3%   -9.63%  (p=0.002 n=6+6)
/dataset.google_message3_3.pb/Marshal-12       44.2ms ± 3%  35.2ms ± 1%  -20.28%  (p=0.004 n=6+5)
/dataset.google_message4.pb/Marshal-12         91.4ms ± 2%  94.9ms ± 2%   +3.78%  (p=0.002 n=6+6)
/dataset.google_message3_2.pb/Marshal-12       78.7ms ± 6%  80.8ms ± 4%     ~     (p=0.310 n=6+6)
/dataset.google_message3_4.pb/Marshal-12       10.6ms ± 3%  10.6ms ± 8%     ~     (p=0.662 n=5+6)
/dataset.google_message3_5.pb/Marshal-12        675ms ± 4%   510ms ± 2%  -24.40%  (p=0.002 n=6+6)
/dataset.google_message3_1.pb/Marshal           219ms ± 1%   236ms ± 7%   +8.06%  (p=0.004 n=5+6)
/dataset.google_message2.pb/Marshal             257µs ± 1%   250µs ± 3%     ~     (p=0.052 n=5+6)
/dataset.google_message1_proto2.pb/Marshal      685ns ± 1%   628ns ± 1%   -8.41%  (p=0.008 n=5+5)
/dataset.google_message1_proto3.pb/Marshal     1.08µs ± 1%  0.98µs ± 2%   -9.31%  (p=0.004 n=5+6)
/dataset.google_message3_3.pb/Marshal          43.7ms ± 1%  35.1ms ± 1%  -19.76%  (p=0.002 n=6+6)
/dataset.google_message4.pb/Marshal            93.4ms ± 4%  94.9ms ± 2%     ~     (p=0.180 n=6+6)
/dataset.google_message3_2.pb/Marshal           105ms ± 2%    98ms ± 7%   -6.81%  (p=0.009 n=5+6)
/dataset.google_message3_4.pb/Marshal          16.3ms ± 6%  15.7ms ± 3%   -3.44%  (p=0.041 n=6+6)
/dataset.google_message3_5.pb/Marshal           676ms ± 4%   504ms ± 2%  -25.50%  (p=0.004 n=6+5)

Change-Id: I72cc4597117f4cf5d236ef505777d49dd4a5f75d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/171020
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/message.go b/internal/impl/message.go
index 96efec9..1c6ab9b 100644
--- a/internal/impl/message.go
+++ b/internal/impl/message.go
@@ -7,9 +7,11 @@
 import (
 	"fmt"
 	"reflect"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 
 	pvalue "google.golang.org/protobuf/internal/value"
 	pref "google.golang.org/protobuf/reflect/protoreflect"
@@ -28,29 +30,102 @@
 	// Once set, this field must never be mutated.
 	PBType pref.MessageType
 
-	once sync.Once // protects all unexported fields
+	initMu   sync.Mutex // protects all unexported fields
+	initDone uint32
 
-	// TODO: Split fields into dense and sparse maps similar to the current
-	// table-driven implementation in v1?
-	fields map[pref.FieldNumber]*fieldInfo
+	// Keep a separate slice of fields for efficient field encoding in tag order
+	// and because iterating over a slice is substantially faster than a map.
+	fields        map[pref.FieldNumber]*fieldInfo
+	fieldsOrdered []*fieldInfo
+
 	oneofs map[pref.Name]*oneofInfo
 
 	unknownFields   func(*messageDataType) pref.UnknownFields
 	extensionFields func(*messageDataType) pref.KnownFields
+	methods         piface.Methods
+
+	extensionOffset       offset
+	sizecacheOffset       offset
+	unknownOffset         offset
+	extensionFieldInfosMu sync.RWMutex
+	extensionFieldInfos   map[*piface.ExtensionDescV1]*extensionFieldInfo
+}
+
+var prefMessageType = reflect.TypeOf((*pref.Message)(nil)).Elem()
+
+// getMessageType returns the MessageType (if any) for a type.
+//
+// We find the MessageType by calling the ProtoReflect method on the type's
+// zero value and looking at the returned type to see if it is a
+// messageReflectWrapper. Note that the MessageType may still be uninitialized
+// at this point.
+func getMessageType(mt reflect.Type) (mi *MessageType, ok bool) {
+	method, ok := mt.MethodByName("ProtoReflect")
+	if !ok {
+		return nil, false
+	}
+	if method.Type.NumIn() != 1 || method.Type.NumOut() != 1 || method.Type.Out(0) != prefMessageType {
+		return nil, false
+	}
+	ret := reflect.Zero(mt).Method(method.Index).Call(nil)
+	m, ok := ret[0].Elem().Interface().(*messageReflectWrapper)
+	if !ok {
+		return nil, ok
+	}
+	return m.mi, true
 }
 
 func (mi *MessageType) init() {
-	mi.once.Do(func() {
-		t := mi.GoType
-		if t.Kind() != reflect.Ptr && t.Elem().Kind() != reflect.Struct {
-			panic(fmt.Sprintf("got %v, want *struct kind", t))
-		}
+	// This function is called in the hot path. Inline the sync.Once
+	// logic, since allocating a closure for Once.Do is expensive.
+	// Keep init small to ensure that it can be inlined.
+	if atomic.LoadUint32(&mi.initDone) == 1 {
+		return
+	}
+	mi.initOnce()
+}
 
-		si := mi.makeStructInfo(t.Elem())
-		mi.makeKnownFieldsFunc(si)
-		mi.makeUnknownFieldsFunc(t.Elem())
-		mi.makeExtensionFieldsFunc(t.Elem())
-	})
+func (mi *MessageType) initOnce() {
+	mi.initMu.Lock()
+	defer mi.initMu.Unlock()
+	if mi.initDone == 1 {
+		return
+	}
+
+	t := mi.GoType
+	if t.Kind() != reflect.Ptr && t.Elem().Kind() != reflect.Struct {
+		panic(fmt.Sprintf("got %v, want *struct kind", t))
+	}
+
+	si := mi.makeStructInfo(t.Elem())
+	mi.makeKnownFieldsFunc(si)
+	mi.makeUnknownFieldsFunc(t.Elem())
+	mi.makeExtensionFieldsFunc(t.Elem())
+	mi.makeMethods(t.Elem())
+
+	atomic.StoreUint32(&mi.initDone, 1)
+}
+
+var sizecacheType = reflect.TypeOf(int32(0))
+
+func (mi *MessageType) makeMethods(t reflect.Type) {
+	mi.extensionOffset = invalidOffset
+	if fx, _ := t.FieldByName("XXX_InternalExtensions"); fx.Type == extType {
+		mi.extensionOffset = offsetOf(fx)
+	} else if fx, _ = t.FieldByName("XXX_extensions"); fx.Type == extType {
+		mi.extensionOffset = offsetOf(fx)
+	}
+	mi.sizecacheOffset = invalidOffset
+	if fx, _ := t.FieldByName("XXX_sizecache"); fx.Type == sizecacheType {
+		mi.sizecacheOffset = offsetOf(fx)
+	}
+	mi.unknownOffset = invalidOffset
+	if fx, _ := t.FieldByName("XXX_unrecognized"); fx.Type == bytesType {
+		mi.unknownOffset = offsetOf(fx)
+	}
+	mi.methods.Flags = piface.MethodFlagDeterministicMarshal
+	mi.methods.MarshalAppend = mi.marshalAppend
+	mi.methods.Size = mi.size
 }
 
 type structInfo struct {
@@ -113,6 +188,7 @@
 // any discrepancies.
 func (mi *MessageType) makeKnownFieldsFunc(si structInfo) {
 	mi.fields = map[pref.FieldNumber]*fieldInfo{}
+	mi.fieldsOrdered = make([]*fieldInfo, 0, mi.PBType.Fields().Len())
 	for i := 0; i < mi.PBType.Descriptor().Fields().Len(); i++ {
 		fd := mi.PBType.Descriptor().Fields().Get(i)
 		fs := si.fieldsByNumber[fd.Number()]
@@ -120,6 +196,16 @@
 		switch {
 		case fd.ContainingOneof() != nil:
 			fi = fieldInfoForOneof(fd, si.oneofsByName[fd.ContainingOneof().Name()], si.oneofWrappersByNumber[fd.Number()])
+			// There is one fieldInfo for each proto message field, but only one struct
+			// field for all message fields in a oneof. We install the encoder functions
+			// on the fieldInfo for the first field in the oneof.
+			//
+			// A slightly simpler approach would be to have each fieldInfo's encoder
+			// handle the case where that field is set, but this would require more
+			// checks against the current oneof type than a single map lookup.
+			if fd.ContainingOneof().Fields().Get(0).Name() == fd.Name() {
+				fi.funcs = makeOneofFieldCoder(si.oneofsByName[fd.ContainingOneof().Name()], fd.ContainingOneof(), si.fieldsByNumber, si.oneofWrappersByNumber)
+			}
 		case fd.IsMap():
 			fi = fieldInfoForMap(fd, fs)
 		case fd.IsList():
@@ -129,8 +215,13 @@
 		default:
 			fi = fieldInfoForScalar(fd, fs)
 		}
+		fi.num = fd.Number()
 		mi.fields[fd.Number()] = &fi
+		mi.fieldsOrdered = append(mi.fieldsOrdered, &fi)
 	}
+	sort.Slice(mi.fieldsOrdered, func(i, j int) bool {
+		return mi.fieldsOrdered[i].num < mi.fieldsOrdered[j].num
+	})
 
 	mi.oneofs = map[pref.Name]*oneofInfo{}
 	for i := 0; i < mi.PBType.Descriptor().Oneofs().Len(); i++ {
@@ -164,7 +255,8 @@
 }
 
 func (mi *MessageType) Methods() *piface.Methods {
-	return nil
+	mi.init()
+	return &mi.methods
 }
 
 func (mi *MessageType) dataTypeOf(p interface{}) *messageDataType {
@@ -230,7 +322,6 @@
 func (m *messageReflectWrapper) ProtoUnwrap() interface{} {
 	return m.p.AsIfaceOf(m.mi.GoType.Elem())
 }
-func (m *messageReflectWrapper) ProtoMutable() {}
 
 var _ pvalue.Unwrapper = (*messageReflectWrapper)(nil)
 
@@ -240,11 +331,23 @@
 	return (*messageReflectWrapper)(m)
 }
 func (m *messageIfaceWrapper) XXX_Methods() *piface.Methods {
-	return m.mi.Methods()
+	// TODO: Consider not recreating this on every call.
+	m.mi.init()
+	return &piface.Methods{
+		Flags:         piface.MethodFlagDeterministicMarshal,
+		MarshalAppend: m.marshalAppend,
+		Size:          m.size,
+	}
 }
 func (m *messageIfaceWrapper) ProtoUnwrap() interface{} {
 	return m.p.AsIfaceOf(m.mi.GoType.Elem())
 }
+func (m *messageIfaceWrapper) marshalAppend(b []byte, _ pref.ProtoMessage, opts piface.MarshalOptions) ([]byte, error) {
+	return m.mi.marshalAppendPointer(b, m.p, newMarshalOptions(opts))
+}
+func (m *messageIfaceWrapper) size(msg pref.ProtoMessage) (size int) {
+	return m.mi.sizePointer(m.p, 0)
+}
 
 type knownFields messageDataType