internal/impl: add fast-path marshal implementation

This is a port of the v1 table marshaler, with some substantial
cleanup and refactoring.

Benchstat results from the protobuf reference benchmark data comparing the
v1 package with v2, with AllowPartial:true set for the new package. This
is not an apples-to-apples comparison, since v1 doesn't have a way to
disable required field checks.  Required field checks in v2 package
currently go through reflection, which performs terribly; my initial
experimentation indicates that fast-path required field checks will
not add a large amount of cost; these results are incomplete but not
wholly inaccurate.

name                                           old time/op  new time/op  delta
/dataset.google_message3_1.pb/Marshal-12        219ms ± 1%   232ms ± 1%   +5.85%  (p=0.004 n=6+5)
/dataset.google_message2.pb/Marshal-12          261µs ± 3%   248µs ± 1%   -5.14%  (p=0.002 n=6+6)
/dataset.google_message1_proto2.pb/Marshal-12   681ns ± 2%   637ns ± 3%   -6.53%  (p=0.002 n=6+6)
/dataset.google_message1_proto3.pb/Marshal-12  1.10µs ± 8%  0.99µs ± 3%   -9.63%  (p=0.002 n=6+6)
/dataset.google_message3_3.pb/Marshal-12       44.2ms ± 3%  35.2ms ± 1%  -20.28%  (p=0.004 n=6+5)
/dataset.google_message4.pb/Marshal-12         91.4ms ± 2%  94.9ms ± 2%   +3.78%  (p=0.002 n=6+6)
/dataset.google_message3_2.pb/Marshal-12       78.7ms ± 6%  80.8ms ± 4%     ~     (p=0.310 n=6+6)
/dataset.google_message3_4.pb/Marshal-12       10.6ms ± 3%  10.6ms ± 8%     ~     (p=0.662 n=5+6)
/dataset.google_message3_5.pb/Marshal-12        675ms ± 4%   510ms ± 2%  -24.40%  (p=0.002 n=6+6)
/dataset.google_message3_1.pb/Marshal           219ms ± 1%   236ms ± 7%   +8.06%  (p=0.004 n=5+6)
/dataset.google_message2.pb/Marshal             257µs ± 1%   250µs ± 3%     ~     (p=0.052 n=5+6)
/dataset.google_message1_proto2.pb/Marshal      685ns ± 1%   628ns ± 1%   -8.41%  (p=0.008 n=5+5)
/dataset.google_message1_proto3.pb/Marshal     1.08µs ± 1%  0.98µs ± 2%   -9.31%  (p=0.004 n=5+6)
/dataset.google_message3_3.pb/Marshal          43.7ms ± 1%  35.1ms ± 1%  -19.76%  (p=0.002 n=6+6)
/dataset.google_message4.pb/Marshal            93.4ms ± 4%  94.9ms ± 2%     ~     (p=0.180 n=6+6)
/dataset.google_message3_2.pb/Marshal           105ms ± 2%    98ms ± 7%   -6.81%  (p=0.009 n=5+6)
/dataset.google_message3_4.pb/Marshal          16.3ms ± 6%  15.7ms ± 3%   -3.44%  (p=0.041 n=6+6)
/dataset.google_message3_5.pb/Marshal           676ms ± 4%   504ms ± 2%  -25.50%  (p=0.004 n=6+5)

Change-Id: I72cc4597117f4cf5d236ef505777d49dd4a5f75d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/171020
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/encode.go b/internal/impl/encode.go
new file mode 100644
index 0000000..c0fce6c
--- /dev/null
+++ b/internal/impl/encode.go
@@ -0,0 +1,183 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package impl
+
+import (
+	"sort"
+	"sync/atomic"
+
+	"google.golang.org/protobuf/internal/errors"
+	proto "google.golang.org/protobuf/proto"
+	pref "google.golang.org/protobuf/reflect/protoreflect"
+	piface "google.golang.org/protobuf/runtime/protoiface"
+)
+
+type marshalOptions uint
+
+const (
+	marshalAllowPartial marshalOptions = 1 << iota
+	marshalDeterministic
+	marshalUseCachedSize
+)
+
+func newMarshalOptions(opts piface.MarshalOptions) marshalOptions {
+	var o marshalOptions
+	if opts.AllowPartial {
+		o |= marshalAllowPartial
+	}
+	if opts.Deterministic {
+		o |= marshalDeterministic
+	}
+	if opts.UseCachedSize {
+		o |= marshalUseCachedSize
+	}
+	return o
+}
+
+func (o marshalOptions) Options() proto.MarshalOptions {
+	return proto.MarshalOptions{
+		AllowPartial:  o.AllowPartial(),
+		Deterministic: o.Deterministic(),
+		UseCachedSize: o.UseCachedSize(),
+	}
+}
+
+func (o marshalOptions) AllowPartial() bool  { return o&marshalAllowPartial != 0 }
+func (o marshalOptions) Deterministic() bool { return o&marshalDeterministic != 0 }
+func (o marshalOptions) UseCachedSize() bool { return o&marshalUseCachedSize != 0 }
+
+// size is protoreflect.Methods.Size.
+func (mi *MessageType) size(msg pref.ProtoMessage) (size int) {
+	return mi.sizePointer(pointerOfIface(msg), 0)
+}
+
+func (mi *MessageType) sizePointer(p pointer, opts marshalOptions) (size int) {
+	mi.init()
+	if p.IsNil() {
+		return 0
+	}
+	if opts.UseCachedSize() && mi.sizecacheOffset.IsValid() {
+		return int(atomic.LoadInt32(p.Apply(mi.sizecacheOffset).Int32()))
+	}
+	return mi.sizePointerSlow(p, opts)
+}
+
+func (mi *MessageType) sizePointerSlow(p pointer, opts marshalOptions) (size int) {
+	if mi.extensionOffset.IsValid() {
+		e := p.Apply(mi.extensionOffset).Extensions()
+		size += mi.sizeExtensions(e, opts)
+	}
+	for _, f := range mi.fieldsOrdered {
+		fptr := p.Apply(f.offset)
+		if f.isPointer && fptr.Elem().IsNil() {
+			continue
+		}
+		if f.funcs.size == nil {
+			continue
+		}
+		size += f.funcs.size(fptr, f.tagsize, opts)
+	}
+	if mi.unknownOffset.IsValid() {
+		u := *p.Apply(mi.unknownOffset).Bytes()
+		size += len(u)
+	}
+	if mi.sizecacheOffset.IsValid() {
+		atomic.StoreInt32(p.Apply(mi.sizecacheOffset).Int32(), int32(size))
+	}
+	return size
+}
+
+// marshalAppend is protoreflect.Methods.MarshalAppend.
+func (mi *MessageType) marshalAppend(b []byte, msg pref.ProtoMessage, opts piface.MarshalOptions) ([]byte, error) {
+	return mi.marshalAppendPointer(b, pointerOfIface(msg), newMarshalOptions(opts))
+}
+
+func (mi *MessageType) marshalAppendPointer(b []byte, p pointer, opts marshalOptions) ([]byte, error) {
+	mi.init()
+	if p.IsNil() {
+		return b, nil
+	}
+	var err error
+	var nerr errors.NonFatal
+	// The old marshaler encodes extensions at beginning.
+	if mi.extensionOffset.IsValid() {
+		e := p.Apply(mi.extensionOffset).Extensions()
+		// TODO: Special handling for MessageSet?
+		b, err = mi.appendExtensions(b, e, opts)
+		if !nerr.Merge(err) {
+			return b, err
+		}
+	}
+	for _, f := range mi.fieldsOrdered {
+		fptr := p.Apply(f.offset)
+		if f.isPointer && fptr.Elem().IsNil() {
+			continue
+		}
+		if f.funcs.marshal == nil {
+			continue
+		}
+		b, err = f.funcs.marshal(b, fptr, f.wiretag, opts)
+		if !nerr.Merge(err) {
+			return b, err
+		}
+	}
+	if mi.unknownOffset.IsValid() {
+		u := *p.Apply(mi.unknownOffset).Bytes()
+		b = append(b, u...)
+	}
+	return b, nerr.E
+}
+
+func (mi *MessageType) sizeExtensions(ext *legacyExtensionMap, opts marshalOptions) (n int) {
+	if ext == nil {
+		return 0
+	}
+	for _, e := range *ext {
+		ei := mi.extensionFieldInfo(e.Desc)
+		if ei.funcs.size == nil {
+			continue
+		}
+		n += ei.funcs.size(e.value, ei.tagsize, opts)
+	}
+	return n
+}
+
+func (mi *MessageType) appendExtensions(b []byte, ext *legacyExtensionMap, opts marshalOptions) ([]byte, error) {
+	if ext == nil {
+		return b, nil
+	}
+
+	switch len(*ext) {
+	case 0:
+		return b, nil
+	case 1:
+		// Fast-path for one extension: Don't bother sorting the keys.
+		var err error
+		for _, e := range *ext {
+			ei := mi.extensionFieldInfo(e.Desc)
+			b, err = ei.funcs.marshal(b, e.value, ei.wiretag, opts)
+		}
+		return b, err
+	default:
+		// Sort the keys to provide a deterministic encoding.
+		// Not sure this is required, but the old code does it.
+		keys := make([]int, 0, len(*ext))
+		for k := range *ext {
+			keys = append(keys, int(k))
+		}
+		sort.Ints(keys)
+		var err error
+		var nerr errors.NonFatal
+		for _, k := range keys {
+			e := (*ext)[int32(k)]
+			ei := mi.extensionFieldInfo(e.Desc)
+			b, err = ei.funcs.marshal(b, e.value, ei.wiretag, opts)
+			if !nerr.Merge(err) {
+				return b, err
+			}
+		}
+		return b, nerr.E
+	}
+}