internal/impl: add fast-path marshal implementation

This is a port of the v1 table marshaler, with some substantial
cleanup and refactoring.

Benchstat results from the protobuf reference benchmark data comparing the
v1 package with v2, with AllowPartial:true set for the new package. This
is not an apples-to-apples comparison, since v1 doesn't have a way to
disable required field checks.  Required field checks in v2 package
currently go through reflection, which performs terribly; my initial
experimentation indicates that fast-path required field checks will
not add a large amount of cost; these results are incomplete but not
wholly inaccurate.

name                                           old time/op  new time/op  delta
/dataset.google_message3_1.pb/Marshal-12        219ms ± 1%   232ms ± 1%   +5.85%  (p=0.004 n=6+5)
/dataset.google_message2.pb/Marshal-12          261µs ± 3%   248µs ± 1%   -5.14%  (p=0.002 n=6+6)
/dataset.google_message1_proto2.pb/Marshal-12   681ns ± 2%   637ns ± 3%   -6.53%  (p=0.002 n=6+6)
/dataset.google_message1_proto3.pb/Marshal-12  1.10µs ± 8%  0.99µs ± 3%   -9.63%  (p=0.002 n=6+6)
/dataset.google_message3_3.pb/Marshal-12       44.2ms ± 3%  35.2ms ± 1%  -20.28%  (p=0.004 n=6+5)
/dataset.google_message4.pb/Marshal-12         91.4ms ± 2%  94.9ms ± 2%   +3.78%  (p=0.002 n=6+6)
/dataset.google_message3_2.pb/Marshal-12       78.7ms ± 6%  80.8ms ± 4%     ~     (p=0.310 n=6+6)
/dataset.google_message3_4.pb/Marshal-12       10.6ms ± 3%  10.6ms ± 8%     ~     (p=0.662 n=5+6)
/dataset.google_message3_5.pb/Marshal-12        675ms ± 4%   510ms ± 2%  -24.40%  (p=0.002 n=6+6)
/dataset.google_message3_1.pb/Marshal           219ms ± 1%   236ms ± 7%   +8.06%  (p=0.004 n=5+6)
/dataset.google_message2.pb/Marshal             257µs ± 1%   250µs ± 3%     ~     (p=0.052 n=5+6)
/dataset.google_message1_proto2.pb/Marshal      685ns ± 1%   628ns ± 1%   -8.41%  (p=0.008 n=5+5)
/dataset.google_message1_proto3.pb/Marshal     1.08µs ± 1%  0.98µs ± 2%   -9.31%  (p=0.004 n=5+6)
/dataset.google_message3_3.pb/Marshal          43.7ms ± 1%  35.1ms ± 1%  -19.76%  (p=0.002 n=6+6)
/dataset.google_message4.pb/Marshal            93.4ms ± 4%  94.9ms ± 2%     ~     (p=0.180 n=6+6)
/dataset.google_message3_2.pb/Marshal           105ms ± 2%    98ms ± 7%   -6.81%  (p=0.009 n=5+6)
/dataset.google_message3_4.pb/Marshal          16.3ms ± 6%  15.7ms ± 3%   -3.44%  (p=0.041 n=6+6)
/dataset.google_message3_5.pb/Marshal           676ms ± 4%   504ms ± 2%  -25.50%  (p=0.004 n=6+5)

Change-Id: I72cc4597117f4cf5d236ef505777d49dd4a5f75d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/171020
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/encode_reflect.go b/internal/impl/encode_reflect.go
new file mode 100644
index 0000000..6e172d9
--- /dev/null
+++ b/internal/impl/encode_reflect.go
@@ -0,0 +1,94 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build purego appengine
+
+package impl
+
+import (
+	"google.golang.org/protobuf/internal/encoding/wire"
+)
+
+func sizeEnum(p pointer, tagsize int, _ marshalOptions) (size int) {
+	v := p.v.Elem().Int()
+	return tagsize + wire.SizeVarint(uint64(v))
+}
+
+func appendEnum(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+	v := p.v.Elem().Int()
+	b = wire.AppendVarint(b, wiretag)
+	b = wire.AppendVarint(b, uint64(v))
+	return b, nil
+}
+
+var coderEnum = pointerCoderFuncs{sizeEnum, appendEnum}
+
+func sizeEnumNoZero(p pointer, tagsize int, opts marshalOptions) (size int) {
+	if p.v.Elem().Int() == 0 {
+		return 0
+	}
+	return sizeEnum(p, tagsize, opts)
+}
+
+func appendEnumNoZero(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+	if p.v.Elem().Int() == 0 {
+		return b, nil
+	}
+	return appendEnum(b, p, wiretag, opts)
+}
+
+var coderEnumNoZero = pointerCoderFuncs{sizeEnumNoZero, appendEnumNoZero}
+
+func sizeEnumPtr(p pointer, tagsize int, opts marshalOptions) (size int) {
+	return sizeEnum(pointer{p.v.Elem()}, tagsize, opts)
+}
+
+func appendEnumPtr(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+	return appendEnum(b, pointer{p.v.Elem()}, wiretag, opts)
+}
+
+var coderEnumPtr = pointerCoderFuncs{sizeEnumPtr, appendEnumPtr}
+
+func sizeEnumSlice(p pointer, tagsize int, opts marshalOptions) (size int) {
+	return sizeEnumSliceReflect(p.v.Elem(), tagsize, opts)
+}
+
+func appendEnumSlice(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+	return appendEnumSliceReflect(b, p.v.Elem(), wiretag, opts)
+}
+
+var coderEnumSlice = pointerCoderFuncs{sizeEnumSlice, appendEnumSlice}
+
+func sizeEnumPackedSlice(p pointer, tagsize int, _ marshalOptions) (size int) {
+	s := p.v.Elem()
+	slen := s.Len()
+	if slen == 0 {
+		return 0
+	}
+	n := 0
+	for i := 0; i < slen; i++ {
+		n += wire.SizeVarint(uint64(s.Index(i).Int()))
+	}
+	return tagsize + wire.SizeBytes(n)
+}
+
+func appendEnumPackedSlice(b []byte, p pointer, wiretag uint64, opts marshalOptions) ([]byte, error) {
+	s := p.v.Elem()
+	slen := s.Len()
+	if slen == 0 {
+		return b, nil
+	}
+	b = wire.AppendVarint(b, wiretag)
+	n := 0
+	for i := 0; i < slen; i++ {
+		n += wire.SizeVarint(uint64(s.Index(i).Int()))
+	}
+	b = wire.AppendVarint(b, uint64(n))
+	for i := 0; i < slen; i++ {
+		b = wire.AppendVarint(b, uint64(s.Index(i).Int()))
+	}
+	return b, nil
+}
+
+var coderEnumPackedSlice = pointerCoderFuncs{sizeEnumPackedSlice, appendEnumPackedSlice}