internal/impl: add fast-path marshal implementation

This is a port of the v1 table marshaler, with substantial
cleanup and refactoring.

Benchstat results on the protobuf reference benchmark data, comparing the
v1 package with v2, with AllowPartial:true set for the new package. This
is not an apples-to-apples comparison, since v1 has no way to disable
required field checks. Required field checks in the v2 package currently
go through reflection, which performs poorly; my initial experimentation
indicates that fast-path required field checks will not add much cost.
These results are therefore incomplete, but not wholly inaccurate.
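
For reference, a minimal sketch of how AllowPartial:true is set when
exercising the new package (assuming the proto.MarshalOptions API of the
v2 module; the package name and helper are illustrative only):

package bench

import "google.golang.org/protobuf/proto"

// marshalPartial marshals m with required field checks disabled,
// matching the AllowPartial:true configuration used for the v2
// numbers below.
func marshalPartial(m proto.Message) ([]byte, error) {
	return proto.MarshalOptions{AllowPartial: true}.Marshal(m)
}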

name                                           old time/op  new time/op  delta
/dataset.google_message3_1.pb/Marshal-12        219ms ± 1%   232ms ± 1%   +5.85%  (p=0.004 n=6+5)
/dataset.google_message2.pb/Marshal-12          261µs ± 3%   248µs ± 1%   -5.14%  (p=0.002 n=6+6)
/dataset.google_message1_proto2.pb/Marshal-12   681ns ± 2%   637ns ± 3%   -6.53%  (p=0.002 n=6+6)
/dataset.google_message1_proto3.pb/Marshal-12  1.10µs ± 8%  0.99µs ± 3%   -9.63%  (p=0.002 n=6+6)
/dataset.google_message3_3.pb/Marshal-12       44.2ms ± 3%  35.2ms ± 1%  -20.28%  (p=0.004 n=6+5)
/dataset.google_message4.pb/Marshal-12         91.4ms ± 2%  94.9ms ± 2%   +3.78%  (p=0.002 n=6+6)
/dataset.google_message3_2.pb/Marshal-12       78.7ms ± 6%  80.8ms ± 4%     ~     (p=0.310 n=6+6)
/dataset.google_message3_4.pb/Marshal-12       10.6ms ± 3%  10.6ms ± 8%     ~     (p=0.662 n=5+6)
/dataset.google_message3_5.pb/Marshal-12        675ms ± 4%   510ms ± 2%  -24.40%  (p=0.002 n=6+6)
/dataset.google_message3_1.pb/Marshal           219ms ± 1%   236ms ± 7%   +8.06%  (p=0.004 n=5+6)
/dataset.google_message2.pb/Marshal             257µs ± 1%   250µs ± 3%     ~     (p=0.052 n=5+6)
/dataset.google_message1_proto2.pb/Marshal      685ns ± 1%   628ns ± 1%   -8.41%  (p=0.008 n=5+5)
/dataset.google_message1_proto3.pb/Marshal     1.08µs ± 1%  0.98µs ± 2%   -9.31%  (p=0.004 n=5+6)
/dataset.google_message3_3.pb/Marshal          43.7ms ± 1%  35.1ms ± 1%  -19.76%  (p=0.002 n=6+6)
/dataset.google_message4.pb/Marshal            93.4ms ± 4%  94.9ms ± 2%     ~     (p=0.180 n=6+6)
/dataset.google_message3_2.pb/Marshal           105ms ± 2%    98ms ± 7%   -6.81%  (p=0.009 n=5+6)
/dataset.google_message3_4.pb/Marshal          16.3ms ± 6%  15.7ms ± 3%   -3.44%  (p=0.041 n=6+6)
/dataset.google_message3_5.pb/Marshal           676ms ± 4%   504ms ± 2%  -25.50%  (p=0.004 n=6+5)

Change-Id: I72cc4597117f4cf5d236ef505777d49dd4a5f75d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/171020
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/pointer_reflect.go b/internal/impl/pointer_reflect.go
index 72b782c..b851b1b 100644
--- a/internal/impl/pointer_reflect.go
+++ b/internal/impl/pointer_reflect.go
@@ -23,6 +23,15 @@
 	return f.Index
 }
 
+// IsValid reports whether the offset is valid.
+func (f offset) IsValid() bool { return f != nil }
+
+// invalidOffset is an invalid field offset.
+var invalidOffset = offset(nil)
+
+// zeroOffset is a noop when calling pointer.Apply.
+var zeroOffset = offset([]int{0})
+
 // pointer is an abstract representation of a pointer to a struct or field.
 type pointer struct{ v reflect.Value }
 
@@ -62,3 +71,52 @@
 func (p pointer) AsIfaceOf(t reflect.Type) interface{} {
 	return p.AsValueOf(t).Interface()
 }
+
+func (p pointer) Bool() *bool              { return p.v.Interface().(*bool) }
+func (p pointer) BoolPtr() **bool          { return p.v.Interface().(**bool) }
+func (p pointer) BoolSlice() *[]bool       { return p.v.Interface().(*[]bool) }
+func (p pointer) Int32() *int32            { return p.v.Interface().(*int32) }
+func (p pointer) Int32Ptr() **int32        { return p.v.Interface().(**int32) }
+func (p pointer) Int32Slice() *[]int32     { return p.v.Interface().(*[]int32) }
+func (p pointer) Int64() *int64            { return p.v.Interface().(*int64) }
+func (p pointer) Int64Ptr() **int64        { return p.v.Interface().(**int64) }
+func (p pointer) Int64Slice() *[]int64     { return p.v.Interface().(*[]int64) }
+func (p pointer) Uint32() *uint32          { return p.v.Interface().(*uint32) }
+func (p pointer) Uint32Ptr() **uint32      { return p.v.Interface().(**uint32) }
+func (p pointer) Uint32Slice() *[]uint32   { return p.v.Interface().(*[]uint32) }
+func (p pointer) Uint64() *uint64          { return p.v.Interface().(*uint64) }
+func (p pointer) Uint64Ptr() **uint64      { return p.v.Interface().(**uint64) }
+func (p pointer) Uint64Slice() *[]uint64   { return p.v.Interface().(*[]uint64) }
+func (p pointer) Float32() *float32        { return p.v.Interface().(*float32) }
+func (p pointer) Float32Ptr() **float32    { return p.v.Interface().(**float32) }
+func (p pointer) Float32Slice() *[]float32 { return p.v.Interface().(*[]float32) }
+func (p pointer) Float64() *float64        { return p.v.Interface().(*float64) }
+func (p pointer) Float64Ptr() **float64    { return p.v.Interface().(**float64) }
+func (p pointer) Float64Slice() *[]float64 { return p.v.Interface().(*[]float64) }
+func (p pointer) String() *string          { return p.v.Interface().(*string) }
+func (p pointer) StringPtr() **string      { return p.v.Interface().(**string) }
+func (p pointer) StringSlice() *[]string   { return p.v.Interface().(*[]string) }
+func (p pointer) Bytes() *[]byte           { return p.v.Interface().(*[]byte) }
+func (p pointer) BytesSlice() *[][]byte    { return p.v.Interface().(*[][]byte) }
+func (p pointer) Extensions() *legacyExtensionMap {
+	return (*legacyExtensionMap)(p.v.Interface().(*map[int32]ExtensionFieldV1))
+}
+
+func (p pointer) Elem() pointer {
+	return pointer{v: p.v.Elem()}
+}
+
+// PointerSlice copies []*T from p as a new []pointer.
+// This behavior differs from the implementation in pointer_unsafe.go.
+func (p pointer) PointerSlice() []pointer {
+	// TODO: reconsider this
+	if p.v.IsNil() {
+		return nil
+	}
+	n := p.v.Elem().Len()
+	s := make([]pointer, n)
+	for i := 0; i < n; i++ {
+		s[i] = pointer{v: p.v.Elem().Index(i)}
+	}
+	return s
+}
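
For illustration, a standalone sketch (not part of this change) of the
reflect-based accessor technique used above; the msg type and field name
are hypothetical:

package main

import (
	"fmt"
	"reflect"
)

// pointer mirrors the abstraction above: v holds a reflect.Value
// of some *T.
type pointer struct{ v reflect.Value }

// Bool interprets the pointer as *bool, as in pointer_reflect.go.
func (p pointer) Bool() *bool { return p.v.Interface().(*bool) }

func main() {
	type msg struct{ B bool }
	m := &msg{}
	// Address the field via reflection; the real code precomputes
	// the field index as an offset rather than looking it up by name.
	p := pointer{v: reflect.ValueOf(m).Elem().FieldByName("B").Addr()}
	*p.Bool() = true
	fmt.Println(m.B) // prints: true
}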