internal/impl: add fast-path marshal implementation

This is a port of the v1 table marshaler, with some substantial
cleanup and refactoring.

Benchstat results from the protobuf reference benchmark data comparing
the v1 package with v2, with AllowPartial:true set for the new package.
This is not an apples-to-apples comparison, since v1 has no way to
disable required field checks. Required field checks in the v2 package
currently go through reflection, which performs terribly; my initial
experimentation indicates that fast-path required field checks will not
add much cost, so these results are incomplete but not wholly
inaccurate.

name                                           old time/op  new time/op  delta
/dataset.google_message3_1.pb/Marshal-12        219ms ± 1%   232ms ± 1%   +5.85%  (p=0.004 n=6+5)
/dataset.google_message2.pb/Marshal-12          261µs ± 3%   248µs ± 1%   -5.14%  (p=0.002 n=6+6)
/dataset.google_message1_proto2.pb/Marshal-12   681ns ± 2%   637ns ± 3%   -6.53%  (p=0.002 n=6+6)
/dataset.google_message1_proto3.pb/Marshal-12  1.10µs ± 8%  0.99µs ± 3%   -9.63%  (p=0.002 n=6+6)
/dataset.google_message3_3.pb/Marshal-12       44.2ms ± 3%  35.2ms ± 1%  -20.28%  (p=0.004 n=6+5)
/dataset.google_message4.pb/Marshal-12         91.4ms ± 2%  94.9ms ± 2%   +3.78%  (p=0.002 n=6+6)
/dataset.google_message3_2.pb/Marshal-12       78.7ms ± 6%  80.8ms ± 4%     ~     (p=0.310 n=6+6)
/dataset.google_message3_4.pb/Marshal-12       10.6ms ± 3%  10.6ms ± 8%     ~     (p=0.662 n=5+6)
/dataset.google_message3_5.pb/Marshal-12        675ms ± 4%   510ms ± 2%  -24.40%  (p=0.002 n=6+6)
/dataset.google_message3_1.pb/Marshal           219ms ± 1%   236ms ± 7%   +8.06%  (p=0.004 n=5+6)
/dataset.google_message2.pb/Marshal             257µs ± 1%   250µs ± 3%     ~     (p=0.052 n=5+6)
/dataset.google_message1_proto2.pb/Marshal      685ns ± 1%   628ns ± 1%   -8.41%  (p=0.008 n=5+5)
/dataset.google_message1_proto3.pb/Marshal     1.08µs ± 1%  0.98µs ± 2%   -9.31%  (p=0.004 n=5+6)
/dataset.google_message3_3.pb/Marshal          43.7ms ± 1%  35.1ms ± 1%  -19.76%  (p=0.002 n=6+6)
/dataset.google_message4.pb/Marshal            93.4ms ± 4%  94.9ms ± 2%     ~     (p=0.180 n=6+6)
/dataset.google_message3_2.pb/Marshal           105ms ± 2%    98ms ± 7%   -6.81%  (p=0.009 n=5+6)
/dataset.google_message3_4.pb/Marshal          16.3ms ± 6%  15.7ms ± 3%   -3.44%  (p=0.041 n=6+6)
/dataset.google_message3_5.pb/Marshal           676ms ± 4%   504ms ± 2%  -25.50%  (p=0.004 n=6+5)
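
A minimal sketch of how AllowPartial is set on the v2 side, for
reference only (this assumes the current google.golang.org/protobuf
module path; durationpb is used just to keep the example
self-contained, while the benchmarks above exercise the reference
datasets):

	package main

	import (
		"fmt"
		"time"

		"google.golang.org/protobuf/proto"
		"google.golang.org/protobuf/types/known/durationpb"
	)

	func main() {
		// Any generated message works here; durationpb just keeps the
		// example self-contained.
		m := durationpb.New(2 * time.Second)

		// AllowPartial:true skips the required-field check, which still
		// goes through reflection rather than the fast path.
		b, err := proto.MarshalOptions{AllowPartial: true}.Marshal(m)
		if err != nil {
			panic(err)
		}
		fmt.Printf("marshaled %d bytes: %x\n", len(b), b)
	}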

Change-Id: I72cc4597117f4cf5d236ef505777d49dd4a5f75d
Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/171020
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
diff --git a/internal/impl/pointer_unsafe.go b/internal/impl/pointer_unsafe.go
index 16078bd..f04a561 100644
--- a/internal/impl/pointer_unsafe.go
+++ b/internal/impl/pointer_unsafe.go
@@ -20,6 +20,15 @@
 	return offset(f.Offset)
 }
 
+// IsValid reports whether the offset is valid.
+func (f offset) IsValid() bool { return f != invalidOffset }
+
+// invalidOffset is an invalid field offset.
+var invalidOffset = ^offset(0)
+
+// zeroOffset is a noop when calling pointer.Apply.
+var zeroOffset = offset(0)
+
 // pointer is a pointer to a message struct or field.
 type pointer struct{ p unsafe.Pointer }
 
@@ -63,3 +72,44 @@
 	// TODO: Use tricky unsafe magic to directly create ifaceHeader.
 	return p.AsValueOf(t).Interface()
 }
+
+func (p pointer) Bool() *bool                     { return (*bool)(p.p) }
+func (p pointer) BoolPtr() **bool                 { return (**bool)(p.p) }
+func (p pointer) BoolSlice() *[]bool              { return (*[]bool)(p.p) }
+func (p pointer) Int32() *int32                   { return (*int32)(p.p) }
+func (p pointer) Int32Ptr() **int32               { return (**int32)(p.p) }
+func (p pointer) Int32Slice() *[]int32            { return (*[]int32)(p.p) }
+func (p pointer) Int64() *int64                   { return (*int64)(p.p) }
+func (p pointer) Int64Ptr() **int64               { return (**int64)(p.p) }
+func (p pointer) Int64Slice() *[]int64            { return (*[]int64)(p.p) }
+func (p pointer) Uint32() *uint32                 { return (*uint32)(p.p) }
+func (p pointer) Uint32Ptr() **uint32             { return (**uint32)(p.p) }
+func (p pointer) Uint32Slice() *[]uint32          { return (*[]uint32)(p.p) }
+func (p pointer) Uint64() *uint64                 { return (*uint64)(p.p) }
+func (p pointer) Uint64Ptr() **uint64             { return (**uint64)(p.p) }
+func (p pointer) Uint64Slice() *[]uint64          { return (*[]uint64)(p.p) }
+func (p pointer) Float32() *float32               { return (*float32)(p.p) }
+func (p pointer) Float32Ptr() **float32           { return (**float32)(p.p) }
+func (p pointer) Float32Slice() *[]float32        { return (*[]float32)(p.p) }
+func (p pointer) Float64() *float64               { return (*float64)(p.p) }
+func (p pointer) Float64Ptr() **float64           { return (**float64)(p.p) }
+func (p pointer) Float64Slice() *[]float64        { return (*[]float64)(p.p) }
+func (p pointer) String() *string                 { return (*string)(p.p) }
+func (p pointer) StringPtr() **string             { return (**string)(p.p) }
+func (p pointer) StringSlice() *[]string          { return (*[]string)(p.p) }
+func (p pointer) Bytes() *[]byte                  { return (*[]byte)(p.p) }
+func (p pointer) BytesSlice() *[][]byte           { return (*[][]byte)(p.p) }
+func (p pointer) Extensions() *legacyExtensionMap { return (*legacyExtensionMap)(p.p) }
+
+func (p pointer) Elem() pointer {
+	return pointer{p: *(*unsafe.Pointer)(p.p)}
+}
+
+// PointerSlice loads []*T from p as a []pointer.
+// The value returned is aliased with the original slice.
+// This behavior differs from the implementation in pointer_reflect.go.
+func (p pointer) PointerSlice() []pointer {
+	// Super-tricky - p should point to a []*T where T is a
+	// message type. We load it as []pointer.
+	return *(*[]pointer)(p.p)
+}
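
For illustration only (not part of the diff above): a self-contained
sketch of the offset/unsafe.Pointer technique these accessors rely on.
The offset, pointer, Apply, and Bool names mirror the internal types;
Msg and everything in main are hypothetical stand-ins.

	package main

	import (
		"fmt"
		"unsafe"
	)

	// Illustrative copies of the internal types; the real ones live in
	// internal/impl/pointer_unsafe.go and are not exported.
	type offset uintptr

	type pointer struct{ p unsafe.Pointer }

	// Apply advances the pointer by a field offset, as the fast-path
	// field coders do when walking a message struct.
	func (p pointer) Apply(f offset) pointer {
		return pointer{p: unsafe.Pointer(uintptr(p.p) + uintptr(f))}
	}

	// Bool reinterprets the pointed-at memory as a *bool.
	func (p pointer) Bool() *bool { return (*bool)(p.p) }

	// Msg stands in for a generated message struct.
	type Msg struct {
		Name    string
		Enabled bool
	}

	func main() {
		m := &Msg{Name: "x", Enabled: true}

		// The real implementation caches reflect.StructField.Offset values
		// when it builds a message's coder table.
		off := offset(unsafe.Offsetof(m.Enabled))

		// Read the field directly through its offset, without reflection.
		p := pointer{p: unsafe.Pointer(m)}
		fmt.Println(*p.Apply(off).Bool()) // true
	}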