Add divide to Sk2x; use native vdiv and vsqrt on ARM64.

Tests pass on N7 + N9.

BUG=skia:

CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Mac10.7-Clang-Arm7-Debug-iOS-Trybot,Build-Ubuntu-GCC-Arm64-Release-Android-Trybot

Review URL: https://codereview.chromium.org/1027753003
diff --git a/src/core/Sk2x.h b/src/core/Sk2x.h
index a64ad72..9b4e5ee 100644
--- a/src/core/Sk2x.h
+++ b/src/core/Sk2x.h
@@ -49,14 +49,17 @@
     Sk2x      add(const Sk2x&) const;
     Sk2x subtract(const Sk2x&) const;
     Sk2x multiply(const Sk2x&) const;
+    Sk2x   divide(const Sk2x&) const;
 
     Sk2x operator +(const Sk2x& o) const { return this->add(o); }
     Sk2x operator -(const Sk2x& o) const { return this->subtract(o); }
     Sk2x operator *(const Sk2x& o) const { return this->multiply(o); }
+    Sk2x operator /(const Sk2x& o) const { return this->divide(o); }
 
     Sk2x& operator +=(const Sk2x& o) { return (*this = *this + o); }
     Sk2x& operator -=(const Sk2x& o) { return (*this = *this - o); }
     Sk2x& operator *=(const Sk2x& o) { return (*this = *this * o); }
+    Sk2x& operator /=(const Sk2x& o) { return (*this = *this / o); }
 
     Sk2x negate() const { return Sk2x((T)0) - *this; }
     Sk2x operator -() const { return this->negate(); }
diff --git a/src/opts/Sk2x_neon.h b/src/opts/Sk2x_neon.h
index 00ab00a..ef61df4 100644
--- a/src/opts/Sk2x_neon.h
+++ b/src/opts/Sk2x_neon.h
@@ -41,6 +41,16 @@
 M(Sk2f)      add(const Sk2f& o) const { return vadd_f32(fVec, o.fVec); }
 M(Sk2f) subtract(const Sk2f& o) const { return vsub_f32(fVec, o.fVec); }
 M(Sk2f) multiply(const Sk2f& o) const { return vmul_f32(fVec, o.fVec); }
+M(Sk2f)   divide(const Sk2f& o) const {  // Per-lane quotient: this / o.
+#if defined(SK_CPU_ARM64)
+    return vdiv_f32(fVec, o.fVec);  // ARM64 build: use the native vector divide.
+#else
+    float32x2_t est0 = vrecpe_f32(o.fVec),  // Initial estimate of 1/o.
+                est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),  // First refinement of the estimate.
+                est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);  // Second refinement.
+    return vmul_f32(est2, fVec);  // this * (refined estimate of 1/o) approximates this / o.
+#endif
+}
 
 M(Sk2f) Min(const Sk2f& a, const Sk2f& b) { return vmin_f32(a.fVec, b.fVec); }
 M(Sk2f) Max(const Sk2f& a, const Sk2f& b) { return vmax_f32(a.fVec, b.fVec); }
@@ -51,10 +61,14 @@
     return est1;
 }
 M(Sk2f)  sqrt() const {
+#if defined(SK_CPU_ARM64)
+    return vsqrt_f32(fVec);  // ARM64 build: use the native vector square root.
+#else
     float32x2_t est1 = this->rsqrt().fVec,
     // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
                 est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
     return vmul_f32(fVec, est2);
+#endif
 }
 
 #undef M
@@ -73,6 +87,7 @@
     M(Sk2d)      add(const Sk2d& o) const { return vaddq_f64(fVec, o.fVec); }
     M(Sk2d) subtract(const Sk2d& o) const { return vsubq_f64(fVec, o.fVec); }
     M(Sk2d) multiply(const Sk2d& o) const { return vmulq_f64(fVec, o.fVec); }
+    M(Sk2d)   divide(const Sk2d& o) const { return vdivq_f64(fVec, o.fVec); }
 
     M(Sk2d) Min(const Sk2d& a, const Sk2d& b) { return vminq_f64(a.fVec, b.fVec); }
     M(Sk2d) Max(const Sk2d& a, const Sk2d& b) { return vmaxq_f64(a.fVec, b.fVec); }
@@ -82,13 +97,7 @@
                     est1 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est0, est0)), est0);
         return est1;
     }
-    M(Sk2d)  sqrt() const {
-        float64x2_t est1 = this->rsqrt().fVec,
-        // Two extra steps of Newton's method to refine the estimate of 1/sqrt(this).
-                    est2 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est1, est1)), est1),
-                    est3 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est2, est2)), est2);
-        return vmulq_f64(fVec, est3);
-    }
+    M(Sk2d)  sqrt() const { return vsqrtq_f64(fVec); }
 
 #else  // Scalar implementation for 32-bit chips, which don't have float64x2_t.
     M() Sk2x() {}
@@ -106,6 +115,7 @@
     M(Sk2d)      add(const Sk2d& o) const { return Sk2d(fVec[0] + o.fVec[0], fVec[1] + o.fVec[1]); }
     M(Sk2d) subtract(const Sk2d& o) const { return Sk2d(fVec[0] - o.fVec[0], fVec[1] - o.fVec[1]); }
     M(Sk2d) multiply(const Sk2d& o) const { return Sk2d(fVec[0] * o.fVec[0], fVec[1] * o.fVec[1]); }
+    M(Sk2d)   divide(const Sk2d& o) const { return Sk2d(fVec[0] / o.fVec[0], fVec[1] / o.fVec[1]); }
 
     M(Sk2d) Min(const Sk2d& a, const Sk2d& b) {
         return Sk2d(SkTMin(a.fVec[0], b.fVec[0]), SkTMin(a.fVec[1], b.fVec[1]));
diff --git a/src/opts/Sk2x_none.h b/src/opts/Sk2x_none.h
index 12daffd..65cc670 100644
--- a/src/opts/Sk2x_none.h
+++ b/src/opts/Sk2x_none.h
@@ -43,6 +43,9 @@
 M(Sk2x<T>) multiply(const Sk2x<T>& o) const {
     return Sk2x<T>(fVec[0] * o.fVec[0], fVec[1] * o.fVec[1]);
 }
+M(Sk2x<T>) divide(const Sk2x<T>& o) const {  // Element-wise this / o, same shape as multiply().
+    return Sk2x<T>(fVec[0] / o.fVec[0], fVec[1] / o.fVec[1]);  // No check for zero elements in o.
+}
 
 M(Sk2x<T>) Min(const Sk2x<T>& a, const Sk2x<T>& b) {
     return Sk2x<T>(SkTMin(a.fVec[0], b.fVec[0]), SkTMin(a.fVec[1], b.fVec[1]));
diff --git a/src/opts/Sk2x_sse.h b/src/opts/Sk2x_sse.h
index 71071c0..111d3c2 100644
--- a/src/opts/Sk2x_sse.h
+++ b/src/opts/Sk2x_sse.h
@@ -38,6 +38,7 @@
 M(Sk2f)      add(const Sk2f& o) const { return _mm_add_ps(fVec, o.fVec); }
 M(Sk2f) subtract(const Sk2f& o) const { return _mm_sub_ps(fVec, o.fVec); }
 M(Sk2f) multiply(const Sk2f& o) const { return _mm_mul_ps(fVec, o.fVec); }
+M(Sk2f)   divide(const Sk2f& o) const { return _mm_div_ps(fVec, o.fVec); }
 
 M(Sk2f) Min(const Sk2f& a, const Sk2f& b) { return _mm_min_ps(a.fVec, b.fVec); }
 M(Sk2f) Max(const Sk2f& a, const Sk2f& b) { return _mm_max_ps(a.fVec, b.fVec); }
@@ -60,6 +61,7 @@
 M(Sk2d)      add(const Sk2d& o) const { return _mm_add_pd(fVec, o.fVec); }
 M(Sk2d) subtract(const Sk2d& o) const { return _mm_sub_pd(fVec, o.fVec); }
 M(Sk2d) multiply(const Sk2d& o) const { return _mm_mul_pd(fVec, o.fVec); }
+M(Sk2d)   divide(const Sk2d& o) const { return _mm_div_pd(fVec, o.fVec); }
 
 M(Sk2d) Min(const Sk2d& a, const Sk2d& b) { return _mm_min_pd(a.fVec, b.fVec); }
 M(Sk2d) Max(const Sk2d& a, const Sk2d& b) { return _mm_max_pd(a.fVec, b.fVec); }
diff --git a/src/opts/Sk4x_neon.h b/src/opts/Sk4x_neon.h
index 92cde11..41f6f46 100644
--- a/src/opts/Sk4x_neon.h
+++ b/src/opts/Sk4x_neon.h
@@ -62,10 +62,14 @@
 M(Sk4f) multiply(const Sk4f& o) const { return vmulq_f32(fVec, o.fVec); }
 
 M(Sk4f) divide  (const Sk4f& o) const {
-    float32x4_t est0 = vrecpeq_f32(o.fVec);
-    float32x4_t est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0);
-    float32x4_t est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
+#if defined(SK_CPU_ARM64)
+    return vdivq_f32(fVec, o.fVec);  // ARM64 build: use the native vector divide.
+#else
+    float32x4_t est0 = vrecpeq_f32(o.fVec),  // Initial estimate of 1/o.
+                est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),  // First refinement of the estimate.
+                est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);  // Second refinement.
     return vmulq_f32(est2, fVec);
+#endif
 }
 
 M(Sk4f) rsqrt() const {
@@ -75,10 +79,14 @@
 }
 
 M(Sk4f)  sqrt() const {
+#if defined(SK_CPU_ARM64)
+    return vsqrtq_f32(fVec);  // ARM64 build: use the native vector square root.
+#else
     float32x4_t est1 = this->rsqrt().fVec,
     // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
                 est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
     return vmulq_f32(fVec, est2);
+#endif
 }
 
 M(Sk4i) equal           (const Sk4f& o) const { return vreinterpretq_s32_u32(vceqq_f32(fVec, o.fVec)); }
diff --git a/tests/Sk2xTest.cpp b/tests/Sk2xTest.cpp
index a3b86b2..b74cc5b 100644
--- a/tests/Sk2xTest.cpp
+++ b/tests/Sk2xTest.cpp
@@ -44,6 +44,7 @@
     REPORTER_ASSERT(r, eq(a + d, 6,   9));
     REPORTER_ASSERT(r, eq(a - d, 2,  -1));
     REPORTER_ASSERT(r, eq(a * d, 8,  20));
+    REPORTER_ASSERT(r, eq(a / d, 2, 0.8));
 
     REPORTER_ASSERT(r, nearly_eq(0.001, a.rsqrt(), 0.5, 0.5));
     REPORTER_ASSERT(r, eq(a.sqrt(), 2, 2));