Add sqrt() and rsqrt() to Sk4f.

This doesn't add them to the second-stringer Sk4i.  It's unclear whether we'd need
integer square roots often, and we have no efficient way to compute them except via floats.

BUG=skia:

Review URL: https://codereview.chromium.org/964603002
diff --git a/src/core/Sk4x.h b/src/core/Sk4x.h
index 058c400..b01b6f1 100644
--- a/src/core/Sk4x.h
+++ b/src/core/Sk4x.h
@@ -50,6 +50,9 @@
     Sk4x multiply(const Sk4x&) const;
     Sk4x   divide(const Sk4x&) const;
 
+    Sk4x rsqrt() const;   // Approximate reciprocal sqrt(), i.e. roughly 1/sqrt(x) per lane.
+    Sk4x  sqrt() const;   // this->multiply(this->rsqrt()) may be faster, but less precise.
+
     Sk4i            equal(const Sk4x&) const;
     Sk4i         notEqual(const Sk4x&) const;
     Sk4i         lessThan(const Sk4x&) const;
diff --git a/src/core/Sk4x_portable.h b/src/core/Sk4x_portable.h
index bd056c7..440e91f 100644
--- a/src/core/Sk4x_portable.h
+++ b/src/core/Sk4x_portable.h
@@ -2,6 +2,8 @@
 // This file will be intentionally included three times.
 
 #if defined(SK4X_PREAMBLE)
+    #include "SkFloatingPoint.h"
+    #include <math.h>
 
 #elif defined(SK4X_PRIVATE)
     typedef T Type;
@@ -60,6 +62,20 @@
 M(Sk4x<T>)   divide(const Sk4x<T>& other) const { return Sk4x(BINOP(/)); }
 #undef BINOP
 
+template<> inline Sk4f Sk4f::rsqrt() const {
+    // Approximate 1/sqrt(x) for each of the four lanes, computed one scalar at a time.
+    const float r0 = sk_float_rsqrt(fVec[0]), r1 = sk_float_rsqrt(fVec[1]),
+                r2 = sk_float_rsqrt(fVec[2]), r3 = sk_float_rsqrt(fVec[3]);
+    return Sk4f(r0, r1, r2, r3);
+}
+
+template<> inline Sk4f Sk4f::sqrt() const {
+    // Square root of each of the four lanes via the C library's sqrtf().
+    const float s0 = sqrtf(fVec[0]), s1 = sqrtf(fVec[1]),
+                s2 = sqrtf(fVec[2]), s3 = sqrtf(fVec[3]);
+    return Sk4f(s0, s1, s2, s3);
+}
+
 #define BOOL_BINOP(op) fVec[0] op other.fVec[0] ? -1 : 0, \
                        fVec[1] op other.fVec[1] ? -1 : 0, \
                        fVec[2] op other.fVec[2] ? -1 : 0, \
diff --git a/src/core/Sk4x_sse.h b/src/core/Sk4x_sse.h
index ee09f77..6077d02 100644
--- a/src/core/Sk4x_sse.h
+++ b/src/core/Sk4x_sse.h
@@ -99,6 +99,9 @@
 M(Sk4f) multiply(const Sk4f& o) const { return _mm_mul_ps(fVec, o.fVec); }
 M(Sk4f) divide  (const Sk4f& o) const { return _mm_div_ps(fVec, o.fVec); }
 
+M(Sk4f) rsqrt() const { return _mm_rsqrt_ps(fVec); }  // Approximate 1/sqrt(x) in each lane.
+M(Sk4f)  sqrt() const { return _mm_sqrt_ps( fVec); }  // sqrt(x) in each lane.
+
 M(Sk4i) equal           (const Sk4f& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
 M(Sk4i) notEqual        (const Sk4f& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
 M(Sk4i) lessThan        (const Sk4f& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp
index 0985c3b..23eaf68 100644
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -87,6 +87,18 @@
     ASSERT_EQ(Sk4f(2,4,6,8), Sk4f(1,2,3,4).multiply(2.0f));
 }
 
+DEF_TEST(Sk4x_Sqrt, r) {
+    Sk4f squares(4, 16, 25, 121),
+           roots(2,  4,  5,  11);
+    // .sqrt() should be pretty precise.
+    ASSERT_EQ(roots, squares.sqrt());
+
+    // x*rsqrt(x) only approximates sqrt(x); it may land on either side of it, or be exact.
+    Sk4f error = roots.subtract(squares.multiply(squares.rsqrt()));
+    REPORTER_ASSERT(r, error.greaterThan(-0.01f).allTrue());
+    REPORTER_ASSERT(r, error.lessThan(0.01f).allTrue());
+}
+
 DEF_TEST(Sk4x_Comparison, r) {
     ASSERT_EQ(Sk4f(1,2,3,4), Sk4f(1,2,3,4));
     ASSERT_NE(Sk4f(4,3,2,1), Sk4f(1,2,3,4));