Port SkEvalQuadAt to real Sk2f.

The bench improves from 39 to 30: about half of the gain comes from porting to
Sk2f, and half from using x.add(x) instead of x.multiply(two).

Remove the experimental Sk4f Load2()/store2() helpers and their test now that we have Sk2f.

BUG=skia:

Review URL: https://codereview.chromium.org/1019773004
diff --git a/src/core/Sk4x.h b/src/core/Sk4x.h
index a53b975..d280c1b 100644
--- a/src/core/Sk4x.h
+++ b/src/core/Sk4x.h
@@ -37,18 +37,6 @@
     void store       (T[4]) const;
     void storeAligned(T[4]) const;
 
-    // Experimental!
-    static Sk4x Load2(const T src[2]) {
-        const T padded[4] = { src[0], src[1], 0, 0 };
-        return Load(padded);
-    }
-    void store2(T dst[2]) const {
-        T padded[4];
-        this->store(padded);
-        dst[0] = padded[0];
-        dst[1] = padded[1];
-    }
-
     template <typename Dst> Dst reinterpret() const;
     template <typename Dst> Dst        cast() const;
 
diff --git a/src/core/SkGeometry.cpp b/src/core/SkGeometry.cpp
index 063c782..88c4b60 100644
--- a/src/core/SkGeometry.cpp
+++ b/src/core/SkGeometry.cpp
@@ -117,7 +117,7 @@
 void SkEvalQuadAt(const SkPoint src[3], SkScalar t, SkPoint* pt, SkVector* tangent) {
     SkASSERT(src);
     SkASSERT(t >= 0 && t <= SK_Scalar1);
-    
+
     if (pt) {
         pt->set(eval_quad(&src[0].fX, t), eval_quad(&src[0].fY, t));
     }
@@ -127,24 +127,24 @@
     }
 }
 
-#include "Sk4x.h"
+#include "Sk2x.h"
 
 SkPoint SkEvalQuadAt(const SkPoint src[3], SkScalar t) {
     SkASSERT(src);
     SkASSERT(t >= 0 && t <= SK_Scalar1);
 
-    const Sk4f t2(t);
-    const Sk4f two(2);
-    
-    Sk4f P0 = Sk4f::Load2(&src[0].fX);
-    Sk4f P1 = Sk4f::Load2(&src[1].fX);
-    Sk4f P2 = Sk4f::Load2(&src[2].fX);
-    
-    Sk4f A = P2.subtract(P1.multiply(two)).add(P0);
-    Sk4f B = P1.subtract(P0).multiply(two);
-    
+    const Sk2f t2(t);
+
+    Sk2f P0 = Sk2f::Load(&src[0].fX);
+    Sk2f P1 = Sk2f::Load(&src[1].fX);
+    Sk2f P2 = Sk2f::Load(&src[2].fX);
+
+    Sk2f A = P2.subtract(P1.add(P1)).add(P0);
+    Sk2f B = P1.subtract(P0);
+    B = B.add(B);
+
     SkPoint result;
-    A.multiply(t2).add(B).multiply(t2).add(P0).store2(&result.fX);
+    A.multiply(t2).add(B).multiply(t2).add(P0).store(&result.fX);
     return result;
 }
 
@@ -1389,7 +1389,7 @@
     }
 
     SkP3D src[3], dst[3];
-    
+
     ratquad_mapTo3D(pts, w, src);
 
     matrix.mapHomogeneousPoints(&dst[0].fX, &src[0].fX, 3);
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp
index 05863ff..1cecd4f 100644
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -42,15 +42,6 @@
                        fs[2] == 6 &&
                        fs[3] == 7 &&
                        fs[4] == 8);
-
-    // Load2 and store2().
-    float two[2] = { 1.0f, 2.0f };
-    Sk4f twoIn4f = Sk4f::Load2(two);
-    twoIn4f = twoIn4f.multiply(Sk4f(2.0f));
-    twoIn4f.store2(two);
-
-    REPORTER_ASSERT(r, two[0] == 2.0f);
-    REPORTER_ASSERT(r, two[1] == 4.0f);
 }
 
 DEF_TEST(Sk4x_Conversions, r) {