Replace _mm_cvtps_epi32(x) with _mm_cvttps_epi32(_mm_add_ps(0.5f, x)).

We don't have control over which way _mm_cvtps_epi32 rounds: it follows the current MXCSR rounding mode.

  - This makes the SSE SkPMFloat rounding consistent with _neon and _none.
  - Sk4f::cast<Sk4i>() is closer to (int)float's behavior.  (Correct when >=0).

Add tests that would fail at head.

BUG=skia:

Review URL: https://codereview.chromium.org/1029163002
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp
index d7a016c..8c3b977 100644
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -59,6 +59,9 @@
     ASSERT_NE(twoi, twof.reinterpret<Sk4i>());
     ASSERT_EQ(twof, twoi.cast<Sk4f>());
     ASSERT_NE(twof, twoi.reinterpret<Sk4f>());
+
+    ASSERT_EQ(Sk4i(0,0,0,0), Sk4f(0.5f, 0.49f, 0.51f, 0.99f).cast<Sk4i>());
+    ASSERT_EQ(Sk4i(1,1,1,1), Sk4f(1.5f, 1.49f, 1.51f, 1.99f).cast<Sk4i>());
 }
 
 DEF_TEST(Sk4x_Bits, r) {