Fix alignment problems in NEON Sk4b.

As written at head, the casts to uint32_t* let the compiler assume these
loads and stores are 4-byte aligned [1].  We want Sk4b to load from and
store to any 1-byte aligned address, to prevent crashes like [2].

[1] https://llvm.org/bugs/show_bug.cgi?id=24421
[2] https://luci-milo.appspot.com/swarming/task/304079e125b1b910/steps/nanobench/0/stdout

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2183133002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2183133002
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index df11de3..23567b7 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -296,6 +296,8 @@
 template <>
 class SkNx<4, uint8_t> {
 public:
+    typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;
+
     SkNx(const uint8x8_t& vec) : fVec(vec) {}
 
     SkNx() {}
@@ -303,10 +305,10 @@
         fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
     }
     static SkNx Load(const void* ptr) {
-        return (uint8x8_t)vld1_dup_u32((const uint32_t*)ptr);
+        return (uint8x8_t)vld1_dup_u32((const unaligned_uint32_t*)ptr);
     }
     void store(void* ptr) const {
-        return vst1_lane_u32((uint32_t*)ptr, (uint32x2_t)fVec, 0);
+        return vst1_lane_u32((unaligned_uint32_t*)ptr, (uint32x2_t)fVec, 0);
     }
     uint8_t operator[](int k) const {
         SkASSERT(0 <= k && k < 4);