Align all allocations to a 16-byte boundary.

This change also fixes an issue in the Blur intrinsic, where a float
stack buffer was cast to float4 * without the alignment that float4
requires (which caused new alignment errors with the updated LLVM).

Change-Id: I3955b38f156c35f4d160652c75ab416bae09b2c8
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 0d9fde8..d44b07a 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -272,8 +272,8 @@
                                          uint32_t xstart, uint32_t xend,
                                          uint32_t instep, uint32_t outstep) {
 
-    float stackbuf[4 * 2048];
-    float *buf = &stackbuf[0];
+    float4 stackbuf[2048];
+    float4 *buf = &stackbuf[0];
     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
     if (!cp->mAlloc.get()) {
         ALOGE("Blur executed without input, skipping");
@@ -291,7 +291,7 @@
             cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], p->dimX * 16);
             cp->mScratchSize[p->lid] = p->dimX;
         }
-        buf = (float *)cp->mScratch[p->lid];
+        buf = (float4 *)cp->mScratch[p->lid];
     }
     float4 *fout = (float4 *)buf;
     int y = p->y;
@@ -308,20 +308,20 @@
 
     x1 = xstart;
     while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
-        OneHU4(p, out, x1, (float4 *)buf, cp->mFp, cp->mIradius);
+        OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
 #if defined(ARCH_ARM_HAVE_NEON)
     if ((x1 + cp->mIradius) < x2) {
-        rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->mIradius, cp->mFp,
+        rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
                                cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
         out += (x2 - cp->mIradius) - x1;
         x1 = x2 - cp->mIradius;
     }
 #endif
     while(x2 > x1) {
-        OneHU4(p, out, x1, (float4 *)buf, cp->mFp, cp->mIradius);
+        OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
         out++;
         x1++;
     }
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 2f0c0d8..57e8b6e 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -310,16 +310,16 @@
         }
         ptr = (uint8_t*)alloc->mHal.state.userProvidedPtr;
     } else {
-        if (forceZero) {
-            ptr = (uint8_t *)calloc(1, allocSize);
-        } else {
-            ptr = (uint8_t *)malloc(allocSize);
-        }
+        // We align all allocations to a 16-byte boundary.
+        ptr = (uint8_t *)memalign(16, allocSize);
         if (!ptr) {
             alloc->mHal.drv = NULL;
             free(drv);
             return false;
         }
+        if (forceZero) {
+            memset(ptr, 0, allocSize);
+        }
     }
     // Build the pointer tables
     size_t verifySize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), ptr);