Align all allocations to a 16-byte boundary.

This change also fixes an issue in the Blur intrinsic, where a float array
was incorrectly cast to float4, which caused new alignment errors with the
updated LLVM.

Change-Id: I3955b38f156c35f4d160652c75ab416bae09b2c8
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 2f0c0d8..57e8b6e 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -310,16 +310,16 @@
         }
         ptr = (uint8_t*)alloc->mHal.state.userProvidedPtr;
     } else {
-        if (forceZero) {
-            ptr = (uint8_t *)calloc(1, allocSize);
-        } else {
-            ptr = (uint8_t *)malloc(allocSize);
-        }
+        // We align all allocations to a 16-byte boundary.
+        ptr = (uint8_t *)memalign(16, allocSize);
         if (!ptr) {
             alloc->mHal.drv = NULL;
             free(drv);
             return false;
         }
+        if (forceZero) {
+            memset(ptr, 0, allocSize);
+        }
     }
     // Build the pointer tables
     size_t verifySize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), ptr);