Merge "Fix crash running blur on 4K images"
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index f3a656d..b2bd3ce 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -289,10 +289,12 @@
 
     if (p->dimX > 2048) {
         if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) {
-            cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], p->dimX * 16);
+            // Pad the size of the allocation by one unit to allow alignment later
+            cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], (p->dimX + 1) * 16);
             cp->mScratchSize[p->lid] = p->dimX;
         }
-        buf = (float4 *)cp->mScratch[p->lid];
+        // realloc only aligns to 8 bytes so we manually align to 16.
+        buf = (float4 *) ((((intptr_t)cp->mScratch[p->lid]) + 15) & ~0xf);
     }
     float4 *fout = (float4 *)buf;
     int y = p->y;
@@ -407,6 +409,8 @@
 
     mScratch = new void *[mCtx->getThreadCount()];
     mScratchSize = new size_t[mCtx->getThreadCount()];
+    memset(mScratch, 0, sizeof(void *) * mCtx->getThreadCount());
+    memset(mScratchSize, 0, sizeof(size_t) * mCtx->getThreadCount());
 
     ComputeGaussianWeights();
 }
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index e2ac102..755e8d0 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -58,13 +58,23 @@
     static void kernelP1U2(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
     static void kernelP1U1(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
 
+    static void kernelP1L4(const RsForEachStubParamStruct *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+    static void kernelP1L3(const RsForEachStubParamStruct *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+    static void kernelP1L2(const RsForEachStubParamStruct *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+    static void kernelP1L1(const RsForEachStubParamStruct *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t instep, uint32_t outstep);
+
 };
 
 }
@@ -92,11 +102,11 @@
                                       uint32_t usrLen, const RsScriptCall *sc) {
 
     const uint32_t threads = mCtx->getThreadCount();
-    const uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
+    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
 
     switch (slot) {
     case 0:
-        switch(mAllocOut->getType()->getElement()->getVectorSize()) {
+        switch(vSize) {
         case 1:
             mRootPtr = &kernelP1U1;
             break;
@@ -105,6 +115,7 @@
             break;
         case 3:
             mRootPtr = &kernelP1U3;
+            vSize = 4;
             break;
         case 4:
             mRootPtr = &kernelP1U4;
@@ -112,10 +123,23 @@
         }
         break;
     case 1:
-        mRootPtr = &kernelP1L;
+        switch(ain->getType()->getElement()->getVectorSize()) {
+        case 1:
+            mRootPtr = &kernelP1L1;
+            break;
+        case 2:
+            mRootPtr = &kernelP1L2;
+            break;
+        case 3:
+            mRootPtr = &kernelP1L3;
+            break;
+        case 4:
+            mRootPtr = &kernelP1L4;
+            break;
+        }
         break;
     }
-    memset(mSums, 0, 256 * 4 * threads * vSize);
+    memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
 }
 
 void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
@@ -126,6 +150,8 @@
     uint32_t threads = mCtx->getThreadCount();
     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
 
+    if (vSize == 3) vSize = 4;
+
     for (uint32_t ct=0; ct < (256 * vSize); ct++) {
         o[ct] = mSums[ct];
         for (uint32_t t=1; t < threads; t++) {
@@ -147,7 +173,7 @@
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += 4;
+        in += instep;
     }
 }
 
@@ -163,7 +189,7 @@
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += 4;
+        in += instep;
     }
 }
 
@@ -176,15 +202,15 @@
     int * sums = &cp->mSums[256 * 2 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
-        sums[(in[0] << 2)    ] ++;
-        sums[(in[1] << 2) + 1] ++;
-        in += 2;
+        sums[(in[0] << 1)    ] ++;
+        sums[(in[1] << 1) + 1] ++;
+        in += instep;
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L(const RsForEachStubParamStruct *p,
-                                               uint32_t xstart, uint32_t xend,
-                                               uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
     uchar *in = (uchar *)p->in;
@@ -195,8 +221,56 @@
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
-        sums[t >> 8] ++;
-        in += 4;
+        sums[(t + 0x7f) >> 8] ++;
+        in += instep;
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {  // outstep is not read in this body
+    // Slot-1 kernel chosen by preLaunch for 3-component input: each iteration increments one entry of the mSums slice below.
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;  // p->usr carries the intrinsic instance
+    uchar *in = (uchar *)p->in;  // input is walked as raw bytes
+    int * sums = &cp->mSums[256 * p->lid];  // slice of mSums starting at 256 * p->lid
+
+    for (uint32_t x = xstart; x < xend; x++) {  // x only counts iterations; it never indexes the input
+        int t = (cp->mDotI[0] * in[0]) +  // weighted sum of the first three bytes using mDotI[0..2]
+                (cp->mDotI[1] * in[1]) +
+                (cp->mDotI[2] * in[2]);
+        sums[(t + 0x7f) >> 8] ++;  // bias by 0x7f, then drop the low 8 bits to get the bin index
+        in += instep;  // advance by instep bytes to the next element
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {  // outstep is not read in this body
+    // Slot-1 kernel chosen by preLaunch for 2-component input: each iteration increments one entry of the mSums slice below.
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;  // p->usr carries the intrinsic instance
+    uchar *in = (uchar *)p->in;  // input is walked as raw bytes
+    int * sums = &cp->mSums[256 * p->lid];  // slice of mSums starting at 256 * p->lid
+
+    for (uint32_t x = xstart; x < xend; x++) {  // x only counts iterations; it never indexes the input
+        int t = (cp->mDotI[0] * in[0]) +  // weighted sum of the first two bytes using mDotI[0..1]
+                (cp->mDotI[1] * in[1]);
+        sums[(t + 0x7f) >> 8] ++;  // bias by 0x7f, then drop the low 8 bits to get the bin index
+        in += instep;  // advance by instep bytes to the next element
+    }
+}
+
+void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
+                                                uint32_t xstart, uint32_t xend,
+                                                uint32_t instep, uint32_t outstep) {  // outstep is not read in this body
+    // Slot-1 kernel chosen by preLaunch for 1-component input: each iteration increments one entry of the mSums slice below.
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;  // p->usr carries the intrinsic instance
+    uchar *in = (uchar *)p->in;  // input is walked as raw bytes
+    int * sums = &cp->mSums[256 * p->lid];  // slice of mSums starting at 256 * p->lid
+
+    for (uint32_t x = xstart; x < xend; x++) {  // x only counts iterations; it never indexes the input
+        int t = (cp->mDotI[0] * in[0]);  // single byte scaled by mDotI[0]
+        sums[(t + 0x7f) >> 8] ++;  // bias by 0x7f, then drop the low 8 bits to get the bin index
+        in += instep;  // advance by instep bytes to the next element
     }
 }
 
@@ -204,6 +278,14 @@
                                                 uint32_t xstart, uint32_t xend,
                                                 uint32_t instep, uint32_t outstep) {
 
+    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
+    uchar *in = (uchar *)p->in;
+    int * sums = &cp->mSums[256 * p->lid];
+
+    for (uint32_t x = xstart; x < xend; x++) {
+        sums[in[0]] ++;
+        in += instep;
+    }
 }
 
 
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 927168a..ab439e6 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -22,9 +22,13 @@
 using namespace android::renderscript;
 
 ScriptIntrinsic::ScriptIntrinsic(Context *rsc) : Script(rsc) {
+    mIntrinsicID = 0;  // 0 is the value ~ScriptIntrinsic() treats as "nothing to destroy"
 }
 
 ScriptIntrinsic::~ScriptIntrinsic() {
+    if (mIntrinsicID != 0) {  // still 0 (the constructor's value) means there is nothing to tear down
+        mRSC->mHal.funcs.script.destroy(mRSC, this);  // call the mHal script destroy callback for this object
+    }
 }
 
 bool ScriptIntrinsic::init(Context *rsc, RsScriptIntrinsicID iid, Element *e) {