Add array launch support.

Change-Id: I66cd89b5b44eafa92f391708a06464cd7cdde3ed
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 7d614da..a4e3059 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -880,7 +880,7 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
@@ -898,7 +898,7 @@
 
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                          "rsForEach called with null in allocations");
-            return;
+            return false;
         }
     }
 
@@ -907,7 +907,7 @@
 
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                      "rsForEach called with null out allocations");
-        return;
+        return false;
     }
 
     if (inLen > 0) {
@@ -923,7 +923,7 @@
                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                   "Failed to launch kernel; dimensions of input and output allocations do not match.");
 
-                return;
+                return false;
             }
         }
 
@@ -937,7 +937,7 @@
     } else {
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                      "rsForEach called with null allocations");
-        return;
+        return false;
     }
 
     if (inLen > 0 && aout != nullptr) {
@@ -945,49 +945,70 @@
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
 
-            return;
+            return false;
         }
     }
 
     if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dim.x;
+        mtls->end.x = mtls->fep.dim.x;
     } else {
-        rsAssert(sc->xStart < mtls->fep.dim.x);
-        rsAssert(sc->xEnd <= mtls->fep.dim.x);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dim.x, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dim.x, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
+        mtls->start.x = rsMin(mtls->fep.dim.x, sc->xStart);
+        mtls->end.x = rsMin(mtls->fep.dim.x, sc->xEnd);
+        if (mtls->start.x >= mtls->end.x) return false;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dim.y;
+        mtls->end.y = mtls->fep.dim.y;
     } else {
-        rsAssert(sc->yStart < mtls->fep.dim.y);
-        rsAssert(sc->yEnd <= mtls->fep.dim.y);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dim.y, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dim.y, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
+        mtls->start.y = rsMin(mtls->fep.dim.y, sc->yStart);
+        mtls->end.y = rsMin(mtls->fep.dim.y, sc->yEnd);
+        if (mtls->start.y >= mtls->end.y) return false;
     }
 
     if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dim.z;
+        mtls->end.z = mtls->fep.dim.z;
     } else {
-        rsAssert(sc->zStart < mtls->fep.dim.z);
-        rsAssert(sc->zEnd <= mtls->fep.dim.z);
-        rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dim.z, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dim.z, sc->zEnd);
-        if (mtls->zStart >= mtls->zEnd) return;
+        mtls->start.z = rsMin(mtls->fep.dim.z, sc->zStart);
+        mtls->end.z = rsMin(mtls->fep.dim.z, sc->zEnd);
+        if (mtls->start.z >= mtls->end.z) return false;
     }
 
-    mtls->xEnd     = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd     = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+    if (!sc || (sc->arrayEnd == 0)) {
+        mtls->end.array[0] = mtls->fep.dim.array[0];
+    } else {
+        mtls->start.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayStart);
+        mtls->end.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayEnd);
+        if (mtls->start.array[0] >= mtls->end.array[0]) return false;
+    }
 
-    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
+    if (!sc || (sc->array2End == 0)) {
+        mtls->end.array[1] = mtls->fep.dim.array[1];
+    } else {
+        mtls->start.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2Start);
+        mtls->end.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2End);
+        if (mtls->start.array[1] >= mtls->end.array[1]) return false;
+    }
+
+    if (!sc || (sc->array3End == 0)) {
+        mtls->end.array[2] = mtls->fep.dim.array[2];
+    } else {
+        mtls->start.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3Start);
+        mtls->end.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3End);
+        if (mtls->start.array[2] >= mtls->end.array[2]) return false;
+    }
+
+    if (!sc || (sc->array4End == 0)) {
+        mtls->end.array[3] = mtls->fep.dim.array[3];
+    } else {
+        mtls->start.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4Start);
+        mtls->end.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4End);
+        if (mtls->start.array[3] >= mtls->end.array[3]) return false;
+    }
+
+
+    // The X & Y walkers always need an end of at least 1 (a 0-1 range), even if that dimension is not present
+    mtls->end.x    = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y    = rsMax((uint32_t)1, mtls->end.y);
 
     mtls->rsc        = mCtx;
     if (ains) {
@@ -1013,6 +1034,9 @@
         mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
         mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
     }
+
+    // All validation passed, ok to launch threads
+    return true;
 }
 
 
@@ -1026,12 +1050,13 @@
 
     MTLaunchStruct mtls;
 
-    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
-    forEachKernelSetup(slot, &mtls);
+    if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
+        forEachKernelSetup(slot, &mtls);
 
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
+        RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+        mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+        mCtx->setTLS(oldTLS);
+    }
 }
 
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {