Add array launch support.

Change-Id: I66cd89b5b44eafa92f391708a06464cd7cdde3ed
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 40f4745..0ec7b28 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -164,7 +164,7 @@
 
     // fast path for very small launches
     MTLaunchStruct *mtls = (MTLaunchStruct *)data;
-    if (mtls && mtls->fep.dim.y <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
+    if (mtls && mtls->fep.dim.y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
         if (mWorkers.mLaunchCallback) {
             mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
         }
@@ -370,7 +370,7 @@
     kparams->z = fep->current.z;
 }
 
-static inline void fepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+static inline void FepPtrSetup(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
                                uint32_t x, uint32_t y,
                                uint32_t z = 0, uint32_t lod = 0,
                                RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
@@ -385,31 +385,89 @@
     }
 }
 
+static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
+    // Empty range: pin the coordinate to start and hand val on unchanged.
+    if (start >= end) {
+        *p = start;
+        return val;
+    }
+    // Otherwise treat val as a mixed-radix number: the remainder selects the
+    // coordinate inside [start, end) and the quotient is returned to the caller.
+    const uint32_t extent = end - start;
+    *p = start + (val % extent);
+    return val / extent;
+}
+
+// Decodes slice index sliceNum into one coordinate per outer dimension (z, lod,
+// face, array[0..3]) in fep->current, so a worker thread can decode into its own
+// copy. Returns false once sliceNum is past the last slice.
+static bool SelectOuterSlice(const MTLaunchStruct *mtls, RsExpandKernelDriverInfo *fep,
+                             uint32_t sliceNum) {
+    uint32_t r = sliceNum;
+    r = sliceInt(&fep->current.z, r, mtls->start.z, mtls->end.z);
+    r = sliceInt(&fep->current.lod, r, mtls->start.lod, mtls->end.lod);
+    r = sliceInt(&fep->current.face, r, mtls->start.face, mtls->end.face);
+    r = sliceInt(&fep->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
+    r = sliceInt(&fep->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
+    r = sliceInt(&fep->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
+    r = sliceInt(&fep->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
+    return r == 0;
+}
+
+// Two-argument form kept for existing callers: decodes directly into mtls->fep.
+static bool SelectOuterSlice(MTLaunchStruct *mtls, uint32_t sliceNum) {
+    return SelectOuterSlice(mtls, &mtls->fep, sliceNum);
+}
+
+// Worker entry point for launches with outer dimensions. Slices are claimed via an
+// atomic add on mtls->mSliceNum; per-slice state lives in a local copy of fep.
+static void walk_general(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    RsExpandKernelDriverInfo fep = mtls->fep;
+    fep.lid = idx;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+    while (1) {
+        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+        if (!SelectOuterSlice(mtls, &fep, slice)) {
+            return;
+        }
+        for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y; fep.current.y++) {
+            FepPtrSetup(mtls, &fep, mtls->start.x,
+                        fep.current.y, fep.current.z, fep.current.lod,
+                        (RsAllocationCubemapFace)fep.current.face,
+                        fep.current.array[0], fep.current.array[1],
+                        fep.current.array[2], fep.current.array[3]);
+            RsExpandKernelParams kparams;
+            kparamSetup(&kparams, &fep);
+            fn(&kparams, mtls->start.x, mtls->end.x, fep.outStride[0]);
+        }
+    }
+}
 
 static void walk_2d(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
 
     while (1) {
         uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
         uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-        yEnd = rsMin(yEnd, mtls->yEnd);
+        yEnd = rsMin(yEnd, mtls->end.y);
 
         if (yEnd <= yStart) {
             return;
         }
 
         for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
-            fepPtrSetup(mtls, &fep, mtls->xStart, fep.current.y);
+            FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
 
             RsExpandKernelParams kparams;
             kparamSetup(&kparams, &fep);
 
-            outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-            fn(&kparams, mtls->xStart, mtls->xEnd, fep.outStride[0]);
+            fn(&kparams, mtls->start.x, mtls->end.x, fep.outStride[0]);
         }
     }
 }
@@ -418,29 +476,28 @@
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsExpandKernelDriverInfo fep = mtls->fep;
     fep.lid = idx;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
 
     while (1) {
         uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
         uint32_t xEnd   = xStart + mtls->mSliceSize;
 
-        xEnd = rsMin(xEnd, mtls->xEnd);
+        xEnd = rsMin(xEnd, mtls->end.x);
 
         if (xEnd <= xStart) {
             return;
         }
 
-        fepPtrSetup(mtls, &fep, xStart, 0);
+        FepPtrSetup(mtls, &fep, xStart, 0);
 
         RsExpandKernelParams kparams;
         kparamSetup(&kparams, &fep);
 
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
         fn(&kparams, xStart, xEnd, fep.outStride[0]);
     }
 }
 
-
 void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation* aout,
@@ -449,11 +506,23 @@
 
     //android::StopWatch kernel_time("kernel time");
 
+    bool outerDims = (mtls->start.z != mtls->end.z) ||
+                     (mtls->start.face != mtls->end.face) ||
+                     (mtls->start.lod != mtls->end.lod) ||
+                     (mtls->start.array[0] != mtls->end.array[0]) ||
+                     (mtls->start.array[1] != mtls->end.array[1]) ||
+                     (mtls->start.array[2] != mtls->end.array[2]) ||
+                     (mtls->start.array[3] != mtls->end.array[3]);
+
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
 
-        if (mtls->fep.dim.y > 1) {
+        if (outerDims) {
+            // No fancy logic for chunk size
+            mtls->mSliceSize = 1;
+            launchThreads(walk_general, mtls);
+        } else if (mtls->fep.dim.y > 1) {
             uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
@@ -496,21 +565,23 @@
 
     } else {
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (uint32_t arrayIndex = mtls->arrayStart;
-             arrayIndex < mtls->arrayEnd; arrayIndex++) {
+        uint32_t slice = 0;
 
-            for (mtls->fep.current.z = mtls->zStart; mtls->fep.current.z < mtls->zEnd;
-                 mtls->fep.current.z++) {
 
-                for (mtls->fep.current.y = mtls->yStart; mtls->fep.current.y < mtls->yEnd;
-                     mtls->fep.current.y++) {
+        while(SelectOuterSlice(mtls, slice++)) {
+            for (mtls->fep.current.y = mtls->start.y;
+                 mtls->fep.current.y < mtls->end.y;
+                 mtls->fep.current.y++) {
 
-                    fepPtrSetup(mtls, &mtls->fep, mtls->xStart, mtls->fep.current.y, mtls->fep.current.z);
+                FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
+                            mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
+                            (RsAllocationCubemapFace) mtls->fep.current.face,
+                            mtls->fep.current.array[0], mtls->fep.current.array[1],
+                            mtls->fep.current.array[2], mtls->fep.current.array[3]);
 
-                    RsExpandKernelParams kparams;
-                    kparamSetup(&kparams, &mtls->fep);
-                    fn(&kparams, mtls->xStart, mtls->xEnd, mtls->fep.outStride[0]);
-                }
+                RsExpandKernelParams kparams;
+                kparamSetup(&kparams, &mtls->fep);
+                fn(&kparams, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
             }
         }
     }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index a42cef7..ff087b0 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -46,7 +46,7 @@
     uint32_t y;
     uint32_t z;
     uint32_t lod;
-    uint32_t faces;
+    uint32_t face;
     uint32_t array[4 /*make a define*/];
 };
 
@@ -134,18 +134,8 @@
     volatile int mSliceNum;
     bool isThreadable;
 
-    // origin of the launch
-    RsLaunchDimensions origin;
-
-    // TODO: convert to RsLaunchDimensions
-    uint32_t xStart;
-    uint32_t xEnd;
-    uint32_t yStart;
-    uint32_t yEnd;
-    uint32_t zStart;
-    uint32_t zEnd;
-    uint32_t arrayStart;
-    uint32_t arrayEnd;
+    RsLaunchDimensions start;
+    RsLaunchDimensions end;
 };
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 8437c99..1636369 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -97,16 +97,17 @@
 
     preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 
-    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
-    mtls.script = this;
-    mtls.fep.slot = slot;
+    if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
+        mtls.script = this;
+        mtls.fep.slot = slot;
 
-    mtls.kernel = (void (*)())mRootPtr;
-    mtls.fep.usr = this;
+        mtls.kernel = (void (*)())mRootPtr;
+        mtls.fep.usr = this;
 
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
+        RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+        mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+        mCtx->setTLS(oldTLS);
+    }
 
     postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 }
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index 7d614da..a4e3059 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -880,7 +880,7 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
                                         uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
@@ -898,7 +898,7 @@
 
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                          "rsForEach called with null in allocations");
-            return;
+            return false;
         }
     }
 
@@ -907,7 +907,7 @@
 
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                      "rsForEach called with null out allocations");
-        return;
+        return false;
     }
 
     if (inLen > 0) {
@@ -923,7 +923,7 @@
                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                   "Failed to launch kernel; dimensions of input and output allocations do not match.");
 
-                return;
+                return false;
             }
         }
 
@@ -937,7 +937,7 @@
     } else {
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
                                      "rsForEach called with null allocations");
-        return;
+        return false;
     }
 
     if (inLen > 0 && aout != nullptr) {
@@ -945,49 +945,70 @@
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
 
-            return;
+            return false;
         }
     }
 
     if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dim.x;
+        mtls->end.x = mtls->fep.dim.x;
     } else {
-        rsAssert(sc->xStart < mtls->fep.dim.x);
-        rsAssert(sc->xEnd <= mtls->fep.dim.x);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dim.x, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dim.x, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
+        mtls->start.x = rsMin(mtls->fep.dim.x, sc->xStart);
+        mtls->end.x = rsMin(mtls->fep.dim.x, sc->xEnd);
+        if (mtls->start.x >= mtls->end.x) return false;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dim.y;
+        mtls->end.y = mtls->fep.dim.y;
     } else {
-        rsAssert(sc->yStart < mtls->fep.dim.y);
-        rsAssert(sc->yEnd <= mtls->fep.dim.y);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dim.y, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dim.y, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
+        mtls->start.y = rsMin(mtls->fep.dim.y, sc->yStart);
+        mtls->end.y = rsMin(mtls->fep.dim.y, sc->yEnd);
+        if (mtls->start.y >= mtls->end.y) return false;
     }
 
     if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dim.z;
+        mtls->end.z = mtls->fep.dim.z;
     } else {
-        rsAssert(sc->zStart < mtls->fep.dim.z);
-        rsAssert(sc->zEnd <= mtls->fep.dim.z);
-        rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dim.z, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dim.z, sc->zEnd);
-        if (mtls->zStart >= mtls->zEnd) return;
+        mtls->start.z = rsMin(mtls->fep.dim.z, sc->zStart);
+        mtls->end.z = rsMin(mtls->fep.dim.z, sc->zEnd);
+        if (mtls->start.z >= mtls->end.z) return false;
     }
 
-    mtls->xEnd     = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd     = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+    if (!sc || (sc->arrayEnd == 0)) {
+        mtls->end.array[0] = mtls->fep.dim.array[0];
+    } else {
+        mtls->start.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayStart);
+        mtls->end.array[0] = rsMin(mtls->fep.dim.array[0], sc->arrayEnd);
+        if (mtls->start.array[0] >= mtls->end.array[0]) return false;
+    }
 
-    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
+    if (!sc || (sc->array2End == 0)) {
+        mtls->end.array[1] = mtls->fep.dim.array[1];
+    } else {
+        mtls->start.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2Start);
+        mtls->end.array[1] = rsMin(mtls->fep.dim.array[1], sc->array2End);
+        if (mtls->start.array[1] >= mtls->end.array[1]) return false;
+    }
+
+    if (!sc || (sc->array3End == 0)) {
+        mtls->end.array[2] = mtls->fep.dim.array[2];
+    } else {
+        mtls->start.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3Start);
+        mtls->end.array[2] = rsMin(mtls->fep.dim.array[2], sc->array3End);
+        if (mtls->start.array[2] >= mtls->end.array[2]) return false;
+    }
+
+    if (!sc || (sc->array4End == 0)) {
+        mtls->end.array[3] = mtls->fep.dim.array[3];
+    } else {
+        mtls->start.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4Start);
+        mtls->end.array[3] = rsMin(mtls->fep.dim.array[3], sc->array4End);
+        if (mtls->start.array[3] >= mtls->end.array[3]) return false;
+    }
+
+
+    // The X & Y walkers always want 0-1 min even if dim is not present
+    mtls->end.x    = rsMax((uint32_t)1, mtls->end.x);
+    mtls->end.y    = rsMax((uint32_t)1, mtls->end.y);
 
     mtls->rsc        = mCtx;
     if (ains) {
@@ -1013,6 +1034,9 @@
         mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
         mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
     }
+
+    // All validation passed, ok to launch threads
+    return true;
 }
 
 
@@ -1026,12 +1050,13 @@
 
     MTLaunchStruct mtls;
 
-    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
-    forEachKernelSetup(slot, &mtls);
+    if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
+        forEachKernelSetup(slot, &mtls);
 
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
+        RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+        mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+        mCtx->setTLS(oldTLS);
+    }
 }
 
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 43be88b..b84abb3 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -182,7 +182,7 @@
 
     const Script * getScript() {return mScript;}
 
-    void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+    bool forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                           Allocation * aout, const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
 
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 3d32a51..166c80d 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -223,13 +223,15 @@
                 ains  = const_cast<const Allocation**>(&ins[ct]);
             }
 
-            si->forEachMtlsSetup(ains, inLen, outs[ct], nullptr, 0, nullptr, &mtls);
+            bool launchOK = si->forEachMtlsSetup(ains, inLen, outs[ct], nullptr, 0, nullptr, &mtls);
 
             si->forEachKernelSetup(slot, &mtls);
             si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
                           mtls.fep.usrLen, nullptr);
 
-            mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+            if (launchOK) {
+                mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+            }
 
             si->postLaunch(slot, ains, inLen, outs[ct], nullptr, 0, nullptr);
         }
@@ -283,13 +285,14 @@
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
 
-        si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls);
+        if (si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls)) {
 
-        mtls.script = nullptr;
-        mtls.kernel = (void (*)())&scriptGroupRoot;
-        mtls.fep.usr = &sl;
+            mtls.script = nullptr;
+            mtls.kernel = (void (*)())&scriptGroupRoot;
+            mtls.fep.usr = &sl;
 
-        mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
+            mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
+        }
 
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 9dc4d90..52cd8a0 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -167,16 +167,17 @@
   const Closure* closure = cpuClosure->mClosure;
   MTLaunchStruct mtls;
 
-  cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
-                                    closure->mArgs.size(),
-                                    closure->mReturnValue,
-                                    nullptr, 0, nullptr, &mtls);
+  if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
+                                        closure->mArgs.size(),
+                                        closure->mReturnValue,
+                                        nullptr, 0, nullptr, &mtls)) {
 
-  mtls.script = nullptr;
-  mtls.kernel = (void (*)())&groupRoot;
-  mtls.fep.usr = &batch;
+      mtls.script = nullptr;
+      mtls.kernel = (void (*)())&groupRoot;
+      mtls.fep.usr = &batch;
 
-  mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+      mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+  }
 
   for (CPUClosure* cpuClosure : batch) {
     const Closure* closure = cpuClosure->mClosure;
diff --git a/rsDefines.h b/rsDefines.h
index 9345eb9..427be03 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -238,6 +238,12 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
+    uint32_t array2Start;
+    uint32_t array2End;
+    uint32_t array3Start;
+    uint32_t array3End;
+    uint32_t array4Start;
+    uint32_t array4End;
 
 } RsScriptCall;