Migrate thread launch to driver.

Change-Id: If182c524cceb327547640f22f956856d291d1787
diff --git a/libs/rs/driver/rsdBcc.cpp b/libs/rs/driver/rsdBcc.cpp
index 36a4b01..2038a4c 100644
--- a/libs/rs/driver/rsdBcc.cpp
+++ b/libs/rs/driver/rsdBcc.cpp
@@ -55,6 +55,15 @@
 
 };
 
+// Makes sc the calling thread's current script and returns the script it replaced.
+static Script * setTLS(Script *sc) {
+    ScriptTLSStruct *slot = static_cast<ScriptTLSStruct *>(pthread_getspecific(Context::gThreadTLSKey));
+    rsAssert(slot);
+    Script *previous = slot->mScript;
+    slot->mScript = sc;
+    return previous;
+}
+
 // Input: cacheDir
 // Input: resName
 // Input: extName
@@ -234,13 +243,215 @@
 
 }
 
+typedef struct {  // State of one forEach launch; the worker callbacks receive a pointer to it.
+    Context *rsc;
+    Script *script;  // script whose mHal.info.root is invoked per element
+    const Allocation * ain;
+    Allocation * aout;
+    const void * usr;  // passed through unchanged as the kernel's third argument
 
-int rsdScriptInvokeRoot(const Context *dc, const Script *script) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-    return drv->mRoot();
+    uint32_t mSliceSize;  // rows (wc_xy) or x elements (wc_x) taken per slice
+    volatile int mSliceNum;  // slice counter, bumped with android_atomic_inc by the workers
+
+    const uint8_t *ptrIn;  // ain->getPtr(), or NULL when there is no input
+    uint32_t eStrideIn;  // input element size in bytes
+    uint8_t *ptrOut;  // aout->getPtr(), or NULL when there is no output
+    uint32_t eStrideOut;  // output element size in bytes
+
+    uint32_t xStart;  // iteration bounds: each range is [start, end)
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+
+    uint32_t dimX;  // dimensions taken from ain's type, or aout's when ain is NULL
+    uint32_t dimY;
+    uint32_t dimZ;
+    uint32_t dimArray;
+} MTLaunchStruct;
+typedef int (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);  // (in, out, usr, x, y, z, ar)
+
+// Worker callback for 2D launches: processes the launch in slices of mSliceSize rows.
+static void wc_xy(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    while (1) {
+        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd = yStart + mtls->mSliceSize;
+        yEnd = rsMin(yEnd, mtls->yEnd);
+        if (yEnd <= yStart) {
+            return;
+        }
+
+        for (uint32_t y = yStart; y < yEnd; y++) {
+            // Start at column xStart so the pointers address the same element as the
+            // (x, y) coordinates handed to the kernel; wc_x offsets by xStart likewise.
+            uint32_t offset = mtls->dimX * y + mtls->xStart;
+            uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * offset);
+            const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * offset);
+
+            for (uint32_t x = mtls->xStart; x < mtls->xEnd; x++) {
+                ((rs_t)mtls->script->mHal.info.root) (xPtrIn, xPtrOut, mtls->usr, x, y, 0, 0);
+                xPtrIn += mtls->eStrideIn;
+                xPtrOut += mtls->eStrideOut;
+            }
+        }
+    }
 }
 
-void rsdScriptInvokeInit(const Context *dc, const Script *script) {
+// Worker callback for 1D launches: processes the x range in slices of mSliceSize elements.
+static void wc_x(void *usr, uint32_t idx) {
+    MTLaunchStruct *launch = (MTLaunchStruct *)usr;
+
+    // Each pass bumps the shared slice counter to pick a slice; the worker returns
+    // as soon as the slice it picked, clamped to xEnd, turns out to be empty.
+    for (;;) {
+        const uint32_t claimed = (uint32_t)android_atomic_inc(&launch->mSliceNum);
+        const uint32_t first = launch->xStart + claimed * launch->mSliceSize;
+        const uint32_t last = rsMin(first + launch->mSliceSize, launch->xEnd);
+        if (last <= first) {
+            return;
+        }
+
+        uint8_t *outCursor = launch->ptrOut + (launch->eStrideOut * first);
+        const uint8_t *inCursor = launch->ptrIn + (launch->eStrideIn * first);
+        for (uint32_t x = first; x < last; x++) {
+            ((rs_t)launch->script->mHal.info.root) (inCursor, outCursor, launch->usr, x, 0, 0, 0);
+            inCursor += launch->eStrideIn;
+            outCursor += launch->eStrideOut;
+        }
+    }
+}
+
+// Driver side of rsForEach: runs the script's root over the requested x/y range of ain/aout.
+void rsdScriptInvokeForEach(const Context *rsc,
+                            Script *s,
+                            const Allocation * ain,
+                            Allocation * aout,
+                            const void * usr,
+                            uint32_t usrLen,
+                            const RsScriptCall *sc) {
+    RsHal * dc = (RsHal *)rsc->mHal.drv;
+
+    MTLaunchStruct mtls;
+    memset(&mtls, 0, sizeof(mtls));
+
+    if (ain) {
+        mtls.dimX = ain->getType()->getDimX();
+        mtls.dimY = ain->getType()->getDimY();
+        mtls.dimZ = ain->getType()->getDimZ();
+        //mtls.dimArray = ain->getType()->getDimArray();
+    } else if (aout) {
+        mtls.dimX = aout->getType()->getDimX();
+        mtls.dimY = aout->getType()->getDimY();
+        mtls.dimZ = aout->getType()->getDimZ();
+        //mtls.dimArray = aout->getType()->getDimArray();
+    } else {
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+
+    if (!sc || (sc->xEnd == 0)) {
+        mtls.xEnd = mtls.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls.dimX);
+        rsAssert(sc->xEnd <= mtls.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls.xStart = rsMin(mtls.dimX, sc->xStart);
+        mtls.xEnd = rsMin(mtls.dimX, sc->xEnd);
+        if (mtls.xStart >= mtls.xEnd) return;
+    }
+
+    if (!sc || (sc->yEnd == 0)) {
+        mtls.yEnd = mtls.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls.dimY);
+        rsAssert(sc->yEnd <= mtls.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls.yStart = rsMin(mtls.dimY, sc->yStart);
+        mtls.yEnd = rsMin(mtls.dimY, sc->yEnd);
+        if (mtls.yStart >= mtls.yEnd) return;
+    }
+
+    mtls.xEnd = rsMax((uint32_t)1, mtls.xEnd);
+    mtls.yEnd = rsMax((uint32_t)1, mtls.yEnd);
+    mtls.zEnd = rsMax((uint32_t)1, mtls.zEnd);
+    mtls.arrayEnd = rsMax((uint32_t)1, mtls.arrayEnd);
+    // ain may be NULL for output-only launches, so only inspect it when present.
+    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
+
+    Context *mrsc = (Context *)rsc;
+    Script * oldTLS = setTLS(s);
+
+    mtls.rsc = mrsc;
+    mtls.ain = ain;
+    mtls.aout = aout;
+    mtls.script = s;
+    mtls.usr = usr;
+    mtls.mSliceSize = 10;
+    mtls.mSliceNum = 0;
+
+    mtls.ptrIn = NULL;
+    mtls.eStrideIn = 0;
+    if (ain) {
+        mtls.ptrIn = (const uint8_t *)ain->getPtr();
+        mtls.eStrideIn = ain->getType()->getElementSizeBytes();
+    }
+
+    mtls.ptrOut = NULL;
+    mtls.eStrideOut = 0;
+    if (aout) {
+        mtls.ptrOut = (uint8_t *)aout->getPtr();
+        mtls.eStrideOut = aout->getType()->getElementSizeBytes();
+    }
+
+    if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable) {
+        // Threaded path: helpers pull slices of rows (2D) or of x elements (1D).
+        if (mtls.dimY > 1) {
+            rsdLaunchThreads(mrsc, wc_xy, &mtls);
+        } else {
+            rsdLaunchThreads(mrsc, wc_x, &mtls);
+        }
+    } else {
+        // Serial path: walk the whole range on the calling thread.
+        for (uint32_t ar = mtls.arrayStart; ar < mtls.arrayEnd; ar++) {
+            for (uint32_t z = mtls.zStart; z < mtls.zEnd; z++) {
+                for (uint32_t y = mtls.yStart; y < mtls.yEnd; y++) {
+                    // Include xStart so the pointers match the x coordinate passed below.
+                    uint32_t offset = mtls.dimX * mtls.dimY * mtls.dimZ * ar +
+                                      mtls.dimX * mtls.dimY * z +
+                                      mtls.dimX * y + mtls.xStart;
+                    uint8_t *xPtrOut = mtls.ptrOut + (mtls.eStrideOut * offset);
+                    const uint8_t *xPtrIn = mtls.ptrIn + (mtls.eStrideIn * offset);
+
+                    for (uint32_t x = mtls.xStart; x < mtls.xEnd; x++) {
+                        ((rs_t)s->mHal.info.root) (xPtrIn, xPtrOut, usr, x, y, z, ar);
+                        xPtrIn += mtls.eStrideIn;
+                        xPtrOut += mtls.eStrideOut;
+                    }
+                }
+            }
+        }
+    }
+
+    setTLS(oldTLS);
+}
+
+
+// Runs the script's root function with `script` installed as the thread's current script.
+int rsdScriptInvokeRoot(const Context *dc, Script *script) {
+    DrvScript *drvScript = (DrvScript *)script->mHal.drv;
+    Script *previous = setTLS(script);
+    const int result = drvScript->mRoot();
+    // Put back whichever script was current before the call.
+    setTLS(previous);
+    return result;
+}
+
+void rsdScriptInvokeInit(const Context *dc, Script *script) {
     DrvScript *drv = (DrvScript *)script->mHal.drv;
 
     if (drv->mInit) {
@@ -249,15 +460,17 @@
 }
 
 
-void rsdScriptInvokeFunction(const Context *dc, const Script *script,
+void rsdScriptInvokeFunction(const Context *dc, Script *script,
                             uint32_t slot,
                             const void *params,
                             size_t paramLength) {
     DrvScript *drv = (DrvScript *)script->mHal.drv;
     //LOGE("invoke %p %p %i %p %i", dc, script, slot, params, paramLength);
 
+    Script * oldTLS = setTLS(script);
     ((void (*)(const void *, uint32_t))
         drv->mInvokeFunctions[slot])(params, paramLength);
+    setTLS(oldTLS);
 }
 
 void rsdScriptSetGlobalVar(const Context *dc, const Script *script,
diff --git a/libs/rs/driver/rsdBcc.h b/libs/rs/driver/rsdBcc.h
index f697f29..6723a36 100644
--- a/libs/rs/driver/rsdBcc.h
+++ b/libs/rs/driver/rsdBcc.h
@@ -25,14 +25,23 @@
                    uint8_t const *bitcode, size_t bitcodeSize,
                    uint32_t flags, android::renderscript::RsHalSymbolLookupFunc lookupFunc);
 void rsdScriptInvokeFunction(const android::renderscript::Context *dc,
-                             const android::renderscript::Script *script,
+                             android::renderscript::Script *script,
                              uint32_t slot,
                              const void *params,
                              size_t paramLength);
+
+void rsdScriptInvokeForEach(const android::renderscript::Context *rsc,
+                            android::renderscript::Script *s,
+                            const android::renderscript::Allocation * ain,
+                            android::renderscript::Allocation * aout,
+                            const void * usr,
+                            uint32_t usrLen,
+                            const RsScriptCall *sc);
+
 int rsdScriptInvokeRoot(const android::renderscript::Context *dc,
-                        const android::renderscript::Script *script);
+                        android::renderscript::Script *script);
 void rsdScriptInvokeInit(const android::renderscript::Context *dc,
-                         const android::renderscript::Script *script);
+                         android::renderscript::Script *script);
 
 void rsdScriptSetGlobalVar(const android::renderscript::Context *,
                            const android::renderscript::Script *,
diff --git a/libs/rs/driver/rsdCore.cpp b/libs/rs/driver/rsdCore.cpp
index 79fcab5..bb6cce9 100644
--- a/libs/rs/driver/rsdCore.cpp
+++ b/libs/rs/driver/rsdCore.cpp
@@ -20,16 +20,29 @@
 #include <malloc.h>
 #include "rsContext.h"
 
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sched.h>
+#include <cutils/properties.h>
+#include <cutils/sched_policy.h>
+#include <sys/syscall.h>
+#include <string.h>
+
 using namespace android;
 using namespace android::renderscript;
 
+static void Shutdown(Context *rsc);
+static void SetPriority(const Context *rsc, int32_t priority);
+
 static RsdHalFunctions FunctionTable = {
+    Shutdown,
     NULL,
-    NULL,
+    SetPriority,
     {
         rsdScriptInit,
         rsdScriptInvokeFunction,
         rsdScriptInvokeRoot,
+        rsdScriptInvokeForEach,
         rsdScriptInvokeInit,
         rsdScriptSetGlobalVar,
         rsdScriptSetGlobalBind,
@@ -39,16 +52,134 @@
 };
 
 
+
+// Body of each helper thread: registers itself, then serves launches until dc->mExit is set.
+static void * HelperThreadProc(void *vrsc) {
+    Context *rsc = static_cast<Context *>(vrsc);
+    RsHal *dc = (RsHal *)rsc->mHal.drv;
+
+    // Take the next launch index; it selects this thread's slot in the per-worker arrays.
+    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
+
+    dc->mWorkers.mLaunchSignals[idx].init();
+    dc->mWorkers.mNativeThreadId[idx] = gettid();
+
+#if 0
+    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
+    cpu_set_t cpuset;
+    memset(&cpuset, 0, sizeof(cpuset));
+    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
+    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
+              sizeof(cpuset), &cpuset);
+    LOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
+#endif
+
+    int status = pthread_setspecific(rsc->gThreadTLSKey, rsc->mTlsStruct);
+    if (status) {
+        LOGE("pthread_setspecific %i", status);
+    }
+
+    // Per wake-up: run the launch callback if one is set, then report completion.
+    while (!dc->mExit) {
+        dc->mWorkers.mLaunchSignals[idx].wait();
+        if (dc->mWorkers.mLaunchCallback) {
+           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx);
+        }
+        android_atomic_dec(&dc->mWorkers.mRunningCount);
+        dc->mWorkers.mCompleteSignal.set();
+    }
+
+    //LOGV("RS helperThread exited %p idx=%i", rsc, idx);
+    return NULL;
+}
+
+// Wakes every helper thread to run cbk(data, idx), then blocks until all of them report done.
+void rsdLaunchThreads(Context *rsc, WorkerCallback_t cbk, void *data) {
+    RsHal *hal = (RsHal *)rsc->mHal.drv;
+    hal->mWorkers.mLaunchData = data;
+    hal->mWorkers.mLaunchCallback = cbk;
+    android_atomic_release_store(hal->mWorkers.mCount, &hal->mWorkers.mRunningCount);
+    for (uint32_t worker = 0; worker < hal->mWorkers.mCount; ++worker) {
+        hal->mWorkers.mLaunchSignals[worker].set();
+    }
+    while (android_atomic_acquire_load(&hal->mWorkers.mRunningCount) != 0) {
+        hal->mWorkers.mCompleteSignal.wait();
+    }
+}
+
 bool rsdHalInit(Context *rsc, uint32_t version_major, uint32_t version_minor) {
     rsc->mHal.funcs = FunctionTable;
 
-    /*
-    rsc->mHal.drv = (RsHal *)calloc(1, sizeof(RsHal));
+    RsHal *dc = (RsHal *)calloc(1, sizeof(RsHal));
     if (!rsc->mHal.drv) {
         return false;
     }
-    */
+    rsc->mHal.drv = dc;
 
+
+    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    LOGV("RS Launching thread(s), reported CPU count %i", cpu);
+    if (cpu < 2) cpu = 0;
+
+    dc->mWorkers.mCount = (uint32_t)cpu;
+    dc->mWorkers.mThreadId = (pthread_t *) calloc(dc->mWorkers.mCount, sizeof(pthread_t));
+    dc->mWorkers.mNativeThreadId = (pid_t *) calloc(dc->mWorkers.mCount, sizeof(pid_t));
+    dc->mWorkers.mLaunchSignals = new Signal[dc->mWorkers.mCount];
+    dc->mWorkers.mLaunchCallback = NULL;
+
+    dc->mWorkers.mCompleteSignal.init();
+
+    android_atomic_release_store(dc->mWorkers.mCount, &dc->mWorkers.mRunningCount);
+    android_atomic_release_store(0, &dc->mWorkers.mLaunchCount);
+
+    int status;
+    pthread_attr_t threadAttr;
+    status = pthread_attr_init(&threadAttr);
+    if (status) {
+        LOGE("Failed to init thread attribute.");
+        return false;
+    }
+
+    for (uint32_t ct=0; ct < dc->mWorkers.mCount; ct++) {
+        status = pthread_create(&dc->mWorkers.mThreadId[ct], &threadAttr, HelperThreadProc, rsc);
+        if (status) {
+            dc->mWorkers.mCount = ct;
+            LOGE("Created fewer than expected number of RS threads.");
+            break;
+        }
+    }
+    while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) {
+        usleep(100);
+    }
+
+    pthread_attr_destroy(&threadAttr);
     return true;
 }
 
+// Applies `priority` to each helper thread through its recorded native thread id.
+void SetPriority(const Context *rsc, int32_t priority) {
+    RsHal *hal = (RsHal *)rsc->mHal.drv;
+    for (uint32_t worker = 0; worker < hal->mWorkers.mCount; ++worker) {
+        setpriority(PRIO_PROCESS, hal->mWorkers.mNativeThreadId[worker], priority);
+    }
+}
+
+// Stops the helper threads: raises mExit, wakes every worker with no callback, then joins them.
+void Shutdown(Context *rsc) {
+    RsHal *hal = (RsHal *)rsc->mHal.drv;
+
+    hal->mExit = true;
+    hal->mWorkers.mLaunchData = NULL;
+    hal->mWorkers.mLaunchCallback = NULL;
+    android_atomic_release_store(hal->mWorkers.mCount, &hal->mWorkers.mRunningCount);
+    for (uint32_t worker = 0; worker < hal->mWorkers.mCount; ++worker) {
+        hal->mWorkers.mLaunchSignals[worker].set();
+    }
+    // Each woken worker decrements mRunningCount before leaving its loop, hence the assert below.
+    for (uint32_t worker = 0; worker < hal->mWorkers.mCount; ++worker) {
+        pthread_join(hal->mWorkers.mThreadId[worker], NULL);
+    }
+    rsAssert(android_atomic_acquire_load(&hal->mWorkers.mRunningCount) == 0);
+}
+
+
diff --git a/libs/rs/driver/rsdCore.h b/libs/rs/driver/rsdCore.h
index 78596a1..02b2fbc 100644
--- a/libs/rs/driver/rsdCore.h
+++ b/libs/rs/driver/rsdCore.h
@@ -20,28 +20,36 @@
 #include <rs_hal.h>
 #include <bcc/bcc.h>
 
-typedef void (* InvokeFunc_t)(void);
+#include "rsMutex.h"
+#include "rsSignal.h"
 
-struct RsHalRec {
+
+typedef void (* InvokeFunc_t)(void);
+typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
+
+typedef struct RsHalRec {  // Driver state that rsdHalInit stores in Context::mHal.drv
     uint32_t version_major;
     uint32_t version_minor;
-};
 
-struct RsHalProgramStoreRec {
-};
+    struct Workers {
+        volatile int mRunningCount;  // set to mCount per launch; each worker decrements it when done
+        volatile int mLaunchCount;  // bumped by each helper thread at startup to obtain its index
+        uint32_t mCount;  // number of helper threads
+        pthread_t *mThreadId;  // pthread handle per worker, used to join at shutdown
+        pid_t *mNativeThreadId;  // gettid() value per worker, used by SetPriority
+        android::renderscript::Signal mCompleteSignal;  // set by a worker after each callback
 
-struct RsHalProgramRasterRec {
-};
-
-struct RsHalProgramVertexRec {
-};
-
-struct RsHalProgramFragmentRec {
-
-};
+        android::renderscript::Signal *mLaunchSignals;  // one per worker; set to wake that worker
+        WorkerCallback_t mLaunchCallback;  // function the workers run for the current launch
+        void *mLaunchData;  // first argument passed to mLaunchCallback
+    };
+    Workers mWorkers;
+    bool mExit;  // set by Shutdown so the worker loops end
+} RsHal;
 
 
 
+void rsdLaunchThreads(android::renderscript::Context *rsc, WorkerCallback_t cbk, void *data);
 
 #endif
 
diff --git a/libs/rs/rsContext.cpp b/libs/rs/rsContext.cpp
index 7dc26d2..339a773 100644
--- a/libs/rs/rsContext.cpp
+++ b/libs/rs/rsContext.cpp
@@ -554,56 +554,6 @@
     mExit = true;
 }
 
-void * Context::helperThreadProc(void *vrsc) {
-     Context *rsc = static_cast<Context *>(vrsc);
-     uint32_t idx = (uint32_t)android_atomic_inc(&rsc->mWorkers.mLaunchCount);
-
-     //LOGV("RS helperThread starting %p idx=%i", rsc, idx);
-
-     rsc->mWorkers.mLaunchSignals[idx].init();
-     rsc->mWorkers.mNativeThreadId[idx] = gettid();
-
-#if 0
-     typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
-     cpu_set_t cpuset;
-     memset(&cpuset, 0, sizeof(cpuset));
-     cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
-     int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
-               sizeof(cpuset), &cpuset);
-     LOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
-#endif
-
-     setpriority(PRIO_PROCESS, rsc->mWorkers.mNativeThreadId[idx], rsc->mThreadPriority);
-     int status = pthread_setspecific(rsc->gThreadTLSKey, rsc->mTlsStruct);
-     if (status) {
-         LOGE("pthread_setspecific %i", status);
-     }
-
-     while (!rsc->mExit) {
-         rsc->mWorkers.mLaunchSignals[idx].wait();
-         if (rsc->mWorkers.mLaunchCallback) {
-            rsc->mWorkers.mLaunchCallback(rsc->mWorkers.mLaunchData, idx);
-         }
-         android_atomic_dec(&rsc->mWorkers.mRunningCount);
-         rsc->mWorkers.mCompleteSignal.set();
-     }
-
-     //LOGV("RS helperThread exited %p idx=%i", rsc, idx);
-     return NULL;
-}
-
-void Context::launchThreads(WorkerCallback_t cbk, void *data) {
-    mWorkers.mLaunchData = data;
-    mWorkers.mLaunchCallback = cbk;
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
-    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
-        mWorkers.mLaunchSignals[ct].set();
-    }
-    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
-        mWorkers.mCompleteSignal.wait();
-    }
-}
-
 void Context::setPriority(int32_t p) {
     // Note: If we put this in the proper "background" policy
     // the wallpapers can become completly unresponsive at times.
@@ -620,9 +570,6 @@
     }
 #else
     setpriority(PRIO_PROCESS, mNativeThreadId, p);
-    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
-        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], p);
-    }
 #endif
 }
 
@@ -691,16 +638,8 @@
     if (!rsdHalInit(this, 0, 0)) {
         return false;
     }
+    mHal.funcs.setPriority(this, mThreadPriority);
 
-    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
-    LOGV("RS Launching thread(s), reported CPU count %i", cpu);
-    if (cpu < 2) cpu = 0;
-
-    mWorkers.mCount = (uint32_t)cpu;
-    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
-    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
-    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
-    mWorkers.mLaunchCallback = NULL;
     status = pthread_create(&mThreadId, &threadAttr, threadProc, this);
     if (status) {
         LOGE("Failed to start rs context thread.");
@@ -714,20 +653,6 @@
         return false;
     }
 
-    mWorkers.mCompleteSignal.init();
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
-    android_atomic_release_store(0, &mWorkers.mLaunchCount);
-    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
-        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
-        if (status) {
-            mWorkers.mCount = ct;
-            LOGE("Created fewer than expected number of RS threads.");
-            break;
-        }
-    }
-    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
-        usleep(100);
-    }
     pthread_attr_destroy(&threadAttr);
     return true;
 }
@@ -744,17 +669,10 @@
     mIO.shutdown();
     int status = pthread_join(mThreadId, &res);
 
-    // Cleanup compute threads.
-    mWorkers.mLaunchData = NULL;
-    mWorkers.mLaunchCallback = NULL;
-    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
-    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
-        mWorkers.mLaunchSignals[ct].set();
+
+    if (mHal.funcs.shutdownDriver) {
+        mHal.funcs.shutdownDriver(this);
     }
-    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
-        status = pthread_join(mWorkers.mThreadId[ct], &res);
-    }
-    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);
 
     // Global structure cleanup.
     pthread_mutex_lock(&gInitMutex);
diff --git a/libs/rs/rsContext.h b/libs/rs/rsContext.h
index dee16d6..72574a60 100644
--- a/libs/rs/rsContext.h
+++ b/libs/rs/rsContext.h
@@ -43,7 +43,6 @@
 #include "rsgApiStructs.h"
 #include "rsLocklessFifo.h"
 
-
 #include <ui/egl/android_natives.h>
 #endif // ANDROID_RS_SERIALIZE
 
@@ -91,15 +90,6 @@
     // Library mutex (for providing thread-safe calls from the runtime)
     static pthread_mutex_t gLibMutex;
 
-    struct ScriptTLSStruct {
-        Context * mContext;
-        Script * mScript;
-    };
-
-    //const RsHalComputeFunctions *mHalComputeFuncs;
-    //const RsHalGraphicsFunctions *mHalGraphicsFuncs;
-    //RsHal *mHal;
-
     class PushState {
     public:
         PushState(Context *);
@@ -117,9 +107,6 @@
     ScriptTLSStruct *mTlsStruct;
     RsSurfaceConfig mUserSurfaceConfig;
 
-    typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
-
-    //StructuredAllocationContext mStateAllocation;
     ElementState mStateElement;
     TypeState mStateType;
     SamplerState mStateSampler;
@@ -230,8 +217,6 @@
     uint32_t getMaxVertexUniformVectors() const {return mGL.mMaxVertexUniformVectors;}
     uint32_t getMaxVertexAttributes() const {return mGL.mMaxVertexAttribs;}
 
-    void launchThreads(WorkerCallback_t cbk, void *data);
-    uint32_t getWorkerPoolSize() const {return (uint32_t)mWorkers.mCount;}
     uint32_t getDPI() const {return mDPI;}
     void setDPI(uint32_t dpi) {mDPI = dpi;}
 
@@ -288,20 +273,6 @@
     pthread_t mThreadId;
     pid_t mNativeThreadId;
 
-    struct Workers {
-        volatile int mRunningCount;
-        volatile int mLaunchCount;
-        uint32_t mCount;
-        pthread_t *mThreadId;
-        pid_t *mNativeThreadId;
-        Signal mCompleteSignal;
-
-        Signal *mLaunchSignals;
-        WorkerCallback_t mLaunchCallback;
-        void *mLaunchData;
-    };
-    Workers mWorkers;
-
     ObjectBaseRef<Script> mRootScript;
     ObjectBaseRef<ProgramFragment> mFragment;
     ObjectBaseRef<ProgramVertex> mVertex;
diff --git a/libs/rs/rsScriptC.cpp b/libs/rs/rsScriptC.cpp
index f99534f..d5c486b 100644
--- a/libs/rs/rsScriptC.cpp
+++ b/libs/rs/rsScriptC.cpp
@@ -38,9 +38,6 @@
 
 ScriptC::~ScriptC() {
     mRSC->mHal.funcs.script.destroy(mRSC, this);
-
-    //free(mEnviroment.mScriptText);
-    //mEnviroment.mScriptText = NULL;
 }
 
 void ScriptC::setupScript(Context *rsc) {
@@ -79,15 +76,6 @@
     return NULL;
 }
 
-Script * ScriptC::setTLS(Script *sc) {
-    Context::ScriptTLSStruct * tls = (Context::ScriptTLSStruct *)
-                                  pthread_getspecific(Context::gThreadTLSKey);
-    rsAssert(tls);
-    Script *old = tls->mScript;
-    tls->mScript = sc;
-    return old;
-}
-
 void ScriptC::setupGLState(Context *rsc) {
     if (mEnviroment.mFragmentStore.get()) {
         rsc->setProgramStore(mEnviroment.mFragmentStore.get());
@@ -113,215 +101,32 @@
     setupScript(rsc);
 
     uint32_t ret = 0;
-    Script * oldTLS = setTLS(this);
 
     if (rsc->props.mLogScripts) {
         LOGV("%p ScriptC::run invoking root,  ptr %p", rsc, mHal.info.root);
     }
 
-    ret = mHal.info.root();
+    ret = rsc->mHal.funcs.script.invokeRoot(rsc, this);
 
     if (rsc->props.mLogScripts) {
         LOGV("%p ScriptC::run invoking complete, ret=%i", rsc, ret);
     }
 
-    setTLS(oldTLS);
     return ret;
 }
 
-typedef struct {
-    Context *rsc;
-    ScriptC *script;
-    const Allocation * ain;
-    Allocation * aout;
-    const void * usr;
-
-    uint32_t mSliceSize;
-    volatile int mSliceNum;
-
-    const uint8_t *ptrIn;
-    uint32_t eStrideIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideOut;
-
-    uint32_t xStart;
-    uint32_t xEnd;
-    uint32_t yStart;
-    uint32_t yEnd;
-    uint32_t zStart;
-    uint32_t zEnd;
-    uint32_t arrayStart;
-    uint32_t arrayEnd;
-
-    uint32_t dimX;
-    uint32_t dimY;
-    uint32_t dimZ;
-    uint32_t dimArray;
-} MTLaunchStruct;
-typedef int (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-
-static void wc_xy(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
-    while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd = yStart + mtls->mSliceSize;
-        yEnd = rsMin(yEnd, mtls->yEnd);
-        if (yEnd <= yStart) {
-            return;
-        }
-
-        //LOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //LOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
-        for (uint32_t y = yStart; y < yEnd; y++) {
-            uint32_t offset = mtls->dimX * y;
-            uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * offset);
-            const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * offset);
-
-            for (uint32_t x = mtls->xStart; x < mtls->xEnd; x++) {
-                ((rs_t)mtls->script->mHal.info.root) (xPtrIn, xPtrOut, mtls->usr, x, y, 0, 0);
-                xPtrIn += mtls->eStrideIn;
-                xPtrOut += mtls->eStrideOut;
-            }
-        }
-    }
-}
-
-static void wc_x(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-
-    while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd = xStart + mtls->mSliceSize;
-        xEnd = rsMin(xEnd, mtls->xEnd);
-        if (xEnd <= xStart) {
-            return;
-        }
-
-        //LOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //LOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
-        uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * xStart);
-        const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * xStart);
-        for (uint32_t x = xStart; x < xEnd; x++) {
-            ((rs_t)mtls->script->mHal.info.root) (xPtrIn, xPtrOut, mtls->usr, x, 0, 0, 0);
-            xPtrIn += mtls->eStrideIn;
-            xPtrOut += mtls->eStrideOut;
-        }
-    }
-}
 
 void ScriptC::runForEach(Context *rsc,
                          const Allocation * ain,
                          Allocation * aout,
                          const void * usr,
                          const RsScriptCall *sc) {
-    MTLaunchStruct mtls;
-    memset(&mtls, 0, sizeof(mtls));
+
     Context::PushState ps(rsc);
 
-
-    if (ain) {
-        mtls.dimX = ain->getType()->getDimX();
-        mtls.dimY = ain->getType()->getDimY();
-        mtls.dimZ = ain->getType()->getDimZ();
-        //mtls.dimArray = ain->getType()->getDimArray();
-    } else if (aout) {
-        mtls.dimX = aout->getType()->getDimX();
-        mtls.dimY = aout->getType()->getDimY();
-        mtls.dimZ = aout->getType()->getDimZ();
-        //mtls.dimArray = aout->getType()->getDimArray();
-    } else {
-        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
-
-    if (!sc || (sc->xEnd == 0)) {
-        mtls.xEnd = mtls.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls.dimX);
-        rsAssert(sc->xEnd <= mtls.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls.xStart = rsMin(mtls.dimX, sc->xStart);
-        mtls.xEnd = rsMin(mtls.dimX, sc->xEnd);
-        if (mtls.xStart >= mtls.xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls.yEnd = mtls.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls.dimY);
-        rsAssert(sc->yEnd <= mtls.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls.yStart = rsMin(mtls.dimY, sc->yStart);
-        mtls.yEnd = rsMin(mtls.dimY, sc->yEnd);
-        if (mtls.yStart >= mtls.yEnd) return;
-    }
-
-    mtls.xEnd = rsMax((uint32_t)1, mtls.xEnd);
-    mtls.yEnd = rsMax((uint32_t)1, mtls.yEnd);
-    mtls.zEnd = rsMax((uint32_t)1, mtls.zEnd);
-    mtls.arrayEnd = rsMax((uint32_t)1, mtls.arrayEnd);
-
-    rsAssert(ain->getType()->getDimZ() == 0);
-
     setupGLState(rsc);
     setupScript(rsc);
-    Script * oldTLS = setTLS(this);
-
-    mtls.rsc = rsc;
-    mtls.ain = ain;
-    mtls.aout = aout;
-    mtls.script = this;
-    mtls.usr = usr;
-    mtls.mSliceSize = 10;
-    mtls.mSliceNum = 0;
-
-    mtls.ptrIn = NULL;
-    mtls.eStrideIn = 0;
-    if (ain) {
-        mtls.ptrIn = (const uint8_t *)ain->getPtr();
-        mtls.eStrideIn = ain->getType()->getElementSizeBytes();
-    }
-
-    mtls.ptrOut = NULL;
-    mtls.eStrideOut = 0;
-    if (aout) {
-        mtls.ptrOut = (uint8_t *)aout->getPtr();
-        mtls.eStrideOut = aout->getType()->getElementSizeBytes();
-    }
-
-    if ((rsc->getWorkerPoolSize() > 1) && mHal.info.isThreadable) {
-        if (mtls.dimY > 1) {
-            rsc->launchThreads(wc_xy, &mtls);
-        } else {
-            rsc->launchThreads(wc_x, &mtls);
-        }
-
-        //LOGE("launch 1");
-    } else {
-        //LOGE("launch 3");
-        for (uint32_t ar = mtls.arrayStart; ar < mtls.arrayEnd; ar++) {
-            for (uint32_t z = mtls.zStart; z < mtls.zEnd; z++) {
-                for (uint32_t y = mtls.yStart; y < mtls.yEnd; y++) {
-                    uint32_t offset = mtls.dimX * mtls.dimY * mtls.dimZ * ar +
-                                      mtls.dimX * mtls.dimY * z +
-                                      mtls.dimX * y;
-                    uint8_t *xPtrOut = mtls.ptrOut + (mtls.eStrideOut * offset);
-                    const uint8_t *xPtrIn = mtls.ptrIn + (mtls.eStrideIn * offset);
-
-                    for (uint32_t x = mtls.xStart; x < mtls.xEnd; x++) {
-                        ((rs_t)mHal.info.root) (xPtrIn, xPtrOut, usr, x, y, z, ar);
-                        xPtrIn += mtls.eStrideIn;
-                        xPtrOut += mtls.eStrideOut;
-                    }
-                }
-            }
-        }
-    }
-
-    setTLS(oldTLS);
+    rsc->mHal.funcs.script.invokeForEach(rsc, this, ain, aout, usr, 0, sc);
 }
 
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, uint32_t len) {
@@ -330,14 +135,11 @@
         return;
     }
     setupScript(rsc);
-    Script * oldTLS = setTLS(this);
 
     if (rsc->props.mLogScripts) {
         LOGV("%p ScriptC::Invoke invoking slot %i,  ptr %p", rsc, slot, this);
     }
     rsc->mHal.funcs.script.invokeFunction(rsc, this, slot, data, len);
-
-    setTLS(oldTLS);
 }
 
 ScriptCState::ScriptCState() {
diff --git a/libs/rs/rsScriptC.h b/libs/rs/rsScriptC.h
index da5cb2b..2edeb9b 100644
--- a/libs/rs/rsScriptC.h
+++ b/libs/rs/rsScriptC.h
@@ -56,7 +56,7 @@
     bool runCompiler(Context *rsc, const char *resName, const char *cacheDir,
                      const uint8_t *bitcode, size_t bitcodeLen);
 
-protected:
+//protected:
     void setupScript(Context *);
     void setupGLState(Context *);
     Script * setTLS(Script *);
diff --git a/libs/rs/rsScriptC_Lib.cpp b/libs/rs/rsScriptC_Lib.cpp
index 8095f5a..4e8cbdc 100644
--- a/libs/rs/rsScriptC_Lib.cpp
+++ b/libs/rs/rsScriptC_Lib.cpp
@@ -25,8 +25,8 @@
 using namespace android;
 using namespace android::renderscript;
 
-#define GET_TLS()  Context::ScriptTLSStruct * tls = \
-    (Context::ScriptTLSStruct *)pthread_getspecific(Context::gThreadTLSKey); \
+#define GET_TLS()  ScriptTLSStruct * tls = \
+    (ScriptTLSStruct *)pthread_getspecific(Context::gThreadTLSKey); \
     Context * rsc = tls->mContext; \
     ScriptC * sc = (ScriptC *) tls->mScript
 
@@ -1006,6 +1006,3 @@
     return NULL;
 }
 
-
-
-
diff --git a/libs/rs/rsScriptC_LibGL.cpp b/libs/rs/rsScriptC_LibGL.cpp
index 15426bc..4047049 100644
--- a/libs/rs/rsScriptC_LibGL.cpp
+++ b/libs/rs/rsScriptC_LibGL.cpp
@@ -32,8 +32,8 @@
 using namespace android;
 using namespace android::renderscript;
 
-#define GET_TLS()  Context::ScriptTLSStruct * tls = \
-    (Context::ScriptTLSStruct *)pthread_getspecific(Context::gThreadTLSKey); \
+#define GET_TLS()  ScriptTLSStruct * tls = \
+    (ScriptTLSStruct *)pthread_getspecific(Context::gThreadTLSKey); \
     Context * rsc = tls->mContext; \
     ScriptC * sc = (ScriptC *) tls->mScript
 
diff --git a/libs/rs/rs_hal.h b/libs/rs/rs_hal.h
index 48e3f36..17983ce 100644
--- a/libs/rs/rs_hal.h
+++ b/libs/rs/rs_hal.h
@@ -31,18 +31,21 @@
 class ScriptC;
 
 
-typedef struct RsHalRec RsHal;
-
 typedef void *(*RsHalSymbolLookupFunc)(void *usrptr, char const *symbolName);
 
+typedef struct ScriptTLSStructRec {  // per-thread record stored under Context::gThreadTLSKey
+    Context * mContext;  // context read back by the GET_TLS() macros
+    Script * mScript;  // the thread's current script; swapped by the driver's setTLS()
+} ScriptTLSStruct;
 
 
 /**
  * Script management functions
  */
 typedef struct {
-    void (*shutdownDriver)(RsHal dc);
+    void (*shutdownDriver)(Context *);
     void (*getVersion)(unsigned int *major, unsigned int *minor);
+    void (*setPriority)(const Context *, int32_t priority);
 
 
 
@@ -55,12 +58,19 @@
                            uint32_t flags,
                            RsHalSymbolLookupFunc lookupFunc);
 
-        void (*invokeFunction)(const Context *rsc, const Script *s,
+        void (*invokeFunction)(const Context *rsc, Script *s,
                                uint32_t slot,
                                const void *params,
                                size_t paramLength);
-        int (*invokeRoot)(const Context *rsc, const Script *s);
-        void (*invokeInit)(const Context *rsc, const Script *s);
+        int (*invokeRoot)(const Context *rsc, Script *s);
+        void (*invokeForEach)(const Context *rsc,
+                              Script *s,
+                              const Allocation * ain,
+                              Allocation * aout,
+                              const void * usr,
+                              uint32_t usrLen,
+                              const RsScriptCall *sc);
+        void (*invokeInit)(const Context *rsc, Script *s);
 
         void (*setGlobalVar)(const Context *rsc, const Script *s,
                              uint32_t slot,