Separate CPU driver impl from reference driver.

Change-Id: Ifb484edda665959b81d7b1f890d108bfa20a535d
diff --git a/Android.mk b/Android.mk
index e169e4e..ffde29f 100644
--- a/Android.mk
+++ b/Android.mk
@@ -21,21 +21,12 @@
 	driver/rsdFrameBuffer.cpp \
 	driver/rsdFrameBufferObj.cpp \
 	driver/rsdGL.cpp \
-	driver/rsdIntrinsics.cpp \
-	driver/rsdIntrinsicBlend.cpp \
-	driver/rsdIntrinsicBlur.cpp \
-	driver/rsdIntrinsicConvolve3x3.cpp \
-	driver/rsdIntrinsicConvolve5x5.cpp \
-	driver/rsdIntrinsicLUT.cpp \
-	driver/rsdIntrinsicColorMatrix.cpp \
-	driver/rsdIntrinsicYuvToRGB.cpp \
 	driver/rsdMesh.cpp \
 	driver/rsdMeshObj.cpp \
 	driver/rsdPath.cpp \
 	driver/rsdProgram.cpp \
 	driver/rsdProgramRaster.cpp \
 	driver/rsdProgramStore.cpp \
-	driver/rsdRuntimeMath.cpp \
 	driver/rsdRuntimeStubs.cpp \
 	driver/rsdSampler.cpp \
 	driver/rsdScriptGroup.cpp \
@@ -43,13 +34,8 @@
 	driver/rsdShaderCache.cpp \
 	driver/rsdVertexArray.cpp
 
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
-    LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
-    LOCAL_SRC_FILES+= \
-        driver/rsdIntrinsics_Convolve.S
-endif
 
-LOCAL_SHARED_LIBRARIES += libRS
+LOCAL_SHARED_LIBRARIES += libRS libRSCpuRef
 LOCAL_SHARED_LIBRARIES += libcutils libutils libEGL libGLESv1_CM libGLESv2
 LOCAL_SHARED_LIBRARIES += libbcc libbcinfo libui libgui libsync
 
@@ -258,3 +244,6 @@
 LOCAL_LDLIBS := -lpthread
 
 include $(BUILD_HOST_STATIC_LIBRARY)
+
+include $(call all-makefiles-under,$(LOCAL_PATH))
+
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
new file mode 100644
index 0000000..062a916
--- /dev/null
+++ b/cpu_ref/Android.mk
@@ -0,0 +1,51 @@
+# Builds the libRSCpuRef shared library from the CPU driver sources in this directory.
+LOCAL_PATH:=$(call my-dir)
+# Warning flags shared by the module, plus optional defines taken from the build configuration.
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable
+ifeq ($(TARGET_BUILD_PDK), true)
+  rs_base_CFLAGS += -D__RS_PDK__
+endif
+# Forward OVERRIDE_RS_DRIVER to the compiler as a define when it is set.
+ifneq ($(OVERRIDE_RS_DRIVER),)
+  rs_base_CFLAGS += -DOVERRIDE_RS_DRIVER=$(OVERRIDE_RS_DRIVER)
+endif
+
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_MODULE := libRSCpuRef
+# C++ sources: core, script, runtime support, script groups and the intrinsics.
+LOCAL_SRC_FILES:= \
+	rsCpuCore.cpp \
+	rsCpuScript.cpp \
+	rsCpuRuntimeMath.cpp \
+	rsCpuRuntimeStubs.cpp \
+	rsCpuScriptGroup.cpp \
+	rsCpuIntrinsic.cpp \
+	rsCpuIntrinsicBlend.cpp \
+	rsCpuIntrinsicBlur.cpp \
+	rsCpuIntrinsicColorMatrix.cpp \
+	rsCpuIntrinsicConvolve3x3.cpp \
+	rsCpuIntrinsicConvolve5x5.cpp \
+	rsCpuIntrinsicLUT.cpp \
+	rsCpuIntrinsicYuvToRGB.cpp
+# The NEON assembly file and its define are only added when ARCH_ARM_HAVE_NEON is true.
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+    LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
+    LOCAL_SRC_FILES+= \
+        rsCpuIntrinsics_neon.S
+endif
+
+LOCAL_SHARED_LIBRARIES += libRS libcutils libutils libsync
+LOCAL_SHARED_LIBRARIES += libbcc libbcinfo
+
+LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
+LOCAL_C_INCLUDES += frameworks/rs
+
+LOCAL_CFLAGS += $(rs_base_CFLAGS)
+
+LOCAL_LDLIBS := -lpthread -ldl
+LOCAL_MODULE_TAGS := optional
+
+include $(BUILD_SHARED_LIBRARY)
+
+
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
new file mode 100644
index 0000000..29539da
--- /dev/null
+++ b/cpu_ref/rsCpuCore.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsCpuScriptGroup.h"
+
+#include <malloc.h>
+#include "rsContext.h"
+
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sched.h>
+#include <cutils/properties.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include "utils/StopWatch.h"
+
+using namespace android;
+using namespace android::renderscript;
+// Kernel entry point as invoked by the launch code: parameter block, x start/end, in/out strides.
+typedef void (*outer_foreach_t)(
+    const android::renderscript::RsForEachStubParamStruct *,
+    uint32_t x1, uint32_t x2,
+    uint32_t instep, uint32_t outstep);
+
+// TLS key shared by all instances in this file; key and use count are changed under gInitMutex.
+static pthread_key_t gThreadTLSKey = 0;
+static uint32_t gThreadTLSKeyCount = 0;
+static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
+// Out-of-line destructor; there is nothing to release at this level.
+RsdCpuReference::~RsdCpuReference() {
+}
+// Factory for the CPU reference implementation; yields NULL when initialization fails.
+RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
+                                          uint32_t version_minor, sym_lookup_t lfn,
+                                          script_lookup_t slfn) {
+    RsdCpuReferenceImpl *impl = new RsdCpuReferenceImpl(rsc);
+    if (impl == NULL) {
+        return NULL;
+    }
+    // Hand the object out only once init() has succeeded; otherwise destroy it here.
+    if (impl->init(version_major, version_minor, lfn, slfn)) {
+        return impl;
+    }
+    delete impl;
+    return NULL;
+}
+
+// Returns the Context recorded for the calling thread, or NULL when the thread has no TLS record.
+Context * RsdCpuReference::getTlsContext() {
+    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
+    return tls ? tls->mContext : NULL;
+}
+// Returns the Script recorded for the calling thread, or NULL when the thread has no TLS record.
+const Script * RsdCpuReference::getTlsScript() {
+    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
+    return tls ? tls->mScript : NULL;
+}
+
+
+////////////////////////////////////////////////////////////
+///
+// Puts every member into a known idle state; thread and TLS setup is left to init().
+RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
+    mRSC = rsc;
+    version_major = 0;
+    version_minor = 0;
+    mInForEach = false;
+    memset(&mWorkers, 0, sizeof(mWorkers));
+    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
+    mExit = false;
+    mSymLookupFn = NULL;     // Both callbacks are supplied later by init().
+    mScriptLookupFn = NULL;
+}
+
+// Entry point of each helper thread: registers itself, then serves launch requests until mExit.
+void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
+    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
+
+    // The value returned by the atomic increment is used as this thread's slot index.
+    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
+
+    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
+
+    dc->mWorkers.mLaunchSignals[idx].init();
+    dc->mWorkers.mNativeThreadId[idx] = gettid();
+    // NOTE(review): every helper zeroes and registers the same shared mTlsStruct; confirm intended.
+    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
+    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
+    if (status) {
+        ALOGE("pthread_setspecific %i", status);
+    }
+
+#if 0
+    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
+    cpu_set_t cpuset;
+    memset(&cpuset, 0, sizeof(cpuset));
+    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
+    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
+              sizeof(cpuset), &cpuset);
+    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
+#endif
+    // Each pass: wait for a launch signal, run the callback if set, then report completion.
+    while (!dc->mExit) {
+        dc->mWorkers.mLaunchSignals[idx].wait();
+        if (dc->mWorkers.mLaunchCallback) {
+           // idx +1 is used because the calling thread is always worker 0.
+           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
+        }
+        android_atomic_dec(&dc->mWorkers.mRunningCount);
+        dc->mWorkers.mCompleteSignal.set();
+    }
+
+    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
+    return NULL;
+}
+// Runs cbk(data, idx) on the calling thread (idx 0) and on every helper thread (idx 1..mCount).
+void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
+    mWorkers.mLaunchData = data;
+    mWorkers.mLaunchCallback = cbk;
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+
+    // We use the calling thread as one of the workers so we can start without
+    // the delay of the thread wakeup.
+    if (mWorkers.mLaunchCallback) {
+       mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+    }
+    // Block until every helper has decremented mRunningCount back to zero.
+    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+        mWorkers.mCompleteSignal.wait();
+    }
+}
+
+// Acquires the file-level gInitMutex; the return value of pthread_mutex_lock is not checked.
+void RsdCpuReferenceImpl::lockMutex() {
+    pthread_mutex_lock(&gInitMutex);
+}
+// Releases the file-level gInitMutex taken by lockMutex().
+void RsdCpuReferenceImpl::unlockMutex() {
+    pthread_mutex_unlock(&gInitMutex);
+}
+// Records the lookup callbacks, sets up the shared TLS key and starts the helper threads.
+bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
+                               sym_lookup_t lfn, script_lookup_t slfn) {
+    // Returns false if the TLS key or the thread attributes could not be created.
+    mSymLookupFn = lfn;
+    mScriptLookupFn = slfn;
+
+    lockMutex();
+    if (!gThreadTLSKeyCount) {
+        int status = pthread_key_create(&gThreadTLSKey, NULL);
+        if (status) {
+            ALOGE("Failed to init thread tls key.");
+            unlockMutex();
+            return false;
+        }
+    }
+    gThreadTLSKeyCount++;
+    unlockMutex();
+
+    mTlsStruct.mContext = mRSC;
+    mTlsStruct.mScript = NULL;
+    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
+    if (status) {
+        ALOGE("pthread_setspecific %i", status);
+    }
+
+    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if(mRSC->props.mDebugMaxThreads) {
+        cpu = mRSC->props.mDebugMaxThreads;
+    }
+    if (cpu < 2) {
+        mWorkers.mCount = 0;
+        return true;
+    }
+
+    // Subtract one from the cpu count because we also use the command thread as a worker.
+    mWorkers.mCount = (uint32_t)(cpu - 1);
+    ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount);
+
+    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
+    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
+    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
+    mWorkers.mLaunchCallback = NULL;
+    mWorkers.mCompleteSignal.init();
+
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    android_atomic_release_store(0, &mWorkers.mLaunchCount);
+
+    pthread_attr_t threadAttr;
+    status = pthread_attr_init(&threadAttr);
+    if (status) {
+        ALOGE("Failed to init thread attribute.");
+        mWorkers.mCount = 0;  // No helper was started, so the destructor must not signal or join any.
+        return false;
+    }
+
+    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
+        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
+        if (status) {
+            // Helpers ct..mCount-1 will never check in; drop them from the count polled below.
+            android_atomic_add(-(int32_t)(mWorkers.mCount - ct), &mWorkers.mRunningCount);
+            mWorkers.mCount = ct;
+            ALOGE("Created fewer than expected number of RS threads.");
+            break;
+        }
+    }
+    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+        usleep(100);
+    }
+    pthread_attr_destroy(&threadAttr);
+    return true;
+}
+// Applies the given priority to every helper thread through its recorded native thread id.
+void RsdCpuReferenceImpl::setPriority(int32_t priority) {
+    const uint32_t workerCount = mWorkers.mCount;
+    for (uint32_t w = 0; w < workerCount; w++) {
+        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[w], priority);
+    }
+}
+// Stops and joins the helper threads, then releases per-instance arrays and the shared TLS key.
+RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
+    mExit = true;
+    mWorkers.mLaunchData = NULL;
+    mWorkers.mLaunchCallback = NULL;
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        pthread_join(mWorkers.mThreadId[ct], NULL);
+    }
+    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);
+    // Release the arrays init() allocated; they are still NULL if no helpers were set up.
+    free(mWorkers.mThreadId);
+    free(mWorkers.mNativeThreadId);
+    delete[] mWorkers.mLaunchSignals;
+    // Global structure cleanup; guard against underflow when init() failed before raising the count.
+    lockMutex();
+    if (gThreadTLSKeyCount && --gThreadTLSKeyCount == 0) {
+        pthread_key_delete(gThreadTLSKey);
+    }
+    unlockMutex();
+}
+// Function-pointer signature; not referenced by the code visible in this file.
+typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+// Worker callback for launches with more than one row: claims bands of rows until none remain.
+static void wc_xy(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+
+    // Work on a private copy of the launch parameters so y/in/out can be
+    // rewritten per row without touching the struct shared by all workers.
+    RsForEachStubParamStruct p;
+    memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
+
+    for (;;) {
+        // Atomically take the next slice number; each slice covers mSliceSize rows.
+        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd = rsMin(yStart + mtls->mSliceSize, mtls->yEnd);
+        if (yEnd <= yStart) {
+            return;
+        }
+
+        // Point in/out at element xStart of each row and run the kernel across that row.
+        for (p.y = yStart; p.y < yEnd; p.y++) {
+            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
+                    (mtls->fep.eStrideOut * mtls->xStart);
+            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
+                   (mtls->fep.eStrideIn * mtls->xStart);
+            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+        }
+    }
+}
+// Worker callback for single-row launches: claims runs of elements along x until none remain.
+static void wc_x(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+
+    // Work on a private copy of the launch parameters so in/out can be
+    // rewritten per slice without touching the struct shared by all workers.
+    RsForEachStubParamStruct p;
+    memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
+
+    for (;;) {
+        // Atomically take the next slice number; each slice covers mSliceSize elements.
+        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        uint32_t xEnd = rsMin(xStart + mtls->mSliceSize, mtls->xEnd);
+        if (xEnd <= xStart) {
+            return;
+        }
+
+        // Offset both pointers to the first element of this slice.
+        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    }
+}
+// Runs the kernel described by mtls over its launch bounds, using the helper threads when allowed.
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
+
+    // Threaded path: requires at least one helper, a threadable kernel, and no
+    // launch already in progress on this instance (mInForEach).
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = s1;  // Used as-is when both strides are 0 (avoids dividing by zero).
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.yStrideOut) {
+                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            } else if (mtls->fep.yStrideIn) {
+                s2 = targetByteChunk / mtls->fep.yStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+            // Rows are handed out to the workers in bands of mSliceSize.
+            launchThreads(wc_xy, mtls);
+        } else {
+            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
+            uint32_t s2 = s1;  // Used as-is when both strides are 0 (avoids dividing by zero).
+
+            // This chooses our slice size to rate limit atomic ops to
+            // one per 16k bytes of reads/writes.
+            if (mtls->fep.eStrideOut) {
+                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            } else if (mtls->fep.eStrideIn) {
+                s2 = targetByteChunk / mtls->fep.eStrideIn;
+            }
+            mtls->mSliceSize = rsMin(s1, s2);
+
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+    } else {
+        // Serial path: the calling thread walks every (array, z, y) row itself,
+        // working on a private copy of the launch parameters.
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    // Rows are addressed by a flattened (array, z, y) index that is
+                    // scaled by the y stride of each allocation.
+                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
+                                      mtls->fep.dimY * p.z + p.y;
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
+                            (mtls->fep.eStrideOut * mtls->xStart);
+                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
+                           (mtls->fep.eStrideIn * mtls->xStart);
+                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+                }
+            }
+        }
+    }
+}
+// Installs sc as the calling thread's current script and returns the one it replaces.
+RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
+    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
+    rsAssert(tls);
+
+    // Remember what was installed so the caller can restore it afterwards.
+    RsdCpuScriptImpl *previous = tls->mImpl;
+
+    tls->mContext = mRSC;
+    tls->mImpl = sc;
+    // A NULL sc clears the script slot.
+    tls->mScript = sc ? sc->getScript() : NULL;
+
+    return previous;
+}
+// Resolves a symbol name through the lookup callback that was passed to init().
+const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
+    return mSymLookupFn(mRSC, name);
+}
+
+// Creates the RsdCpuScriptImpl for script s; returns NULL (after deleting it) if its init() fails.
+RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
+                                    char const *resName, char const *cacheDir,
+                                    uint8_t const *bitcode, size_t bitcodeSize,
+                                    uint32_t flags) {
+
+    RsdCpuScriptImpl *script = new RsdCpuScriptImpl(this, s);
+    if (script->init(resName, cacheDir, bitcode, bitcodeSize, flags)) {
+        return script;
+    }
+    delete script;
+    return NULL;
+}
+// Intrinsic factories defined outside this file (e.g. rsdIntrinsic_Blend in rsCpuIntrinsicBlend.cpp).
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s);
+// Maps an intrinsic ID to its factory and returns what it builds; unknown IDs assert and give NULL.
+RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
+                                    RsScriptIntrinsicID iid, Element *e) {
+
+    RsdCpuScriptImpl * (*factory)(RsdCpuReferenceImpl *, const Script *) = NULL;
+    switch (iid) {
+    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
+        factory = rsdIntrinsic_Convolve3x3;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
+        factory = rsdIntrinsic_ColorMatrix;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_LUT:
+        factory = rsdIntrinsic_LUT;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
+        factory = rsdIntrinsic_Convolve5x5;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_BLUR:
+        factory = rsdIntrinsic_Blur;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
+        factory = rsdIntrinsic_YuvToRGB;
+        break;
+    case RS_SCRIPT_INTRINSIC_ID_BLEND:
+        factory = rsdIntrinsic_Blend;
+        break;
+
+    default:
+        rsAssert(0);
+    }
+
+    return factory ? factory(this, s) : NULL;
+}
+// Creates the CpuScriptGroupImpl for sg; returns NULL (after deleting it) if its init() fails.
+RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
+    CpuScriptGroupImpl *group = new CpuScriptGroupImpl(this, sg);
+    if (group->init()) {
+        return group;
+    }
+    delete group;
+    return NULL;
+}
+
+
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
new file mode 100644
index 0000000..4883591
--- /dev/null
+++ b/cpu_ref/rsCpuCore.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_CORE_H
+#define RSD_CPU_CORE_H
+
+#include "rsd_cpu.h"
+#include "rsSignal.h"
+#include "rsContext.h"
+#include "rsElement.h"
+#include "rsScriptC.h"
+// Forward declarations of classes from the bcc namespace.
+namespace bcc {
+    class BCCContext;
+    class RSCompilerDriver;
+    class RSExecutable;
+}
+
+namespace android {
+namespace renderscript {
+
+// Untyped function-pointer aliases, plus the callback signature taken by launchThreads().
+typedef void (* InvokeFunc_t)(void);
+typedef void (* ForEachFunc_t)(void);
+typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
+// Forward declarations; the structs below only hold pointers to these classes.
+class RsdCpuScriptImpl;
+class RsdCpuReferenceImpl;
+// Record stored under the thread TLS key: the current context, script and script implementation.
+typedef struct ScriptTLSStructRec {
+    android::renderscript::Context * mContext;
+    const android::renderscript::Script * mScript;
+    RsdCpuScriptImpl *mImpl;
+} ScriptTLSStruct;
+// State for one forEach launch; a pointer to it is handed to every worker callback.
+typedef struct {
+    RsForEachStubParamStruct fep;
+    // Driver instance and script implementation associated with the launch.
+    RsdCpuReferenceImpl *rsc;
+    RsdCpuScriptImpl *script;
+    // Kernel entry point; the launch code casts it to outer_foreach_t before calling it.
+    ForEachFunc_t kernel;
+    uint32_t sig;
+    const Allocation * ain;
+    Allocation * aout;
+    // Slice bookkeeping: workers atomically increment mSliceNum and process mSliceSize units each.
+    uint32_t mSliceSize;
+    volatile int mSliceNum;
+    bool isThreadable;
+    // Launch bounds; the loops in rsCpuCore.cpp iterate while the index is below each End value.
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+} MTLaunchStruct;
+
+
+
+// Concrete RsdCpuReference: owns the helper threads and creates scripts, intrinsics and groups.
+class RsdCpuReferenceImpl : public RsdCpuReference {
+public:
+    virtual ~RsdCpuReferenceImpl();
+    RsdCpuReferenceImpl(Context *);
+    // Lock/unlock the mutex that rsCpuCore.cpp uses around its shared TLS key bookkeeping.
+    void lockMutex();
+    void unlockMutex();
+
+    bool init(uint32_t version_major, uint32_t version_minor, sym_lookup_t, script_lookup_t);
+    virtual void setPriority(int32_t priority);
+    virtual void launchThreads(WorkerCallback_t cbk, void *data);
+    static void * helperThreadProc(void *vrsc);
+    RsdCpuScriptImpl * setTLS(RsdCpuScriptImpl *sc);
+
+    Context * getContext() {return mRSC;}
+    // Kernel launch over the bounds stored in mtls (threaded or serial).
+    void launchThreads(const Allocation * ain, Allocation * aout,
+                       const RsScriptCall *sc, MTLaunchStruct *mtls);
+    // Factory entry points; the definitions in rsCpuCore.cpp return NULL on failure.
+    virtual CpuScript * createScript(const ScriptC *s,
+                                     char const *resName, char const *cacheDir,
+                                     uint8_t const *bitcode, size_t bitcodeSize,
+                                     uint32_t flags);
+    virtual CpuScript * createIntrinsic(const Script *s,
+                                        RsScriptIntrinsicID iid, Element *e);
+    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg);
+
+    const RsdCpuReference::CpuSymbol *symLookup(const char *);
+
+    RsdCpuReference::CpuScript * lookupScript(const Script *s) {
+        return mScriptLookupFn(mRSC, s);
+    }
+
+
+protected:
+    Context *mRSC;
+    uint32_t version_major;
+    uint32_t version_minor;
+    //bool mHasGraphics;
+    bool mInForEach;
+    // Helper-thread pool state; mCount is the number of helpers (the caller is an extra worker).
+    struct Workers {
+        volatile int mRunningCount;
+        volatile int mLaunchCount;
+        uint32_t mCount;
+        pthread_t *mThreadId;
+        pid_t *mNativeThreadId;
+        Signal mCompleteSignal;
+        Signal *mLaunchSignals;
+        WorkerCallback_t mLaunchCallback;
+        void *mLaunchData;
+    };
+    Workers mWorkers;
+    bool mExit;
+    sym_lookup_t mSymLookupFn;
+    script_lookup_t mScriptLookupFn;
+    // Record that rsCpuCore.cpp registers under the thread TLS key.
+    ScriptTLSStruct mTlsStruct;
+};
+
+
+}
+}
+
+#endif
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
new file mode 100644
index 0000000..a4eef21
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+
+using namespace android;
+using namespace android::renderscript;
+// Base constructor for intrinsics: records the intrinsic ID and starts with no kernel installed.
+RsdCpuScriptIntrinsic::RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s,
+                                             RsScriptIntrinsicID iid)
+        : RsdCpuScriptImpl(ctx, s) {
+    mRootPtr = NULL;  // Stays NULL until a subclass constructor assigns its kernel.
+    mID = iid;
+}
+// Nothing to release at this level.
+RsdCpuScriptIntrinsic::~RsdCpuScriptIntrinsic() {
+}
+// Not expected for intrinsics: only records RS_ERROR_FATAL_DRIVER on the context.
+void RsdCpuScriptIntrinsic::invokeFunction(uint32_t slot, const void *params, size_t paramLength) {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::invokeFunction");
+}
+// Not expected for intrinsics: records RS_ERROR_FATAL_DRIVER on the context and returns 0.
+int RsdCpuScriptIntrinsic::invokeRoot() {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::invokeRoot");
+    return 0;
+}
+// Not expected for intrinsics: only records RS_ERROR_FATAL_DRIVER on the context.
+void RsdCpuScriptIntrinsic::invokeInit() {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::invokeInit");
+}
+// Default for intrinsics without settable variables: records RS_ERROR_FATAL_DRIVER.
+void RsdCpuScriptIntrinsic::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::setGlobalVar");
+}
+// Default implementation: ignores its arguments and records RS_ERROR_FATAL_DRIVER.
+void RsdCpuScriptIntrinsic::setGlobalVarWithElemDims(uint32_t slot, const void *data,
+                                                     size_t dataLength, const Element *e,
+                                                     const size_t *dims, size_t dimLength) {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::setGlobalVarWithElemDims");
+}
+// Default implementation: ignores its arguments and records RS_ERROR_FATAL_DRIVER.
+void RsdCpuScriptIntrinsic::setGlobalBind(uint32_t slot, Allocation *data) {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::setGlobalBind");
+}
+// Default for intrinsics without object slots: records RS_ERROR_FATAL_DRIVER.
+void RsdCpuScriptIntrinsic::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER,
+                                 "Unexpected RsdCpuScriptIntrinsic::setGlobalObj");
+}
+// Intentionally empty at this level; RsdCpuScriptIntrinsicBlur declares its own override.
+void RsdCpuScriptIntrinsic::invokeFreeChildren() {
+}
+
+// Launches this intrinsic's kernel (mRootPtr) over the given allocations.
+void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
+                                          const Allocation * ain,
+                                          Allocation * aout,
+                                          const void * usr,
+                                          uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+    // Generic launch setup first, then point the launch at this intrinsic.
+    MTLaunchStruct mtls;
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
+    mtls.kernel = (void (*)())mRootPtr;
+    mtls.script = this;
+    mtls.fep.usr = this;
+    mtls.fep.slot = slot;
+
+    // Mark this script as current in TLS for the duration of the launch.
+    RsdCpuScriptImpl * const previous = mCtx->setTLS(this);
+    mCtx->launchThreads(ain, aout, sc, &mtls);
+    mCtx->setTLS(previous);
+}
+// Fills in the script, slot, kernel pointer and usr fields of mtls for this intrinsic.
+void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+    mtls->kernel = (void (*)())mRootPtr;
+    mtls->script = this;
+
+    mtls->fep.usr = this;
+    mtls->fep.slot = slot;
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
new file mode 100644
index 0000000..1756115
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_SCRIPT_INTRINSIC_H
+#define RSD_CPU_SCRIPT_INTRINSIC_H
+
+#include "rsCpuScript.h"
+
+
+namespace android {
+namespace renderscript {
+
+// Common base of the built-in intrinsics; subclasses implement populateScript() and set mRootPtr.
+class RsdCpuScriptIntrinsic : public RsdCpuScriptImpl {
+public:
+    virtual void populateScript(Script *) = 0;
+    // invokeFunction/invokeRoot/invokeInit only record an error in rsCpuIntrinsic.cpp.
+    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual int invokeRoot();
+    virtual void invokeForEach(uint32_t slot,
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+    virtual void invokeInit();
+    virtual void invokeFreeChildren();
+    // The setters below only record an error here; intrinsics that take values override them.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                  const Element *e, const size_t *dims, size_t dimLength);
+    virtual void setGlobalBind(uint32_t slot, Allocation *data);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsic();
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, RsScriptIntrinsicID iid);
+
+protected:
+    RsScriptIntrinsicID mID;
+    outer_foreach_t mRootPtr;
+    // mRootPtr is the kernel that invokeForEach()/forEachKernelSetup() store into the launch struct.
+};
+
+
+
+}
+}
+
+#endif
diff --git a/driver/rsdIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
similarity index 90%
rename from driver/rsdIntrinsicBlend.cpp
rename to cpu_ref/rsCpuIntrinsicBlend.cpp
index c35c379..57286d5 100644
--- a/driver/rsdIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -15,19 +15,32 @@
  */
 
 
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
 
 using namespace android;
 using namespace android::renderscript;
 
-struct ConvolveParams {
-    float f[4];
+namespace android {
+namespace renderscript {
+
+// Blend intrinsic; its static kernel() is installed as mRootPtr by the constructor.
+class RsdCpuScriptIntrinsicBlend : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+
+    virtual ~RsdCpuScriptIntrinsicBlend();
+    RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    static void kernel(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
 };
 
+}
+}
+
 
 enum {
     BLEND_CLEAR = 0,
@@ -92,10 +105,10 @@
 
 //#undef ARCH_ARM_HAVE_NEON
 
-static void ColorMatrix_uchar4(const RsForEachStubParamStruct *p,
-                               uint32_t xstart, uint32_t xend,
-                               uint32_t instep, uint32_t outstep) {
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
+void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
+                                        uint32_t xstart, uint32_t xend,
+                                        uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)p->usr;
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
     uchar4 *out = (uchar4 *)p->out;
@@ -442,15 +455,23 @@
     }
 }
 
-void * rsdIntrinsic_InitBlend(const android::renderscript::Context *dc,
-                              android::renderscript::Script *script,
-                              RsdIntriniscFuncs_t *funcs) {
 
-    script->mHal.info.exportedVariableCount = 0;
-    funcs->root = ColorMatrix_uchar4;
+RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLEND) {
 
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    return cp;
+    mRootPtr = &kernel;
 }
 
+RsdCpuScriptIntrinsicBlend::~RsdCpuScriptIntrinsicBlend() {
+}
+// The blend intrinsic reports no exported variables.
+void RsdCpuScriptIntrinsicBlend::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 0;
+}
+// Factory called by RsdCpuReferenceImpl::createIntrinsic for RS_SCRIPT_INTRINSIC_ID_BLEND.
+RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s) {
+    return new RsdCpuScriptIntrinsicBlend(ctx, s);
+}
+
+
 
diff --git a/driver/rsdIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
similarity index 64%
rename from driver/rsdIntrinsicBlur.cpp
rename to cpu_ref/rsCpuIntrinsicBlur.cpp
index b67e8d5..48363d1 100644
--- a/driver/rsdIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -14,25 +14,45 @@
  * limitations under the License.
  */
 
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
 
 using namespace android;
 using namespace android::renderscript;
 
-struct ConvolveParams {
+namespace android {
+namespace renderscript {
+
+// CPU implementation of the blur intrinsic; holds the radius, the derived weights and the input allocation.
+class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // slot 0: radius (float)
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // slot 1: allocation read by the kernel
+
+    virtual ~RsdCpuScriptIntrinsicBlur();
+    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
     float fp[104];
     short ip[104];
     float radius;
     int iradius;
     ObjectBaseRef<Allocation> alloc;
+
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+    void ComputeGaussianWeights();  // rebuilds iradius, fp[] and ip[] from radius
 };
 
-static void ComputeGaussianWeights(ConvolveParams *cp) {
+}
+}
+
+
+void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
     // Compute gaussian weights for the blur
     // e is the euler's number
     float e = 2.718281828459045f;
@@ -46,7 +66,7 @@
     // The larger the radius gets, the more our gaussian blur
     // will resemble a box blur since with large sigma
     // the gaussian curve begins to lose its shape
-    float sigma = 0.4f * cp->radius + 0.6f;
+    float sigma = 0.4f * radius + 0.6f;
 
     // Now compute the coefficients. We will store some redundant values to save
     // some math during the blur calculations precompute some values
@@ -56,35 +76,30 @@
     float normalizeFactor = 0.0f;
     float floatR = 0.0f;
     int r;
-    cp->iradius = (float)ceil(cp->radius) + 0.5f;
-    for (r = -cp->iradius; r <= cp->iradius; r ++) {
+    iradius = (float)ceil(radius) + 0.5f;
+    for (r = -iradius; r <= iradius; r ++) {
         floatR = (float)r;
-        cp->fp[r + cp->iradius] = coeff1 * powf(e, floatR * floatR * coeff2);
-        normalizeFactor += cp->fp[r + cp->iradius];
+        fp[r + iradius] = coeff1 * powf(e, floatR * floatR * coeff2);
+        normalizeFactor += fp[r + iradius];
     }
 
     //Now we need to normalize the weights because all our coefficients need to add up to one
     normalizeFactor = 1.0f / normalizeFactor;
-    for (r = -cp->iradius; r <= cp->iradius; r ++) {
-        cp->fp[r + cp->iradius] *= normalizeFactor;
-        cp->ip[r + cp->iradius] = (short)(cp->ip[r + cp->iradius] * 32768);
+    for (r = -iradius; r <= iradius; r ++) {
+        fp[r + iradius] *= normalizeFactor;
+        ip[r + iradius] = (short)(fp[r + iradius] * 32768);  // fixed-point copy of the normalized weight; previously read the uninitialized ip[] itself
+    }
 }
 
-static void Blur_Bind(const Context *dc, const Script *script,
-                             void * intrinsicData, uint32_t slot, Allocation *data) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {  // slot 1 binds the allocation the kernel reads from
     rsAssert(slot == 1);
-    cp->alloc.set(data);
+    alloc.set(static_cast<Allocation *>(data));  // unchecked downcast: data is assumed to be an Allocation
 }
 
-static void Blur_SetVar(const Context *dc, const Script *script, void * intrinsicData,
-                               uint32_t slot, void *data, size_t dataLength) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
+void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {  // slot 0 carries the blur radius
     rsAssert(slot == 0);
-
-    cp->radius = ((const float *)data)[0];
-    ComputeGaussianWeights(cp);
+    radius = ((const float *)data)[0];  // only the first float is read; dataLength is not checked
+    ComputeGaussianWeights();  // NOTE(review): fp/ip hold 104 entries, so an iradius above 51 would overflow them - confirm callers bound radius
 }
 
 
@@ -158,17 +173,17 @@
 }
 
 
-static void Blur_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsicBlur::kernel(const RsForEachStubParamStruct *p,
+                                       uint32_t xstart, uint32_t xend,
+                                       uint32_t instep, uint32_t outstep) {
     float buf[4 * 2048];
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Blur executed without input, skipping");
         return;
     }
-    DrvAllocation *din = (DrvAllocation *)cp->alloc->mHal.drv;
-    const uchar *pin = (const uchar *)din->lod[0].mallocPtr;
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
 
     uchar4 *out = (uchar4 *)p->out;
     uint32_t x1 = xstart;
@@ -177,11 +192,11 @@
     float4 *fout = (float4 *)buf;
     int y = p->y;
     if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
-        const uchar *pi = pin + (y - cp->iradius) * din->lod[0].stride;
-        OneVF(fout, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+        const uchar *pi = pin + (y - cp->iradius) * stride;
+        OneVF(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
     } else {
         while(x2 > x1) {
-            OneV(p, fout, x1, y, pin, din->lod[0].stride, cp->fp, cp->iradius);
+            OneV(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
             fout++;
             x1++;
         }
@@ -208,19 +223,29 @@
 
 }
 
-void * rsdIntrinsic_InitBlur(const android::renderscript::Context *dc,
-                                    android::renderscript::Script *script,
-                                    RsdIntriniscFuncs_t *funcs) {
+RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLUR) {
 
-    script->mHal.info.exportedVariableCount = 2;
-    funcs->setVarObj = Blur_Bind;
-    funcs->setVar = Blur_SetVar;
-    funcs->root = Blur_uchar4;
+    mRootPtr = &kernel;  // install the static blur kernel as the root function
+    radius = 5;  // default radius until setGlobalVar() supplies one
+    ComputeGaussianWeights();  // derive iradius and the weight tables for the default radius
+}
 
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    cp->radius = 5;
-    ComputeGaussianWeights(cp);
-    return cp;
+RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {  // destructor: no explicit cleanup is performed here
+}
+// Reports two exported variables; the setters above use slot 0 (radius) and slot 1 (input allocation).
+void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+// Clears the reference held in alloc.
+void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s) {
+    // Factory entry point: the new blur intrinsic is returned through its RsdCpuScriptImpl pointer.
+    return new RsdCpuScriptIntrinsicBlur(ctx, s);
 }
 
 
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
new file mode 100644
index 0000000..8f3196d
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+// CPU implementation of the color-matrix intrinsic: one 4x4 matrix applied to every pixel.
+class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // slot 0: the matrix
+
+    virtual ~RsdCpuScriptIntrinsicColorMatrix();
+    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    float fp[16];  // coefficients as floats, used by the scalar path
+    short ip[16];  // coefficients scaled by 255 and rounded, passed to the NEON routines
+    // Root kernels; setGlobalVar() chooses between them based on the matrix contents.
+    static void kernel4x4(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+    static void kernel3x3(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+    static void kernelDot(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+// Stores a new coefficient matrix (slot 0), derives its fixed-point copy and selects the root kernel.
+void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    rsAssert(slot == 0);
+    memcpy (fp, data, dataLength < sizeof(fp) ? dataLength : sizeof(fp));  // clamp: never write past fp[16]
+    for(int ct=0; ct < 16; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+    // General 4x4 kernel unless the fixed-point matrix matches one of the special cases below.
+    mRootPtr = &kernel4x4;
+    if ((ip[3] == 0) && (ip[7] == 0) && (ip[11] == 0) &&
+        (ip[12] == 0) && (ip[13] == 0) && (ip[14] == 0) && (ip[15] == 255)) {
+        mRootPtr = &kernel3x3;  // entries 3, 7, 11-14 are zero and entry 15 is 255 (1.0)
+        // Further special case: entries 0-2, 4-6 and 8-10 each hold three equal values.
+        if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
+            (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
+            (ip[8] == ip[9]) && (ip[8] == ip[10])) {
+            mRootPtr = &kernelDot;
+        }
+    }
+}
+
+extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
+// Multiplies one pixel by the 4x4 float matrix, saturates each channel to [0, 255] and stores it.
+static void One(const RsForEachStubParamStruct *p, uchar4 *out,
+                const uchar4 *py, const float* coeff) {
+    float4 src = convert_float4(py[0]);
+
+    float4 acc;
+    acc.x = src.x * coeff[0] +
+            src.y * coeff[4] +
+            src.z * coeff[8] +
+            src.w * coeff[12];
+    acc.y = src.x * coeff[1] +
+            src.y * coeff[5] +
+            src.z * coeff[9] +
+            src.w * coeff[13];
+    acc.z = src.x * coeff[2] +
+            src.y * coeff[6] +
+            src.z * coeff[10] +
+            src.w * coeff[14];
+    acc.w = src.x * coeff[3] +
+            src.y * coeff[7] +
+            src.z * coeff[11] +
+            src.w * coeff[15];
+
+    if (acc.x < 0) { acc.x = 0; } else if (acc.x > 255) { acc.x = 255; }
+    if (acc.y < 0) { acc.y = 0; } else if (acc.y > 255) { acc.y = 255; }
+    if (acc.z < 0) { acc.z = 0; } else if (acc.z > 255) { acc.z = 255; }
+    if (acc.w < 0) { acc.w = 0; } else if (acc.w > 255) { acc.w = 255; }
+
+    *out = convert_uchar4(acc);
+}
+// Root kernel for the general case: runs the full 4x4 matrix over pixels xstart..xend-1.
+void RsdCpuScriptIntrinsicColorMatrix::kernel4x4(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // instep/outstep are not used; both pointers advance one uchar4 per pixel.
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1) >> 2;  // number of 4-pixel groups handed to the assembly routine
+        if(len > 0) {
+            rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+            x1 += len << 2;
+            out += len << 2;
+            in += len << 2;
+        }
+#endif
+        // Scalar path: the whole span without NEON, otherwise the 0-3 pixels left over.
+        while(x1 != x2) {
+            One(p, out++, in++, cp->fp);
+            x1++;
+        }
+    }
+}
+// Root kernel selected by setGlobalVar() for the 3x3 special case; processes pixels xstart..xend-1.
+void RsdCpuScriptIntrinsicColorMatrix::kernel3x3(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // instep/outstep are not used; both pointers advance one uchar4 per pixel.
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1) >> 2;  // number of 4-pixel groups handed to the assembly routine
+        if(len > 0) {
+            rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
+            x1 += len << 2;
+            out += len << 2;
+            in += len << 2;
+        }
+#endif
+        // Scalar path still evaluates the full 4x4 matrix through One().
+        while(x1 != x2) {
+            One(p, out++, in++, cp->fp);
+            x1++;
+        }
+    }
+}
+// Root kernel selected by setGlobalVar() when each of the first three rows holds three equal values.
+void RsdCpuScriptIntrinsicColorMatrix::kernelDot(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // instep/outstep are not used; both pointers advance one uchar4 per pixel.
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1) >> 2;  // number of 4-pixel groups handed to the assembly routine
+        if(len > 0) {
+            rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
+            x1 += len << 2;
+            out += len << 2;
+            in += len << 2;
+        }
+#endif
+        // Scalar path still evaluates the full 4x4 matrix through One().
+        while(x1 != x2) {
+            One(p, out++, in++, cp->fp);
+            x1++;
+        }
+    }
+}
+
+// Constructs the intrinsic with the identity matrix as its initial coefficients.
+RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
+
+    const static float defaultMatrix[] = {
+        1.f, 0.f, 0.f, 0.f,
+        0.f, 1.f, 0.f, 0.f,
+        0.f, 0.f, 1.f, 0.f,
+        0.f, 0.f, 0.f, 1.f
+    };
+    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));  // also fills ip[] and sets mRootPtr
+}
+// Destructor: no explicit cleanup is performed here.
+RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
+}
+// Reports one exported variable; setGlobalVar() accepts it on slot 0.
+void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;
+}
+
+RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s) {
+    // Factory entry point: the new color-matrix intrinsic is returned as RsdCpuScriptImpl*.
+    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s);
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
new file mode 100644
index 0000000..18a5311
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+// CPU implementation of the 3x3 convolution intrinsic.
+class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // slot 0: coefficients
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // slot 1: allocation read by the kernel
+
+    virtual ~RsdCpuScriptIntrinsicConvolve3x3();
+    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    float fp[16];  // float coefficients; the code in this file fills and reads entries 0-8
+    short ip[16];  // coefficients scaled by 255 and rounded, passed to the NEON routine
+    ObjectBaseRef<Allocation> alloc;  // input allocation bound through setGlobalObj()
+
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+// Binds the allocation the kernel reads from (slot 1); data is cast to Allocation without a check.
+void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+    alloc.set(static_cast<Allocation *>(data));
+}
+// Replaces the convolution coefficients (slot 0) and refreshes the fixed-point copy of the first nine.
+void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    rsAssert(slot == 0);
+    memcpy (&fp, data, dataLength < sizeof(fp) ? dataLength : sizeof(fp));  // clamp: never write past fp[]
+    for(int ct=0; ct < 9; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
+                                          const void *y2, const short *coef, uint32_t count);
+
+// Computes one output pixel of the 3x3 convolution; neighbour columns are clamped to the row.
+static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                        const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
+                        const float* coeff) {
+    // x1/x2 are the left/right neighbour columns of x; py0..py2 are the three source rows.
+    uint32_t x1 = rsMax((int32_t)x-1, 0);
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));  // last valid column is dimX-1, not dimX
+
+    float4 px = convert_float4(py0[x1]) * coeff[0] +
+                convert_float4(py0[x]) * coeff[1] +
+                convert_float4(py0[x2]) * coeff[2] +
+                convert_float4(py1[x1]) * coeff[3] +
+                convert_float4(py1[x]) * coeff[4] +
+                convert_float4(py1[x2]) * coeff[5] +
+                convert_float4(py2[x1]) * coeff[6] +
+                convert_float4(py2[x]) * coeff[7] +
+                convert_float4(py2[x2]) * coeff[8];
+
+    px = clamp(px, 0.f, 255.f);
+    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
+    *out = o;
+}
+// Root kernel: convolves pixels xstart..xend-1 of row p->y, reading from the bound input allocation.
+void RsdCpuScriptIntrinsicConvolve3x3::kernel(const RsForEachStubParamStruct *p,
+                                              uint32_t xstart, uint32_t xend,
+                                              uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+
+    if (!cp->alloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+    // Neighbouring rows clamped to the image: y2 is the row above (py0), y1 the row below (py2).
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
+    const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y);
+    const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    if(x1 == 0) {  // column 0 goes through the scalar helper; the NEON call below reads from x1-1
+        ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 1;  // each unit advances two pixels; at least one pixel is left for the scalar loop
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif
+        // Scalar path: the whole span without NEON, otherwise the pixels the NEON call left over.
+        while(x1 != x2) {
+            ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
+            out++;
+            x1++;
+        }
+    }
+}
+// Constructs the intrinsic with default coefficients: each of the nine entries is 1/9.
+RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
+
+    mRootPtr = &kernel;
+    for(int ct=0; ct < 9; ct++) {
+        fp[ct] = 1.f / 9.f;
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);  // same scale-by-255-and-round conversion as setGlobalVar()
+    }
+}
+// Destructor: no explicit cleanup is performed here.
+RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
+}
+// Reports two exported variables; the setters use slot 0 (coefficients) and slot 1 (input allocation).
+void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+// Clears the reference held in alloc.
+void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s) {
+    // Factory entry point: the new 3x3 convolution intrinsic is returned as RsdCpuScriptImpl*.
+    return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s);
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
new file mode 100644
index 0000000..2cae2c0
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+// CPU implementation of the 5x5 convolution intrinsic.
+class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);  // slot 0: coefficients
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // slot 1: allocation read by the kernel
+
+    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
+    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    float fp[28];  // float coefficients; the scalar path reads entries 0-24
+    short ip[28];  // coefficients scaled by 255 and rounded, passed to the NEON routine
+    ObjectBaseRef<Allocation> alloc;  // input allocation bound through setGlobalObj()
+
+
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+
+
+};
+
+}
+}
+// Binds the allocation the kernel reads from (slot 1); data is cast to Allocation without a check.
+void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+    alloc.set(static_cast<Allocation *>(data));
+}
+// Replaces the convolution coefficients (slot 0) and refreshes the fixed-point copy of the first 25.
+void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
+                                                    const void *data, size_t dataLength) {
+    rsAssert(slot == 0);
+    memcpy (&fp, data, dataLength < sizeof(fp) ? dataLength : sizeof(fp));  // clamp: never write past fp[]
+    for(int ct=0; ct < 25; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+
+// Computes one output pixel of the 5x5 convolution from five source rows and 25 float coefficients.
+static void One(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
+                const float* coeff) {
+    // Columns x-2 .. x+2, clamped to [0, dimX-1].
+    uint32_t x0 = rsMax((int32_t)x-2, 0);
+    uint32_t x1 = rsMax((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
+    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
+
+    float4 px = convert_float4(py0[x0]) * coeff[0] +
+                convert_float4(py0[x1]) * coeff[1] +
+                convert_float4(py0[x2]) * coeff[2] +
+                convert_float4(py0[x3]) * coeff[3] +
+                convert_float4(py0[x4]) * coeff[4] +
+
+                convert_float4(py1[x0]) * coeff[5] +
+                convert_float4(py1[x1]) * coeff[6] +
+                convert_float4(py1[x2]) * coeff[7] +
+                convert_float4(py1[x3]) * coeff[8] +
+                convert_float4(py1[x4]) * coeff[9] +
+
+                convert_float4(py2[x0]) * coeff[10] +
+                convert_float4(py2[x1]) * coeff[11] +
+                convert_float4(py2[x2]) * coeff[12] +
+                convert_float4(py2[x3]) * coeff[13] +
+                convert_float4(py2[x4]) * coeff[14] +
+
+                convert_float4(py3[x0]) * coeff[15] +
+                convert_float4(py3[x1]) * coeff[16] +
+                convert_float4(py3[x2]) * coeff[17] +
+                convert_float4(py3[x3]) * coeff[18] +
+                convert_float4(py3[x4]) * coeff[19] +
+
+                convert_float4(py4[x0]) * coeff[20] +
+                convert_float4(py4[x1]) * coeff[21] +
+                convert_float4(py4[x2]) * coeff[22] +
+                convert_float4(py4[x3]) * coeff[23] +
+                convert_float4(py4[x4]) * coeff[24];
+
+    px = clamp(px, 0.f, 255.f);  // saturate before narrowing each channel to uchar
+    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
+    *out = o;
+}
+
+extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
+                                          const void *y2, const void *y3, const void *y4,
+                                          const short *coef, uint32_t count);
+// Root kernel: convolves pixels xstart..xend-1 of row p->y with the 5x5 coefficients.
+void RsdCpuScriptIntrinsicConvolve5x5::kernel(const RsForEachStubParamStruct *p,
+                                              uint32_t xstart, uint32_t xend,
+                                              uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("Convolve5x5 executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+    // Five source rows centred on p->y, clamped to [0, dimY-1].
+    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
+    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
+    uint32_t y2 = p->y;
+    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
+    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+
+    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
+    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
+    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
+    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
+    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Columns 0 and 1 always go through the scalar helper, which clamps the left neighbours.
+    while((x1 < x2) && (x1 < 2)) {
+        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
+        out++;
+        x1++;
+    }
+    // x1 >= 2 below. Rows are passed starting at column x1-2 so they line up with out (the row base when xstart is 0).
+#if defined(ARCH_ARM_HAVE_NEON)
+    if((x1 + 3) < x2) {
+        uint32_t len = (x2 - x1 - 3) >> 1;
+        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->ip, len);
+        out += len << 1;
+        x1 += len << 1;
+    }
+#endif
+    // Scalar path: the rest of the span without NEON, otherwise the pixels the NEON call left over.
+    while(x1 < x2) {
+        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
+        out++;
+        x1++;
+    }
+}
+
+// Constructs the intrinsic with default coefficients: each of the 25 entries is 1/25.
+RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
+
+    mRootPtr = &kernel;
+    for(int ct=0; ct < 25; ct++) {  // was 9, which left coefficients 9..24 uninitialized
+        fp[ct] = 1.f / 25.f;
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+// Destructor: no explicit cleanup is performed here.
+RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
+}
+// Reports two exported variables; the setters use slot 0 (coefficients) and slot 1 (input allocation).
+void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+// Clears the reference held in alloc.
+void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s) {
+    // Factory entry point: the new 5x5 convolution intrinsic is returned as RsdCpuScriptImpl*.
+    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s);
+}
+
+
+
diff --git a/driver/rsdIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h
similarity index 100%
rename from driver/rsdIntrinsicInlines.h
rename to cpu_ref/rsCpuIntrinsicInlines.h
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
new file mode 100644
index 0000000..188ed2b
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+// CPU implementation of the lookup-table intrinsic.
+class RsdCpuScriptIntrinsicLUT : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);  // slot 0: the table allocation
+
+    virtual ~RsdCpuScriptIntrinsicLUT();
+    RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    ObjectBaseRef<Allocation> lut;  // table allocation; the kernel reads it as 4 x 256 bytes
+
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+// Binds the lookup-table allocation (slot 0); data is cast to Allocation without a check.
+void RsdCpuScriptIntrinsicLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 0);
+    lut.set(static_cast<Allocation *>(data));
+}
+// Root kernel: maps every channel of pixels xstart..xend-1 through the bound lookup table.
+// Logs and returns without writing any output when no table has been bound.
+void RsdCpuScriptIntrinsicLUT::kernel(const RsForEachStubParamStruct *p,
+                                      uint32_t xstart, uint32_t xend,
+                                      uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
+    if (!cp->lut.get()) {  // same guard the Blur/Convolve kernels apply to their input allocation
+        ALOGE("LUT executed without lookup table, skipping");
+        return;
+    }
+
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+
+    // The table is read as four consecutive 256-byte sections: x, y, z, then w channel.
+    const uchar *tr = (const uchar *)cp->lut->mHal.drvState.lod[0].mallocPtr;
+    const uchar *tg = &tr[256];
+    const uchar *tb = &tg[256];
+    const uchar *ta = &tb[256];
+
+    for (uint32_t x = xstart; x < xend; x++, in++, out++) {
+        uchar4 px = *in;  // local renamed from p, which shadowed the kernel parameter
+        uchar4 o = {tr[px.x], tg[px.y], tb[px.z], ta[px.w]};
+        *out = o;
+    }
+}
+// Constructs the intrinsic and installs the static LUT kernel as the root function.
+RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_LUT) {
+
+    mRootPtr = &kernel;
+}
+// Destructor: no explicit cleanup is performed here.
+RsdCpuScriptIntrinsicLUT::~RsdCpuScriptIntrinsicLUT() {
+}
+// Reports one exported variable; setGlobalObj() accepts it on slot 0.
+void RsdCpuScriptIntrinsicLUT::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;
+}
+// Clears the reference held in lut.
+void RsdCpuScriptIntrinsicLUT::invokeFreeChildren() {
+    lut.clear();
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s) {
+    // Factory entry point: the new LUT intrinsic is returned as RsdCpuScriptImpl*.
+    return new RsdCpuScriptIntrinsicLUT(ctx, s);
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
new file mode 100644
index 0000000..7b8f768
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+class RsdCpuScriptIntrinsicYuvToRGB : public RsdCpuScriptIntrinsic {  // CPU implementation of the YUV-to-RGB intrinsic
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+    // Stores the YUV source allocation; only slot 0 is expected.
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicYuvToRGB();
+    RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    ObjectBaseRef<Allocation> alloc;  // YUV input read by kernel()
+    // Converts columns [xstart, xend) of one row; installed as mRootPtr by the constructor.
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+void RsdCpuScriptIntrinsicYuvToRGB::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 0);  // slot 0 is the only slot this intrinsic expects
+    alloc.set(static_cast<Allocation *>(data));  // becomes the YUV source read by kernel()
+}
+
+
+
+
+static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
+    // Fixed-point YUV -> RGBA for one pixel: remove the 16 / 128 offsets, apply
+    // the integer coefficients with +128 rounding, shift right by 8, clamp.
+    const short luma = ((short)y) - 16;
+    const short chromaU = ((short)u) - 128;
+    const short chromaV = ((short)v) - 128;
+
+    short4 p;
+    p.r = (luma * 298 + chromaV * 409 + 128) >> 8;
+    p.g = (luma * 298 - chromaU * 100 - chromaV * 208 + 128) >> 8;
+    p.b = (luma * 298 + chromaU * 516 + 128) >> 8;
+    p.a = 255;
+    // Keep r, g and b inside 0..255; alpha is fixed at 255.
+    if (p.r < 0) {
+        p.r = 0;
+    } else if (p.r > 255) {
+        p.r = 255;
+    }
+    if (p.g < 0) {
+        p.g = 0;
+    } else if (p.g > 255) {
+        p.g = 255;
+    }
+    if (p.b < 0) {
+        p.b = 0;
+    } else if (p.b > 255) {
+        p.b = 255;
+    }
+
+    return (uchar4){p.r, p.g, p.b, p.a};
+}
+
+
+static short YuvCoeff[] = {
+    298, 409, -100, 516,   -208, 255, 0, 0,
+    16, 16, 16, 16,        16, 16, 16, 16,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    298, 298, 298, 298, 298, 298, 298, 298,
+    255, 255, 255, 255, 255, 255, 255, 255
+    // Passed as `param` to rsdIntrinsicYuv_K; the first row repeats the
+    // coefficients the scalar rsYuvToRGBA_uchar4 path uses (298, 409, 100, 516, 208).
+};
+
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
+                                           uint32_t xstart, uint32_t xend,
+                                           uint32_t instep, uint32_t outstep) {
+    // Converts columns [xstart, xend) of row p->y from the bound YUV allocation
+    // into RGBA pixels written through p->out.
+    RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("YuvToRGB executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    // The addressing assumes a dimX * dimY luma plane followed by interleaved
+    // (V, U) byte rows of dimX bytes, each chroma row shared by two luma rows.
+    // NOTE(review): lod[0].stride is not consulted, so rows are assumed packed.
+    const uchar *Y = pin + (p->y * p->dimX);
+    const uchar *uv = pin + (p->dimX * p->dimY);
+    uv += (p->y>>1) * p->dimX;
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        int32_t len = (x2 - x1 - 1) >> 3;
+        if(len > 0) {
+            rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
+            x1 += len << 3;
+            out += len << 3;
+        }
+#endif
+
+        // Scalar path, one pixel per iteration so an odd start column or an odd
+        // pixel count never reads or writes one element past xend.
+        while(x1 < x2) {
+            const uint32_t pairBase = x1 & ~1u;
+            *out = rsYuvToRGBA_uchar4(Y[x1], uv[pairBase + 1], uv[pairBase + 0]);
+            out++;
+            x1++;
+        }
+    }
+}
+
+RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
+    // Install the static kernel() as this intrinsic's mRootPtr.
+    mRootPtr = &kernel;
+}
+
+RsdCpuScriptIntrinsicYuvToRGB::~RsdCpuScriptIntrinsicYuvToRGB() {  // no explicit cleanup in this body
+}
+
+void RsdCpuScriptIntrinsicYuvToRGB::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;  // presumably the input bound through setGlobalObj slot 0
+}
+
+void RsdCpuScriptIntrinsicYuvToRGB::invokeFreeChildren() {
+    alloc.clear();  // clears the ObjectBaseRef that holds the YUV input allocation
+}
+
+
+RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s) {
+    return new RsdCpuScriptIntrinsicYuvToRGB(ctx, s);  // factory: returns a newly heap-allocated intrinsic
+}
+
+
diff --git a/driver/rsdIntrinsics_Convolve.S b/cpu_ref/rsCpuIntrinsics_neon.S
similarity index 100%
rename from driver/rsdIntrinsics_Convolve.S
rename to cpu_ref/rsCpuIntrinsics_neon.S
diff --git a/driver/rsdRuntimeMath.cpp b/cpu_ref/rsCpuRuntimeMath.cpp
similarity index 97%
rename from driver/rsdRuntimeMath.cpp
rename to cpu_ref/rsCpuRuntimeMath.cpp
index ba37243..cf2c8a4 100644
--- a/driver/rsdRuntimeMath.cpp
+++ b/cpu_ref/rsCpuRuntimeMath.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,8 +22,8 @@
 #include "rsMatrix3x3.h"
 #include "rsMatrix2x2.h"
 
-#include "rsdCore.h"
-#include "rsdRuntime.h"
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
 
 
 using namespace android;
@@ -375,7 +375,7 @@
 //                 ::= f  # float
 //                 ::= d  # double
 
-static RsdSymbolTable gSyms[] = {
+static RsdCpuReference::CpuSymbol gSyms[] = {
     { "_Z4acosf", (void *)&acosf, true },
     { "_Z5acoshf", (void *)&acoshf, true },
     { "_Z4asinf", (void *)&asinf, true },
@@ -532,11 +532,11 @@
     { NULL, NULL, false }
 };
 
-const RsdSymbolTable * rsdLookupSymbolMath(const char *sym) {
-    const RsdSymbolTable *syms = gSyms;
+const RsdCpuReference::CpuSymbol * RsdCpuScriptImpl::lookupSymbolMath(const char *sym) {
+    const RsdCpuReference::CpuSymbol *syms = gSyms;
 
-    while (syms->mPtr) {
-        if (!strcmp(syms->mName, sym)) {
+    while (syms->fnPtr) {
+        if (!strcmp(syms->name, sym)) {
             return syms;
         }
         syms++;
diff --git a/cpu_ref/rsCpuRuntimeStubs.cpp b/cpu_ref/rsCpuRuntimeStubs.cpp
new file mode 100644
index 0000000..b87a639
--- /dev/null
+++ b/cpu_ref/rsCpuRuntimeStubs.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsContext.h"
+#include "rsScriptC.h"
+#include "rsMatrix4x4.h"
+#include "rsMatrix3x3.h"
+#include "rsMatrix2x2.h"
+#include "rsRuntime.h"
+
+#include "utils/Timers.h"
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+
+#include <time.h>
+
+using namespace android;
+using namespace android::renderscript;
+
+typedef float float2 __attribute__((ext_vector_type(2)));  // ext_vector_type aliases used as argument types by the SC_debug* stubs below
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+typedef unsigned char uchar3 __attribute__((ext_vector_type(3)));
+typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+typedef unsigned short ushort3 __attribute__((ext_vector_type(3)));
+typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+typedef int32_t int2 __attribute__((ext_vector_type(2)));
+typedef int32_t int3 __attribute__((ext_vector_type(3)));
+typedef int32_t int4 __attribute__((ext_vector_type(4)));
+typedef uint32_t uint2 __attribute__((ext_vector_type(2)));
+typedef uint32_t uint3 __attribute__((ext_vector_type(3)));
+typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
+typedef long long long2 __attribute__((ext_vector_type(2)));
+typedef long long long3 __attribute__((ext_vector_type(3)));
+typedef long long long4 __attribute__((ext_vector_type(4)));
+typedef unsigned long long ulong2 __attribute__((ext_vector_type(2)));
+typedef unsigned long long ulong3 __attribute__((ext_vector_type(3)));
+typedef unsigned long long ulong4 __attribute__((ext_vector_type(4)));
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Integer division helpers and rsDebug message routines
+//////////////////////////////////////////////////////////////////////////////
+
+
+int SC_divsi3(int a, int b) {  // signed 32-bit quotient a / b
+    return a / b;  // NOTE(review): no guard for b == 0 or INT_MIN / -1
+}
+
+int SC_modsi3(int a, int b) {  // signed 32-bit remainder a % b
+    return a % b;  // NOTE(review): no guard for b == 0 or INT_MIN % -1
+}
+
+unsigned int SC_udivsi3(unsigned int a, unsigned int b) {  // unsigned 32-bit quotient a / b
+    return a / b;  // NOTE(review): no guard for b == 0
+}
+
+unsigned int SC_umodsi3(unsigned int a, unsigned int b) {  // unsigned 32-bit remainder a % b
+    return a % b;  // NOTE(review): no guard for b == 0
+}
+
+static void SC_debugF(const char *s, float f) {  // float/double variants: log the label s followed by the value(s)
+    ALOGD("%s %f, 0x%08x", s, f, *((int *) (&f)));  // NOTE(review): pointer-cast type pun to print the raw 32 bits
+}
+static void SC_debugFv2(const char *s, float f1, float f2) {
+    ALOGD("%s {%f, %f}", s, f1, f2);
+}
+static void SC_debugFv3(const char *s, float f1, float f2, float f3) {
+    ALOGD("%s {%f, %f, %f}", s, f1, f2, f3);
+}
+static void SC_debugFv4(const char *s, float f1, float f2, float f3, float f4) {
+    ALOGD("%s {%f, %f, %f, %f}", s, f1, f2, f3, f4);
+}
+static void SC_debugF2(const char *s, float2 f) {
+    ALOGD("%s {%f, %f}", s, f.x, f.y);
+}
+static void SC_debugF3(const char *s, float3 f) {
+    ALOGD("%s {%f, %f, %f}", s, f.x, f.y, f.z);
+}
+static void SC_debugF4(const char *s, float4 f) {
+    ALOGD("%s {%f, %f, %f, %f}", s, f.x, f.y, f.z, f.w);
+}
+static void SC_debugD(const char *s, double d) {
+    ALOGD("%s %f, 0x%08llx", s, d, *((long long *) (&d)));  // NOTE(review): same pointer-cast type pun, 64 bits
+}
+static void SC_debugFM4v4(const char *s, const float *f) {  // 16 floats; log line i prints f[i], f[i+4], f[i+8], f[i+12]
+    ALOGD("%s {%f, %f, %f, %f", s, f[0], f[4], f[8], f[12]);
+    ALOGD("%s  %f, %f, %f, %f", s, f[1], f[5], f[9], f[13]);
+    ALOGD("%s  %f, %f, %f, %f", s, f[2], f[6], f[10], f[14]);
+    ALOGD("%s  %f, %f, %f, %f}", s, f[3], f[7], f[11], f[15]);
+}
+static void SC_debugFM3v3(const char *s, const float *f) {  // 9 floats; log line i prints f[i], f[i+3], f[i+6]
+    ALOGD("%s {%f, %f, %f", s, f[0], f[3], f[6]);
+    ALOGD("%s  %f, %f, %f", s, f[1], f[4], f[7]);
+    ALOGD("%s  %f, %f, %f}",s, f[2], f[5], f[8]);
+}
+static void SC_debugFM2v2(const char *s, const float *f) {  // 4 floats; log line i prints f[i], f[i+2]
+    ALOGD("%s {%f, %f", s, f[0], f[2]);
+    ALOGD("%s  %f, %f}",s, f[1], f[3]);
+}
+static void SC_debugI8(const char *s, char c) {  // 8-bit variants: log each value in decimal, then again in hex
+    ALOGD("%s %hhd  0x%hhx", s, c, (unsigned char)c);
+}
+static void SC_debugC2(const char *s, char2 c) {
+    ALOGD("%s {%hhd, %hhd}  0x%hhx 0x%hhx", s, c.x, c.y, (unsigned char)c.x, (unsigned char)c.y);
+}
+static void SC_debugC3(const char *s, char3 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z);
+}
+static void SC_debugC4(const char *s, char4 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z, (unsigned char)c.w);
+}
+static void SC_debugU8(const char *s, unsigned char c) {
+    ALOGD("%s %hhu  0x%hhx", s, c, c);
+}
+static void SC_debugUC2(const char *s, uchar2 c) {
+    ALOGD("%s {%hhu, %hhu}  0x%hhx 0x%hhx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUC3(const char *s, uchar3 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUC4(const char *s, uchar4 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugI16(const char *s, short c) {  // 16-bit variants: log each value in decimal, then again in hex
+    ALOGD("%s %hd  0x%hx", s, c, c);
+}
+static void SC_debugS2(const char *s, short2 c) {
+    ALOGD("%s {%hd, %hd}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugS3(const char *s, short3 c) {
+    ALOGD("%s {%hd, %hd, %hd}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugS4(const char *s, short4 c) {
+    ALOGD("%s {%hd, %hd, %hd, %hd}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugU16(const char *s, unsigned short c) {
+    ALOGD("%s %hu  0x%hx", s, c, c);
+}
+static void SC_debugUS2(const char *s, ushort2 c) {
+    ALOGD("%s {%hu, %hu}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUS3(const char *s, ushort3 c) {
+    ALOGD("%s {%hu, %hu, %hu}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUS4(const char *s, ushort4 c) {
+    ALOGD("%s {%hu, %hu, %hu, %hu}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugI32(const char *s, int32_t i) {  // 32-bit variants: log each value in decimal, then again in hex
+    ALOGD("%s %d  0x%x", s, i, i);
+}
+static void SC_debugI2(const char *s, int2 i) {
+    ALOGD("%s {%d, %d}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugI3(const char *s, int3 i) {
+    ALOGD("%s {%d, %d, %d}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugI4(const char *s, int4 i) {
+    ALOGD("%s {%d, %d, %d, %d}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
+}
+static void SC_debugU32(const char *s, uint32_t i) {
+    ALOGD("%s %u  0x%x", s, i, i);
+}
+static void SC_debugUI2(const char *s, uint2 i) {
+    ALOGD("%s {%u, %u}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugUI3(const char *s, uint3 i) {
+    ALOGD("%s {%u, %u, %u}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugUI4(const char *s, uint4 i) {
+    ALOGD("%s {%u, %u, %u, %u}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
+}
+static void SC_debugLL64(const char *s, long long ll) {  // 64-bit variants: log each value in decimal, then again in hex
+    ALOGD("%s %lld  0x%llx", s, ll, ll);
+}
+static void SC_debugL2(const char *s, long2 ll) {
+    ALOGD("%s {%lld, %lld}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugL3(const char *s, long3 ll) {
+    ALOGD("%s {%lld, %lld, %lld}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugL4(const char *s, long4 ll) {
+    ALOGD("%s {%lld, %lld, %lld, %lld}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
+static void SC_debugULL64(const char *s, unsigned long long ll) {
+    ALOGD("%s %llu  0x%llx", s, ll, ll);
+}
+static void SC_debugUL2(const char *s, ulong2 ll) {
+    ALOGD("%s {%llu, %llu}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugUL3(const char *s, ulong3 ll) {
+    ALOGD("%s {%llu, %llu, %llu}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugUL4(const char *s, ulong4 ll) {
+    ALOGD("%s {%llu, %llu, %llu, %llu}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
+static void SC_debugP(const char *s, const void *p) {  // logs the label s followed by the pointer value
+    ALOGD("%s %p", s, p);
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Stub implementation
+//////////////////////////////////////////////////////////////////////////////
+
+// llvm name mangling ref
+//  <builtin-type> ::= v  # void
+//                 ::= b  # bool
+//                 ::= c  # char
+//                 ::= a  # signed char
+//                 ::= h  # unsigned char
+//                 ::= s  # short
+//                 ::= t  # unsigned short
+//                 ::= i  # int
+//                 ::= j  # unsigned int
+//                 ::= l  # long
+//                 ::= m  # unsigned long
+//                 ::= x  # long long, __int64
+//                 ::= y  # unsigned long long, __int64
+//                 ::= f  # float
+//                 ::= d  # double
+
+static RsdCpuReference::CpuSymbol gSyms[] = {  // { symbol name, host function, third field (read as `threadable` by lookupRuntimeStub) }
+    { "memset", (void *)&memset, true },
+    { "memcpy", (void *)&memcpy, true },
+
+    // Debug
+    { "_Z7rsDebugPKcf", (void *)&SC_debugF, true },
+    { "_Z7rsDebugPKcff", (void *)&SC_debugFv2, true },
+    { "_Z7rsDebugPKcfff", (void *)&SC_debugFv3, true },
+    { "_Z7rsDebugPKcffff", (void *)&SC_debugFv4, true },
+    { "_Z7rsDebugPKcDv2_f", (void *)&SC_debugF2, true },
+    { "_Z7rsDebugPKcDv3_f", (void *)&SC_debugF3, true },
+    { "_Z7rsDebugPKcDv4_f", (void *)&SC_debugF4, true },
+    { "_Z7rsDebugPKcd", (void *)&SC_debugD, true },
+    { "_Z7rsDebugPKcPK12rs_matrix4x4", (void *)&SC_debugFM4v4, true },
+    { "_Z7rsDebugPKcPK12rs_matrix3x3", (void *)&SC_debugFM3v3, true },
+    { "_Z7rsDebugPKcPK12rs_matrix2x2", (void *)&SC_debugFM2v2, true },
+    { "_Z7rsDebugPKcc", (void *)&SC_debugI8, true },
+    { "_Z7rsDebugPKcDv2_c", (void *)&SC_debugC2, true },
+    { "_Z7rsDebugPKcDv3_c", (void *)&SC_debugC3, true },
+    { "_Z7rsDebugPKcDv4_c", (void *)&SC_debugC4, true },
+    { "_Z7rsDebugPKch", (void *)&SC_debugU8, true },
+    { "_Z7rsDebugPKcDv2_h", (void *)&SC_debugUC2, true },
+    { "_Z7rsDebugPKcDv3_h", (void *)&SC_debugUC3, true },
+    { "_Z7rsDebugPKcDv4_h", (void *)&SC_debugUC4, true },
+    { "_Z7rsDebugPKcs", (void *)&SC_debugI16, true },
+    { "_Z7rsDebugPKcDv2_s", (void *)&SC_debugS2, true },
+    { "_Z7rsDebugPKcDv3_s", (void *)&SC_debugS3, true },
+    { "_Z7rsDebugPKcDv4_s", (void *)&SC_debugS4, true },
+    { "_Z7rsDebugPKct", (void *)&SC_debugU16, true },
+    { "_Z7rsDebugPKcDv2_t", (void *)&SC_debugUS2, true },
+    { "_Z7rsDebugPKcDv3_t", (void *)&SC_debugUS3, true },
+    { "_Z7rsDebugPKcDv4_t", (void *)&SC_debugUS4, true },
+    { "_Z7rsDebugPKci", (void *)&SC_debugI32, true },
+    { "_Z7rsDebugPKcDv2_i", (void *)&SC_debugI2, true },
+    { "_Z7rsDebugPKcDv3_i", (void *)&SC_debugI3, true },
+    { "_Z7rsDebugPKcDv4_i", (void *)&SC_debugI4, true },
+    { "_Z7rsDebugPKcj", (void *)&SC_debugU32, true },
+    { "_Z7rsDebugPKcDv2_j", (void *)&SC_debugUI2, true },
+    { "_Z7rsDebugPKcDv3_j", (void *)&SC_debugUI3, true },
+    { "_Z7rsDebugPKcDv4_j", (void *)&SC_debugUI4, true },
+    // Both "long" and "unsigned long" need to be redirected to their
+    // 64-bit counterparts, since we have hacked Slang to use 64-bit
+    // for "long" on Arm (to be similar to Java).
+    { "_Z7rsDebugPKcl", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_l", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_l", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_l", (void *)&SC_debugL4, true },
+    { "_Z7rsDebugPKcm", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_m", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_m", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_m", (void *)&SC_debugUL4, true },
+    { "_Z7rsDebugPKcx", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_x", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_x", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_x", (void *)&SC_debugL4, true },
+    { "_Z7rsDebugPKcy", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_y", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_y", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_y", (void *)&SC_debugUL4, true },
+    { "_Z7rsDebugPKcPKv", (void *)&SC_debugP, true },
+    // Sentinel: lookupRuntimeStub stops scanning at the first entry whose fnPtr is NULL.
+    { NULL, NULL, false }
+};
+
+
+void * RsdCpuScriptImpl::lookupRuntimeStub(void* pContext, char const* name) {
+    // Resolves a runtime symbol by name: the context's symLookup() first, then
+    // the math table, then the local gSyms table. Returns NULL when not found.
+    RsdCpuScriptImpl *s = (RsdCpuScriptImpl *)pContext;
+    const RsdCpuReference::CpuSymbol *sym = s->mCtx->symLookup(name);
+    if (!sym) {
+        sym = s->lookupSymbolMath(name);
+    }
+    if (!sym) {
+        for (const RsdCpuReference::CpuSymbol *entry = gSyms; entry->fnPtr; entry++) {
+            if (!strcmp(entry->name, name)) {
+                sym = entry;
+                break;  // names are unique; stop at the first hit like lookupSymbolMath
+            }
+        }
+    }
+
+    if (sym) {
+        // One non-threadable symbol clears mIsThreadable for the whole script.
+        s->mIsThreadable &= sym->threadable;
+        return sym->fnPtr;
+    }
+    ALOGE("ScriptC sym lookup failed for %s", name);
+    return NULL;
+}
+
+
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
new file mode 100644
index 0000000..06ce4bb
--- /dev/null
+++ b/cpu_ref/rsCpuScript.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include "rsCpuCore.h"
+
+#include "rsCpuScript.h"
+//#include "rsdRuntime.h"
+//#include "rsdAllocation.h"
+//#include "rsCpuIntrinsics.h"
+
+
+#include "utils/Vector.h"
+#include "utils/Timers.h"
+#include "utils/StopWatch.h"
+
+
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
+
+namespace android {
+namespace renderscript {
+
+
+
+RsdCpuScriptImpl::RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s) {
+    mCtx = ctx;
+    mScript = s;
+    // Entry points stay NULL until init() resolves them from the executable.
+    mRoot = NULL;
+    mRootExpand = NULL;
+    mInit = NULL;
+    mFreeChildren = NULL;
+    // bcc compiler objects are created in init().
+    mCompilerContext = NULL;
+    mCompilerDriver = NULL;
+    mExecutable = NULL;
+    // mBoundAllocs is allocated in init() when the script exports variables.
+    mBoundAllocs = NULL;
+    mIntrinsicData = NULL;
+    mIsThreadable = true;
+}
+
+
+bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir,
+                            uint8_t const *bitcode, size_t bitcodeSize,
+                            uint32_t flags) {
+    // Builds the bitcode with bcc while holding the context mutex, then caches the
+    // script's entry points. Returns false (mutex released) if any step fails.
+
+    mCtx->lockMutex();
+
+    bcc::RSExecutable *exec;
+    const bcc::RSInfo *info;
+
+    mCompilerContext = NULL;
+    mCompilerDriver = NULL;
+    mExecutable = NULL;
+
+    mCompilerContext = new bcc::BCCContext();
+    if (mCompilerContext == NULL) {
+        ALOGE("bcc: FAILS to create compiler context (out of memory)");
+        mCtx->unlockMutex();
+        return false;
+    }
+
+    mCompilerDriver = new bcc::RSCompilerDriver();
+    if (mCompilerDriver == NULL) {
+        ALOGE("bcc: FAILS to create compiler driver (out of memory)");
+        mCtx->unlockMutex();
+        return false;
+    }
+
+    mCompilerDriver->setRSRuntimeLookupFunction(lookupRuntimeStub);  // runtime symbols are resolved via lookupRuntimeStub
+    mCompilerDriver->setRSRuntimeLookupContext(this);  // delivered to lookupRuntimeStub as pContext
+
+    exec = mCompilerDriver->build(*mCompilerContext, cacheDir, resName,
+                                  (const char *)bitcode, bitcodeSize, NULL);
+
+    if (exec == NULL) {
+        ALOGE("bcc: FAILS to prepare executable for '%s'", resName);
+        mCtx->unlockMutex();
+        return false;
+    }
+
+    mExecutable = exec;
+
+    exec->setThreadable(mIsThreadable);  // presumably already narrowed by lookupRuntimeStub during build() — confirm
+    if (!exec->syncInfo()) {
+        ALOGW("bcc: FAILS to synchronize the RS info file to the disk");
+    }
+
+    mRoot = reinterpret_cast<int (*)()>(exec->getSymbolAddress("root"));
+    mRootExpand =
+        reinterpret_cast<int (*)()>(exec->getSymbolAddress("root.expand"));
+    mInit = reinterpret_cast<void (*)()>(exec->getSymbolAddress("init"));
+    mFreeChildren =
+        reinterpret_cast<void (*)()>(exec->getSymbolAddress(".rs.dtor"));
+
+
+    info = &mExecutable->getInfo();
+    if (info->getExportVarNames().size()) {
+        mBoundAllocs = new Allocation *[info->getExportVarNames().size()];  // one binding slot per exported variable
+        memset(mBoundAllocs, 0, sizeof(void *) * info->getExportVarNames().size());
+    }
+
+    mCtx->unlockMutex();
+    return true;
+}
+
+void RsdCpuScriptImpl::populateScript(Script *script) {
+    // Publishes what the compiled executable exports into script->mHal.info:
+    // the function / variable / pragma counts, the pragma key and value lists,
+    // and the root entry point.
+    const bcc::RSInfo &rsInfo = mExecutable->getInfo();
+
+    script->mHal.info.exportedFunctionCount = rsInfo.getExportFuncNames().size();
+    script->mHal.info.exportedVariableCount = rsInfo.getExportVarNames().size();
+    script->mHal.info.exportedPragmaCount = rsInfo.getPragmas().size();
+
+    const char **pragmaKeys = const_cast<const char**>(mExecutable->getPragmaKeys().array());
+    const char **pragmaValues = const_cast<const char**>(mExecutable->getPragmaValues().array());
+    script->mHal.info.exportedPragmaKeyList = pragmaKeys;
+    script->mHal.info.exportedPragmaValueList = pragmaValues;
+
+    // The expanded root wins whenever init() resolved one.
+    script->mHal.info.root = mRootExpand ? mRootExpand : mRoot;
+}
+
+/*
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+    pthread_mutex_lock(&rsdgInitMutex);
+
+    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
+    if (drv == NULL) {
+        goto error;
+    }
+    s->mHal.drv = drv;
+    drv->mIntrinsicID = iid;
+    drv->mIntrinsicData = rsdIntrinsic_Init(rsc, s, iid, &drv->mIntrinsicFuncs);
+    s->mHal.info.isThreadable = true;
+
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return true;
+
+error:
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return false;
+}
+*/
+
+typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    // Zeroes *mtls, then fills in the launch bounds and the in/out pointers and strides.
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+
+    if (ain) {  // dimensions come from the input when present, otherwise from the output
+        mtls->fep.dimX = ain->getType()->getDimX();
+        mtls->fep.dimY = ain->getType()->getDimY();
+        mtls->fep.dimZ = ain->getType()->getDimZ();
+        //mtls->dimArray = ain->getType()->getDimArray();
+    } else if (aout) {
+        mtls->fep.dimX = aout->getType()->getDimX();
+        mtls->fep.dimY = aout->getType()->getDimY();
+        mtls->fep.dimZ = aout->getType()->getDimZ();
+        //mtls->dimArray = aout->getType()->getDimArray();
+    } else {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;  // NOTE(review): returns with *mtls still zeroed; the caller gets no failure signal
+    }
+
+    if (!sc || (sc->xEnd == 0)) {  // no explicit X range: use the full dimX
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;  // NOTE(review): early return leaves the remaining fields unset
+    }
+
+    if (!sc || (sc->yEnd == 0)) {  // no explicit Y range: use the full dimY
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;  // NOTE(review): early return leaves the remaining fields unset
+    }
+
+    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);  // every end bound is clamped to at least 1
+    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+
+    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
+
+    mtls->rsc = mCtx;
+    mtls->ain = ain;
+    mtls->aout = aout;
+    mtls->fep.usr = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum = 0;
+
+    mtls->fep.ptrIn = NULL;
+    mtls->fep.eStrideIn = 0;
+    mtls->isThreadable = mIsThreadable;
+
+    if (ain) {  // input base pointer, element size and row stride from lod[0]
+        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {  // output base pointer, element size and row stride from lod[0]
+        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
+
+void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
+                                     const Allocation * ain,
+                                     Allocation * aout,
+                                     const void * usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall *sc) {
+    // Prepares a launch descriptor for kernel `slot` and runs it with TLS pointing at this script.
+    MTLaunchStruct mtls;
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);  // NOTE(review): setup can return early; nothing is checked here
+    forEachKernelSetup(slot, &mtls);
+
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ain, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);  // restore the previous TLS script
+}
+
+void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+    // Records this script, the slot, and the slot's kernel address and signature in *mtls.
+    mtls->script = this;
+    mtls->fep.slot = slot;
+
+    rsAssert(slot < mExecutable->getExportForeachFuncAddrs().size());
+    mtls->kernel = reinterpret_cast<ForEachFunc_t>(
+                      mExecutable->getExportForeachFuncAddrs()[slot]);
+    rsAssert(mtls->kernel != NULL);
+    mtls->sig = mExecutable->getInfo().getExportForeachFuncs()[slot].second;
+}
+
+int RsdCpuScriptImpl::invokeRoot() {
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);  // run root() with TLS pointing at this script
+    int ret = mRoot ? mRoot() : 0;  // mRoot stays NULL unless init() resolved a "root" symbol; guard like invokeInit
+    mCtx->setTLS(oldTLS);
+    return ret;
+}
+
+void RsdCpuScriptImpl::invokeInit() {
+    if (mInit) {  // skipped when no "init" entry point was resolved
+        mInit();  // NOTE(review): unlike invokeRoot, TLS is not switched to this script here
+    }
+}
+
+void RsdCpuScriptImpl::invokeFreeChildren() {
+    if (mFreeChildren) {  // skipped when no ".rs.dtor" entry point was resolved
+        mFreeChildren();
+    }
+}
+
+void RsdCpuScriptImpl::invokeFunction(uint32_t slot, const void *params,
+                                      size_t paramLength) {
+    // Calls exported function `slot` as void(const void *, uint32_t) with TLS set to this script.
+
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    reinterpret_cast<void (*)(const void *, uint32_t)>(
+        mExecutable->getExportFuncAddrs()[slot])(params, paramLength);  // NOTE(review): slot is not range-checked here
+    mCtx->setTLS(oldTLS);
+}
+
+void RsdCpuScriptImpl::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    // Copies dataLength bytes from data into the storage of exported variable
+    // `slot`, as reported by the executable's export-variable address table.
+    //
+    // A NULL address is silently ignored.
+    //
+    // NOTE(review): neither slot nor dataLength is range-checked here; the
+    // caller is presumably responsible for both — confirm.
+
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+                          mExecutable->getExportVarAddrs()[slot]);
+    if (!destPtr) {
+        //ALOGV("Calling setVar on slot = %i which is null", slot);
+        return;
+    }
+
+    memcpy(destPtr, data, dataLength);
+}
+
+void RsdCpuScriptImpl::setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                                const Element *elem,
+                                                const size_t *dims, size_t dimLength) {
+    // Like setGlobalVar, but first adjusts element references: incRefs on the new values, decRefs on the old.
+    int32_t *destPtr = reinterpret_cast<int32_t *>(
+        mExecutable->getExportVarAddrs()[slot]);
+    if (!destPtr) {
+        //ALOGV("Calling setVar on slot = %i which is null", slot);
+        return;
+    }
+
+    // We want to look at dimension in terms of integer components,
+    // but dimLength is given in terms of bytes.
+    dimLength /= sizeof(int);  // NOTE(review): divides by sizeof(int) although dims is a size_t array — confirm
+
+    // Only a single dimension is currently supported.
+    rsAssert(dimLength == 1);
+    if (dimLength == 1) {
+        // First do the increment loop.
+        size_t stride = elem->getSizeBytes();
+        const char *cVal = reinterpret_cast<const char *>(data);
+        for (size_t i = 0; i < dims[0]; i++) {
+            elem->incRefs(cVal);
+            cVal += stride;
+        }
+
+        // Decrement loop comes after (to prevent race conditions).
+        char *oldVal = reinterpret_cast<char *>(destPtr);
+        for (size_t i = 0; i < dims[0]; i++) {
+            elem->decRefs(oldVal);
+            oldVal += stride;
+        }
+    }
+
+    memcpy(destPtr, data, dataLength);  // the copy happens even when dimLength != 1
+}
+
+// Binds allocation `data` to the exported pointer variable at index `slot`.
+//
+// The allocation is remembered in mBoundAllocs[slot], and its LOD 0 mallocPtr
+// (NULL when `data` is NULL) is written into the variable's storage.
+// Nothing is remembered or written when the export address for `slot` is
+// null.
+void RsdCpuScriptImpl::setGlobalBind(uint32_t slot, Allocation *data) {
+    void *varStorage = mExecutable->getExportVarAddrs()[slot];
+    if (varStorage == NULL) {
+        return;
+    }
+
+    mBoundAllocs[slot] = data;
+
+    // The script-side variable holds just the raw data pointer.
+    void *backing = NULL;
+    if (data != NULL) { backing = data->mHal.drvState.lod[0].mallocPtr; }
+    memcpy(varStorage, &backing, sizeof(backing));
+}
+
+// Stores object `data` in the exported object variable at index `slot`.
+//
+//   slot - index into the executable's exported-variable address table
+//   data - object pointer handed to rsrSetObject() unchanged
+//
+// The assignment itself is delegated to rsrSetObject(), which receives the
+// context obtained from mCtx and the address of the variable's storage.
+//
+// Does nothing when the export address for `slot` is null.
+void RsdCpuScriptImpl::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    // The variable's storage is treated as a single ObjectBase pointer.
+    ObjectBase **objSlot =
+        reinterpret_cast<ObjectBase **>(mExecutable->getExportVarAddrs()[slot]);
+
+    // A null address means there is no storage to update.
+    if (objSlot != NULL) {
+        rsrSetObject(mCtx->getContext(), objSlot, data);
+    }
+}
+
+// Tears the script down.
+//
+// Every exported global flagged as an object slot (and whose address is not
+// NULL) is passed to rsrClearObject(); afterwards the compiler context,
+// compiler driver, executable and bound-allocation table are freed, in that
+// order.
+RsdCpuScriptImpl::~RsdCpuScriptImpl() {
+    if (mExecutable != NULL) {
+        typedef Vector<void *>::const_iterator AddrIter;
+        typedef bcc::RSInfo::ObjectSlotListTy::const_iterator SlotFlagIter;
+
+        AddrIter addr = mExecutable->getExportVarAddrs().begin();
+        const AddrIter addrEnd = mExecutable->getExportVarAddrs().end();
+
+        SlotFlagIter isObject = mExecutable->getInfo().getObjectSlots().begin();
+        const SlotFlagIter isObjectEnd = mExecutable->getInfo().getObjectSlots().end();
+
+        // Walk the address list and the object-flag list in lockstep, stopping
+        // as soon as either one runs out.
+        for (; (addr != addrEnd) && (isObject != isObjectEnd); ++addr, ++isObject) {
+            if (!*isObject) {
+                continue;
+            }
+
+            // The address can be NULL if the script side optimized the
+            // corresponding global variable away; there is nothing to clear then.
+            if (*addr == NULL) {
+                continue;
+            }
+
+            rsrClearObject(mCtx->getContext(), reinterpret_cast<ObjectBase **>(*addr));
+        }
+    }
+
+    // delete and delete[] accept null pointers, so the members are released
+    // unconditionally, in this order: context, driver, executable,
+    // bound-allocation table.
+    delete mCompilerContext;
+    delete mCompilerDriver;
+    delete mExecutable;
+    delete[] mBoundAllocs;
+}
+
+// Returns the bound allocation whose LOD 0 mallocPtr equals ptr. Yields NULL for
+// a null ptr, and logs an error before returning NULL when nothing matches.
+Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
+    if (ptr == NULL) {
+        return NULL;
+    }
+    for (uint32_t slot = 0; slot < mScript->mHal.info.exportedVariableCount; ++slot) {
+        Allocation *bound = mBoundAllocs[slot];
+        if (bound != NULL && bound->mHal.drvState.lod[0].mallocPtr == ptr) {
+            return bound;
+        }
+    }
+    ALOGE("rsGetAllocation, failed to find %p", ptr);
+    return NULL;
+}
+
+
+}
+}
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
new file mode 100644
index 0000000..2197a20
--- /dev/null
+++ b/cpu_ref/rsCpuScript.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_SCRIPT_H  // guard named after this header (cpu_ref/rsCpuScript.h)
+#define RSD_CPU_SCRIPT_H
+
+#include <rs_hal.h>
+#include <rsRuntime.h>
+
+#include "rsCpuCore.h"
+
+namespace bcc {
+    class BCCContext;
+    class RSCompilerDriver;
+    class RSExecutable;
+}
+
+namespace android {
+namespace renderscript {
+
+// CPU-side script object: a concrete RsdCpuReferenceImpl::CpuScript that holds the
+// bcc compiler/executable pointers and the function pointers declared below.
+class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
+public:
+    typedef void (*outer_foreach_t)(
+        const RsForEachStubParamStruct *,
+        uint32_t x1, uint32_t x2,
+        uint32_t instep, uint32_t outstep);
+
+    bool init(char const *resName, char const *cacheDir,
+              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags);
+    virtual void populateScript(Script *);
+    // Invocation entry points; invokeInit()/invokeFreeChildren() skip null function pointers.
+    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual int invokeRoot();
+    virtual void invokeForEach(uint32_t slot,
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+    virtual void invokeInit();
+    virtual void invokeFreeChildren();
+    // Writers for exported script globals, addressed by export slot index.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                  const Element *e, const size_t *dims, size_t dimLength);
+    virtual void setGlobalBind(uint32_t slot, Allocation *data);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    // The destructor clears object slots and deletes the bcc members and mBoundAllocs.
+    virtual ~RsdCpuScriptImpl();
+    RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s);
+
+    const Script * getScript() {return mScript;}
+
+    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                          const void * usr, uint32_t usrLen,
+                          const RsScriptCall *sc, MTLaunchStruct *mtls);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+
+
+    const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
+    static void * lookupRuntimeStub(void* pContext, char const* name);
+    // Searches mBoundAllocs for the allocation whose LOD 0 mallocPtr equals ptr.
+    virtual Allocation * getAllocationForPointer(const void *ptr) const;
+
+
+protected:
+    RsdCpuReferenceImpl *mCtx;
+    const Script *mScript;
+    // Script function pointers; invokeInit()/invokeFreeChildren() treat NULL as "absent".
+    int (*mRoot)();
+    int (*mRootExpand)();
+    void (*mInit)();
+    void (*mFreeChildren)();
+    // Deleted by the destructor.
+    bcc::BCCContext *mCompilerContext;
+    bcc::RSCompilerDriver *mCompilerDriver;
+    bcc::RSExecutable *mExecutable;
+    // Allocations recorded by setGlobalBind(), indexed by export slot; freed with delete[].
+    Allocation **mBoundAllocs;
+    void * mIntrinsicData;
+    bool mIsThreadable;
+
+};
+
+// NOTE(review): declaration only - no definition is visible in this header; confirm it is used.
+Allocation * rsdScriptGetAllocationForPointer(
+                        const Context *dc,
+                        const Script *script,
+                        const void *);
+
+
+
+}
+}
+
+#endif
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
new file mode 100644
index 0000000..765057d
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsCpuScriptGroup.h"
+
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
+
+#include "rsScript.h"
+#include "rsScriptGroup.h"
+#include "rsCpuScriptGroup.h"
+//#include "rsdBcc.h"
+//#include "rsdAllocation.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+// Remembers the owning CPU context and the script group; mSl is not initialized here.
+CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg)
+    : mSG(sg), mCtx(ctx) {
+}
+
+CpuScriptGroupImpl::~CpuScriptGroupImpl() {
+    // Nothing is released here; mCtx and mSG are left untouched.
+}
+
+// Second-phase initialization hook. Performs no work at present and
+// unconditionally returns true.
+bool CpuScriptGroupImpl::init() { return true; }
+
+// No-op: both arguments are ignored.
+void CpuScriptGroupImpl::setInput(const ScriptKernelID *kid, Allocation *a) {}
+
+// No-op: both arguments are ignored.
+void CpuScriptGroupImpl::setOutput(const ScriptKernelID *kid, Allocation *a) {}
+
+// Signature through which scriptGroupRoot() calls each entry of ScriptList::fnPtrs.
+typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
+                                      uint32_t xstart, uint32_t xend,
+                                      uint32_t instep, uint32_t outstep);
+
+void CpuScriptGroupImpl::scriptGroupRoot(const RsForEachStubParamStruct *p,
+                                         uint32_t xstart, uint32_t xend,
+                                         uint32_t instep, uint32_t outstep) {
+    // p->usr carries a ScriptList; every kernel in it is called in turn with the same
+    // xstart/xend/instep/outstep. The launch struct is patched in place (const is cast away).
+    const ScriptList *sl = (const ScriptList *)p->usr;
+    RsForEachStubParamStruct *mp = (RsForEachStubParamStruct *)p;
+    const void *oldUsr = p->usr;
+
+    for(size_t ct=0; ct < sl->count; ct++) {
+        ScriptGroupRootFunc_t func;
+        func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
+        mp->usr = sl->usrPtrs[ct];
+        // Start from "no input, no output"; the blocks below fill in what this kernel has.
+        mp->ptrIn = NULL;
+        mp->in = NULL;
+        mp->ptrOut = NULL;
+        mp->out = NULL;
+        // Input row: inputs flagged in inExts use p->y, others use p->lid when dimY exceeds it.
+        if (sl->ins[ct]) {
+            mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+            mp->in = mp->ptrIn;
+            if (sl->inExts[ct]) {
+                mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
+            } else {
+                if (sl->ins[ct]->mHal.drvState.lod[0].dimY > p->lid) {
+                    mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->lid;
+                }
+            }
+        }
+        // Output row is chosen the same way, using outExts and the output allocation.
+        if (sl->outs[ct]) {
+            mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
+            mp->out = mp->ptrOut;
+            if (sl->outExts[ct]) {
+                mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
+            } else {
+                if (sl->outs[ct]->mHal.drvState.lod[0].dimY > p->lid) {
+                    mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->lid;
+                }
+            }
+        }
+
+        // Run this kernel with the patched launch struct.
+        func(p, xstart, xend, instep, outstep);
+    }
+    // Only usr is restored; ptrIn/in/ptrOut/out keep the values set for the last kernel.
+
+    // Put back the ScriptList pointer that was in p->usr on entry.
+
+    mp->usr = oldUsr;
+}
+
+// Runs the script group. Kernels are gathered from every node together with their input and
+// output allocations, then launched one by one (fieldDep) or fused through scriptGroupRoot().
+void CpuScriptGroupImpl::execute() {
+    Vector<Allocation *> ins;
+    Vector<bool> inExts;
+    Vector<Allocation *> outs;
+    Vector<bool> outExts;
+    Vector<const ScriptKernelID *> kernels;
+    bool fieldDep = false;
+
+    for (size_t ct=0; ct < mSG->mNodes.size(); ct++) {
+        ScriptGroup::Node *n = mSG->mNodes[ct];
+        Script *s = n->mKernels[0]->mScript;
+
+        // Node inputs that target a script field are stored into that field first.
+
+        for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
+            if (n->mInputs[ct2]->mDstField.get() && n->mInputs[ct2]->mDstField->mScript) {
+                // The slot is written on `s`, the script of this node's first kernel.
+                s->setVarObj(n->mInputs[ct2]->mDstField->mSlot, n->mInputs[ct2]->mAlloc.get());
+            }
+        }
+
+        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
+            const ScriptKernelID *k = n->mKernels[ct2];
+            Allocation *ain = NULL;
+            Allocation *aout = NULL;
+            bool inExt = false;
+            bool outExt = false;
+
+            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
+                if (n->mInputs[ct3]->mDstKernel.get() == k) {
+                    ain = n->mInputs[ct3]->mAlloc.get();
+                    // ain now comes from a node input whose destination is this kernel.
+                }
+            }
+            for (size_t ct3=0; ct3 < mSG->mInputs.size(); ct3++) {
+                if (mSG->mInputs[ct3]->mKernel == k) {
+                    ain = mSG->mInputs[ct3]->mAlloc.get();
+                    inExt = true;
+                    // ain now comes from the group's own input list (flagged in inExts).
+                }
+            }
+
+            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
+                if (n->mOutputs[ct3]->mSource.get() == k) {
+                    aout = n->mOutputs[ct3]->mAlloc.get();
+                    if(n->mOutputs[ct3]->mDstField.get() != NULL) {
+                        fieldDep = true;
+                    }
+                    // An output that feeds a script field selects the one-by-one launch path.
+                }
+            }
+            for (size_t ct3=0; ct3 < mSG->mOutputs.size(); ct3++) {
+                if (mSG->mOutputs[ct3]->mKernel == k) {
+                    aout = mSG->mOutputs[ct3]->mAlloc.get();
+                    outExt = true;
+                    // aout now comes from the group's own output list (flagged in outExts).
+                }
+            }
+            // Keep the kernel only when the allocations found match what it declares it has.
+            if ((k->mHasKernelOutput == (aout != NULL)) &&
+                (k->mHasKernelInput == (ain != NULL))) {
+                ins.add(ain);
+                inExts.add(inExt);
+                outs.add(aout);
+                outExts.add(outExt);
+                kernels.add(k);
+            }
+        }
+    }
+
+    // No kernel passed the check above: nothing to launch, and element 0 is used below.
+    if (kernels.size() == 0) { return; }
+    MTLaunchStruct mtls;
+    if(fieldDep) {
+        for (size_t ct=0; ct < ins.size(); ct++) {
+            Script *s = kernels[ct]->mScript;
+            RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+            uint32_t slot = kernels[ct]->mSlot;
+
+            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+            si->forEachKernelSetup(slot, &mtls);
+            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
+        }
+    } else {
+        ScriptList sl;
+        sl.ins = ins.array();
+        sl.outs = outs.array();
+        sl.kernels = kernels.array();
+        sl.count = kernels.size();
+        // Per-kernel function pointers and user data consumed by scriptGroupRoot().
+        Vector<const void *> usrPtrs;
+        Vector<const void *> fnPtrs;
+        Vector<uint32_t> sigs;
+        for (size_t ct=0; ct < kernels.size(); ct++) {
+            Script *s = kernels[ct]->mScript;
+            RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+
+            si->forEachKernelSetup(kernels[ct]->mSlot, &mtls);
+            fnPtrs.add((void *)mtls.kernel);
+            usrPtrs.add(mtls.fep.usr);
+            sigs.add(mtls.fep.usrLen);
+        }
+        sl.sigs = sigs.array();
+        sl.usrPtrs = usrPtrs.array();
+        sl.fnPtrs = fnPtrs.array();
+        sl.inExts = inExts.array();
+        sl.outExts = outExts.array();
+        // One launch over the first kernel's allocations, with scriptGroupRoot as the kernel.
+        Script *s = kernels[0]->mScript;
+        RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
+        mtls.script = NULL;
+        mtls.kernel = (void (*)())&scriptGroupRoot;
+        mtls.fep.usr = &sl;
+        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+    }
+}
+
+// Empty stub: performs no work and ignores both arguments.
+void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg) {}
+
+
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
new file mode 100644
index 0000000..f6fa2ac
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_SCRIPT_GROUP_H
+#define RSD_SCRIPT_GROUP_H
+
+#include <rsd_cpu.h>
+
+namespace android {
+namespace renderscript {
+
+// CPU implementation of RsdCpuReference::CpuScriptGroup for one ScriptGroup.
+class CpuScriptGroupImpl : public RsdCpuReference::CpuScriptGroup {
+public:
+    virtual void setInput(const ScriptKernelID *kid, Allocation *);
+    virtual void setOutput(const ScriptKernelID *kid, Allocation *);
+    virtual void execute();
+    virtual ~CpuScriptGroupImpl();
+
+    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
+    bool init();
+    // Kernel entry that runs every kernel of a ScriptList; p->usr must point at that list.
+    static void scriptGroupRoot(const RsForEachStubParamStruct *p,
+                                uint32_t xstart, uint32_t xend,
+                                uint32_t instep, uint32_t outstep);
+
+protected:
+    struct ScriptList {
+        size_t count;
+        Allocation *const* ins;
+        bool const* inExts;
+        Allocation *const* outs;
+        bool const* outExts;
+        const void *const* usrPtrs;
+        size_t const *usrSizes;
+        uint32_t const *sigs;
+        const void *const* fnPtrs;
+        // execute() fills every array except usrSizes with one entry per kernel.
+        const ScriptKernelID *const* kernels;
+    };
+    ScriptList mSl;
+    const ScriptGroup *mSG;
+    RsdCpuReferenceImpl *mCtx;
+};
+
+}
+}
+
+#endif // RSD_SCRIPT_GROUP_H
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
new file mode 100644
index 0000000..d96d2d1
--- /dev/null
+++ b/cpu_ref/rsd_cpu.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_H
+#define RSD_CPU_H
+
+#include "rsAllocation.h"
+
+
+namespace android {
+namespace renderscript {
+
+class ScriptC;
+class Script;
+class ScriptGroup;
+class ScriptKernelID;
+
+// Abstract interface to the CPU reference implementation; instances come from create().
+class RsdCpuReference {
+public:
+    struct CpuSymbol {
+        const char * name;
+        void * fnPtr;
+        bool threadable;
+    };
+    // Callback type: takes a context and a symbol name, returns a CpuSymbol pointer.
+    typedef const CpuSymbol * (* sym_lookup_t)(Context *, const char *name);
+
+    struct CpuTls {
+        Context *rsc;
+        const ScriptC * sc;
+    };
+    // Interface of one script: invocation entry points plus writers for exported globals.
+    class CpuScript {
+    public:
+        virtual void populateScript(Script *) = 0;
+        virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
+        virtual int invokeRoot() = 0;
+        virtual void invokeForEach(uint32_t slot,
+                           const Allocation * ain,
+                           Allocation * aout,
+                           const void * usr,
+                           uint32_t usrLen,
+                           const RsScriptCall *sc) = 0;
+        virtual void invokeInit() = 0;
+        virtual void invokeFreeChildren() = 0;
+
+        virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) = 0;
+        virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                      const Element *e, const size_t *dims, size_t dimLength) = 0;
+        virtual void setGlobalBind(uint32_t slot, Allocation *data) = 0;
+        virtual void setGlobalObj(uint32_t slot, ObjectBase *obj) = 0;
+
+        virtual Allocation * getAllocationForPointer(const void *ptr) const = 0;
+        virtual ~CpuScript() {}
+    };
+    typedef CpuScript * (* script_lookup_t)(Context *, const Script *s);
+    // Interface of a script group: inputs/outputs are set per kernel ID, then execute() runs.
+    class CpuScriptGroup {
+    public:
+        virtual void setInput(const ScriptKernelID *kid, Allocation *) = 0;
+        virtual void setOutput(const ScriptKernelID *kid, Allocation *) = 0;
+        virtual void execute() = 0;
+        virtual ~CpuScriptGroup() {};
+    };
+
+    static Context * getTlsContext();
+    static const Script * getTlsScript();
+
+    static RsdCpuReference * create(Context *c, uint32_t version_major,
+                                    uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn);
+    virtual ~RsdCpuReference();
+    virtual void setPriority(int32_t priority) = 0;
+
+    virtual CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
+                                     uint8_t const *bitcode, size_t bitcodeSize,
+                                     uint32_t flags) = 0;
+    virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
+    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg) = 0;
+};
+
+
+}
+}
+
+#endif
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 8956b2e..928f777 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -16,7 +16,6 @@
 
 
 #include "rsdCore.h"
-#include "rsdRuntime.h"
 #include "rsdAllocation.h"
 #include "rsdFrameBufferObj.h"
 
@@ -80,10 +79,9 @@
 uint8_t *GetOffsetPtr(const android::renderscript::Allocation *alloc,
                       uint32_t xoff, uint32_t yoff, uint32_t lod,
                       RsAllocationCubemapFace face) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    uint8_t *ptr = (uint8_t *)drv->lod[lod].mallocPtr;
-    ptr += face * drv->faceOffset;
-    ptr += yoff * drv->lod[lod].stride;
+    uint8_t *ptr = (uint8_t *)alloc->mHal.drvState.lod[lod].mallocPtr;
+    ptr += face * alloc->mHal.drvState.faceOffset;
+    ptr += yoff * alloc->mHal.drvState.lod[lod].stride;
     ptr += xoff * alloc->mHal.state.elementSizeBytes;
     return ptr;
 }
@@ -160,7 +158,7 @@
         return;
     }
 
-    if (!drv->lod[0].mallocPtr) {
+    if (!alloc->mHal.drvState.lod[0].mallocPtr) {
         return;
     }
 
@@ -174,10 +172,9 @@
     Upload2DTexture(rsc, alloc, isFirstUpload);
 
     if (!(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_SCRIPT)) {
-        if (alloc->mHal.drvState.mallocPtrLOD0) {
-            free(alloc->mHal.drvState.mallocPtrLOD0);
-            alloc->mHal.drvState.mallocPtrLOD0 = NULL;
-            drv->lod[0].mallocPtr = NULL;
+        if (alloc->mHal.drvState.lod[0].mallocPtr) {
+            free(alloc->mHal.drvState.lod[0].mallocPtr);
+            alloc->mHal.drvState.lod[0].mallocPtr = NULL;
         }
     }
     rsdGLCheckError(rsc, "UploadToTexture");
@@ -224,54 +221,50 @@
     }
     RSD_CALL_GL(glBindBuffer, drv->glTarget, drv->bufferID);
     RSD_CALL_GL(glBufferData, drv->glTarget, alloc->mHal.state.type->getSizeBytes(),
-                 alloc->mHal.drvState.mallocPtrLOD0, GL_DYNAMIC_DRAW);
+                 alloc->mHal.drvState.lod[0].mallocPtr, GL_DYNAMIC_DRAW);
     RSD_CALL_GL(glBindBuffer, drv->glTarget, 0);
     rsdGLCheckError(rsc, "UploadToBufferObject");
 }
 
 static size_t AllocationBuildPointerTable(const Context *rsc, const Allocation *alloc,
         const Type *type, uint8_t *ptr) {
-
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-
-    drv->lod[0].dimX = type->getDimX();
-    drv->lod[0].dimY = type->getDimY();
-    drv->lod[0].mallocPtr = 0;
-    drv->lod[0].stride = drv->lod[0].dimX * type->getElementSizeBytes();
-    drv->lodCount = type->getLODCount();
-    drv->faceCount = type->getDimFaces();
+    alloc->mHal.drvState.lod[0].dimX = type->getDimX();
+    alloc->mHal.drvState.lod[0].dimY = type->getDimY();
+    alloc->mHal.drvState.lod[0].mallocPtr = 0;
+    alloc->mHal.drvState.lod[0].stride = alloc->mHal.drvState.lod[0].dimX * type->getElementSizeBytes();
+    alloc->mHal.drvState.lodCount = type->getLODCount();
+    alloc->mHal.drvState.faceCount = type->getDimFaces();
 
     size_t offsets[Allocation::MAX_LOD];
     memset(offsets, 0, sizeof(offsets));
 
-    size_t o = drv->lod[0].stride * rsMax(drv->lod[0].dimY, 1u) * rsMax(drv->lod[0].dimZ, 1u);
-    if(drv->lodCount > 1) {
-        uint32_t tx = drv->lod[0].dimX;
-        uint32_t ty = drv->lod[0].dimY;
-        uint32_t tz = drv->lod[0].dimZ;
-        for (uint32_t lod=1; lod < drv->lodCount; lod++) {
-            drv->lod[lod].dimX = tx;
-            drv->lod[lod].dimY = ty;
-            drv->lod[lod].dimZ = tz;
-            drv->lod[lod].stride = tx * type->getElementSizeBytes();
+    size_t o = alloc->mHal.drvState.lod[0].stride * rsMax(alloc->mHal.drvState.lod[0].dimY, 1u) *
+            rsMax(alloc->mHal.drvState.lod[0].dimZ, 1u);
+    if(alloc->mHal.drvState.lodCount > 1) {
+        uint32_t tx = alloc->mHal.drvState.lod[0].dimX;
+        uint32_t ty = alloc->mHal.drvState.lod[0].dimY;
+        uint32_t tz = alloc->mHal.drvState.lod[0].dimZ;
+        for (uint32_t lod=1; lod < alloc->mHal.drvState.lodCount; lod++) {
+            alloc->mHal.drvState.lod[lod].dimX = tx;
+            alloc->mHal.drvState.lod[lod].dimY = ty;
+            alloc->mHal.drvState.lod[lod].dimZ = tz;
+            alloc->mHal.drvState.lod[lod].stride = tx * type->getElementSizeBytes();
             offsets[lod] = o;
-            o += drv->lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
+            o += alloc->mHal.drvState.lod[lod].stride * rsMax(ty, 1u) * rsMax(tz, 1u);
             if (tx > 1) tx >>= 1;
             if (ty > 1) ty >>= 1;
             if (tz > 1) tz >>= 1;
         }
     }
-    drv->faceOffset = o;
+    alloc->mHal.drvState.faceOffset = o;
 
-    drv->lod[0].mallocPtr = ptr;
-    for (uint32_t lod=1; lod < drv->lodCount; lod++) {
-        drv->lod[lod].mallocPtr = ptr + offsets[lod];
+    alloc->mHal.drvState.lod[0].mallocPtr = ptr;
+    for (uint32_t lod=1; lod < alloc->mHal.drvState.lodCount; lod++) {
+        alloc->mHal.drvState.lod[lod].mallocPtr = ptr + offsets[lod];
     }
-    alloc->mHal.drvState.strideLOD0 = drv->lod[0].stride;
-    alloc->mHal.drvState.mallocPtrLOD0 = ptr;
 
-    size_t allocSize = drv->faceOffset;
-    if(drv->faceCount) {
+    size_t allocSize = alloc->mHal.drvState.faceOffset;
+    if(alloc->mHal.drvState.faceCount) {
         allocSize *= 6;
     }
 
@@ -352,9 +345,9 @@
         drv->renderTargetID = 0;
     }
 
-    if (alloc->mHal.drvState.mallocPtrLOD0) {
-        free(alloc->mHal.drvState.mallocPtrLOD0);
-        alloc->mHal.drvState.mallocPtrLOD0 = NULL;
+    if (alloc->mHal.drvState.lod[0].mallocPtr) {
+        free(alloc->mHal.drvState.lod[0].mallocPtr);
+        alloc->mHal.drvState.lod[0].mallocPtr = NULL;
     }
     if (drv->readBackFBO != NULL) {
         delete drv->readBackFBO;
@@ -366,9 +359,7 @@
 
 void rsdAllocationResize(const Context *rsc, const Allocation *alloc,
                          const Type *newType, bool zeroNew) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-
-    void * oldPtr = drv->lod[0].mallocPtr;
+    void * oldPtr = alloc->mHal.drvState.lod[0].mallocPtr;
     // Calculate the object size
     size_t s = AllocationBuildPointerTable(rsc, alloc, newType, NULL);
     uint8_t *ptr = (uint8_t *)realloc(oldPtr, s);
@@ -383,7 +374,7 @@
 
     if (dimX > oldDimX) {
         uint32_t stride = alloc->mHal.state.elementSizeBytes;
-        memset(((uint8_t *)alloc->mHal.drvState.mallocPtrLOD0) + stride * oldDimX,
+        memset(((uint8_t *)alloc->mHal.drvState.lod[0].mallocPtr) + stride * oldDimX,
                  0, stride * (dimX - oldDimX));
     }
 }
@@ -411,8 +402,9 @@
     drv->readBackFBO->setActive(rsc);
 
     // Do the readback
-    RSD_CALL_GL(glReadPixels, 0, 0, drv->lod[0].dimX, drv->lod[0].dimY,
-                drv->glFormat, drv->glType, drv->lod[0].mallocPtr);
+    RSD_CALL_GL(glReadPixels, 0, 0, alloc->mHal.drvState.lod[0].dimX,
+                alloc->mHal.drvState.lod[0].dimY,
+                drv->glFormat, drv->glType, alloc->mHal.drvState.lod[0].mallocPtr);
 
     // Revert framebuffer to its original
     lastFbo->setActive(rsc);
@@ -482,9 +474,8 @@
     mapper.lock(drv->wndBuffer->handle,
             GRALLOC_USAGE_SW_READ_NEVER | GRALLOC_USAGE_SW_WRITE_OFTEN,
             bounds, &dst);
-    drv->lod[0].mallocPtr = dst;
-    alloc->mHal.drvState.mallocPtrLOD0 = dst;
-    drv->lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
+    alloc->mHal.drvState.lod[0].mallocPtr = dst;
+    alloc->mHal.drvState.lod[0].stride = drv->wndBuffer->stride * alloc->mHal.state.elementSizeBytes;
 
     return true;
 }
@@ -597,7 +588,7 @@
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
     uint32_t lineSize = eSize * w;
 
-    if (drv->lod[0].mallocPtr) {
+    if (alloc->mHal.drvState.lod[0].mallocPtr) {
         const uint8_t *src = static_cast<const uint8_t *>(data);
         uint8_t *dst = GetOffsetPtr(alloc, xoff, yoff, lod, face);
 
@@ -608,7 +599,7 @@
             }
             memcpy(dst, src, lineSize);
             src += lineSize;
-            dst += drv->lod[lod].stride;
+            dst += alloc->mHal.drvState.lod[lod].stride;
         }
         drv->uploadDeferred = true;
     } else {
@@ -626,8 +617,6 @@
 void rsdAllocationRead1D(const Context *rsc, const Allocation *alloc,
                          uint32_t xoff, uint32_t lod, uint32_t count,
                          void *data, size_t sizeBytes) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-
     const uint32_t eSize = alloc->mHal.state.type->getElementSizeBytes();
     const uint8_t * ptr = GetOffsetPtr(alloc, xoff, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
     memcpy(data, ptr, count * eSize);
@@ -636,19 +625,17 @@
 void rsdAllocationRead2D(const Context *rsc, const Allocation *alloc,
                          uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
                          uint32_t w, uint32_t h, void *data, size_t sizeBytes) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-
     uint32_t eSize = alloc->mHal.state.elementSizeBytes;
     uint32_t lineSize = eSize * w;
 
-    if (drv->lod[0].mallocPtr) {
+    if (alloc->mHal.drvState.lod[0].mallocPtr) {
         uint8_t *dst = static_cast<uint8_t *>(data);
         const uint8_t *src = GetOffsetPtr(alloc, xoff, yoff, lod, face);
 
         for (uint32_t line=yoff; line < (yoff+h); line++) {
             memcpy(dst, src, lineSize);
             dst += lineSize;
-            src += drv->lod[lod].stride;
+            src += alloc->mHal.drvState.lod[lod].stride;
         }
     } else {
         ALOGE("Add code to readback from non-script memory");
@@ -664,8 +651,7 @@
 
 void * rsdAllocationLock1D(const android::renderscript::Context *rsc,
                           const android::renderscript::Allocation *alloc) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    return drv->lod[0].mallocPtr;
+    return alloc->mHal.drvState.lod[0].mallocPtr;
 }
 
 void rsdAllocationUnlock1D(const android::renderscript::Context *rsc,
@@ -767,9 +753,8 @@
 }
 
 static void mip565(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    uint32_t w = drv->lod[lod + 1].dimX;
-    uint32_t h = drv->lod[lod + 1].dimY;
+    uint32_t w = alloc->mHal.drvState.lod[lod + 1].dimX;
+    uint32_t h = alloc->mHal.drvState.lod[lod + 1].dimY;
 
     for (uint32_t y=0; y < h; y++) {
         uint16_t *oPtr = (uint16_t *)GetOffsetPtr(alloc, 0, y, lod + 1, face);
@@ -786,9 +771,8 @@
 }
 
 static void mip8888(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    uint32_t w = drv->lod[lod + 1].dimX;
-    uint32_t h = drv->lod[lod + 1].dimY;
+    uint32_t w = alloc->mHal.drvState.lod[lod + 1].dimX;
+    uint32_t h = alloc->mHal.drvState.lod[lod + 1].dimY;
 
     for (uint32_t y=0; y < h; y++) {
         uint32_t *oPtr = (uint32_t *)GetOffsetPtr(alloc, 0, y, lod + 1, face);
@@ -805,9 +789,8 @@
 }
 
 static void mip8(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    uint32_t w = drv->lod[lod + 1].dimX;
-    uint32_t h = drv->lod[lod + 1].dimY;
+    uint32_t w = alloc->mHal.drvState.lod[lod + 1].dimX;
+    uint32_t h = alloc->mHal.drvState.lod[lod + 1].dimY;
 
     for (uint32_t y=0; y < h; y++) {
         uint8_t *oPtr = GetOffsetPtr(alloc, 0, y, lod + 1, face);
@@ -824,8 +807,7 @@
 }
 
 void rsdAllocationGenerateMipmaps(const Context *rsc, const Allocation *alloc) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-    if(!drv->lod[0].mallocPtr) {
+    if(!alloc->mHal.drvState.lod[0].mallocPtr) {
         return;
     }
     uint32_t numFaces = alloc->getType()->getDimFaces() ? 6 : 1;
diff --git a/driver/rsdAllocation.h b/driver/rsdAllocation.h
index e6488b9..d2ecc9a 100644
--- a/driver/rsdAllocation.h
+++ b/driver/rsdAllocation.h
@@ -21,6 +21,8 @@
 #include <rsRuntime.h>
 #include <rsAllocation.h>
 
+#include "../cpu_ref/rsd_cpu.h"
+
 #include <GLES/gl.h>
 #include <GLES2/gl2.h>
 
@@ -49,19 +51,6 @@
     RsdFrameBufferObj * readBackFBO;
     ANativeWindow *wnd;
     ANativeWindowBuffer *wndBuffer;
-
-    struct LodState {
-        void * mallocPtr;
-        size_t stride;
-        uint32_t dimX;
-        uint32_t dimY;
-        uint32_t dimZ;
-    } lod[android::renderscript::Allocation::MAX_LOD];
-    size_t faceOffset;
-    uint32_t lodCount;
-    uint32_t faceCount;
-
-
 };
 
 GLenum rsdTypeToGLType(RsDataType t);
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index ddcaac8..436b9b2 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -14,17 +14,12 @@
  * limitations under the License.
  */
 
+#include "../cpu_ref/rsd_cpu.h"
+
 #include "rsdCore.h"
 
-#include <bcc/BCCContext.h>
-#include <bcc/Renderscript/RSCompilerDriver.h>
-#include <bcc/Renderscript/RSExecutable.h>
-#include <bcc/Renderscript/RSInfo.h>
-
 #include "rsdBcc.h"
-#include "rsdRuntime.h"
 #include "rsdAllocation.h"
-#include "rsdIntrinsics.h"
 
 #include "rsContext.h"
 #include "rsElement.h"
@@ -38,15 +33,6 @@
 using namespace android::renderscript;
 
 
-static Script * setTLS(Script *sc) {
-    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(rsdgThreadTLSKey);
-    rsAssert(tls);
-    Script *old = tls->mScript;
-    tls->mScript = sc;
-    return old;
-}
-
-
 bool rsdScriptInit(const Context *rsc,
                      ScriptC *script,
                      char const *resName,
@@ -54,358 +40,26 @@
                      uint8_t const *bitcode,
                      size_t bitcodeSize,
                      uint32_t flags) {
-    //ALOGE("rsdScriptCreate %p %p %p %p %i %i %p", rsc, resName, cacheDir, bitcode, bitcodeSize, flags, lookupFunc);
-    //ALOGE("rsdScriptInit %p %p", rsc, script);
-
-    pthread_mutex_lock(&rsdgInitMutex);
-
-    bcc::RSExecutable *exec;
-    const bcc::RSInfo *info;
-    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
-    if (drv == NULL) {
-        goto error;
+    RsdHal *dc = (RsdHal *)rsc->mHal.drv;
+    RsdCpuReference::CpuScript * cs = dc->mCpuRef->createScript(script, resName, cacheDir,
+                                                                bitcode, bitcodeSize, flags);
+    if (cs == NULL) {
+        return false;
     }
-    script->mHal.drv = drv;
-
-    drv->mCompilerContext = NULL;
-    drv->mCompilerDriver = NULL;
-    drv->mExecutable = NULL;
-
-    drv->mCompilerContext = new bcc::BCCContext();
-    if (drv->mCompilerContext == NULL) {
-        ALOGE("bcc: FAILS to create compiler context (out of memory)");
-        goto error;
-    }
-
-    drv->mCompilerDriver = new bcc::RSCompilerDriver();
-    if (drv->mCompilerDriver == NULL) {
-        ALOGE("bcc: FAILS to create compiler driver (out of memory)");
-        goto error;
-    }
-
-    script->mHal.info.isThreadable = true;
-
-    drv->mCompilerDriver->setRSRuntimeLookupFunction(rsdLookupRuntimeStub);
-    drv->mCompilerDriver->setRSRuntimeLookupContext(script);
-
-    exec = drv->mCompilerDriver->build(*drv->mCompilerContext,
-                                       cacheDir, resName,
-                                       (const char *)bitcode, bitcodeSize,
-                                       NULL);
-
-    if (exec == NULL) {
-        ALOGE("bcc: FAILS to prepare executable for '%s'", resName);
-        goto error;
-    }
-
-    drv->mExecutable = exec;
-
-    exec->setThreadable(script->mHal.info.isThreadable);
-    if (!exec->syncInfo()) {
-        ALOGW("bcc: FAILS to synchronize the RS info file to the disk");
-    }
-
-    drv->mRoot = reinterpret_cast<int (*)()>(exec->getSymbolAddress("root"));
-    drv->mRootExpand =
-        reinterpret_cast<int (*)()>(exec->getSymbolAddress("root.expand"));
-    drv->mInit = reinterpret_cast<void (*)()>(exec->getSymbolAddress("init"));
-    drv->mFreeChildren =
-        reinterpret_cast<void (*)()>(exec->getSymbolAddress(".rs.dtor"));
-
-    info = &drv->mExecutable->getInfo();
-    // Copy info over to runtime
-    script->mHal.info.exportedFunctionCount = info->getExportFuncNames().size();
-    script->mHal.info.exportedVariableCount = info->getExportVarNames().size();
-    script->mHal.info.exportedPragmaCount = info->getPragmas().size();
-    script->mHal.info.exportedPragmaKeyList =
-        const_cast<const char**>(exec->getPragmaKeys().array());
-    script->mHal.info.exportedPragmaValueList =
-        const_cast<const char**>(exec->getPragmaValues().array());
-
-    if (drv->mRootExpand) {
-        script->mHal.info.root = drv->mRootExpand;
-    } else {
-        script->mHal.info.root = drv->mRoot;
-    }
-
-    if (script->mHal.info.exportedVariableCount) {
-        drv->mBoundAllocs = new Allocation *[script->mHal.info.exportedVariableCount];
-        memset(drv->mBoundAllocs, 0, sizeof(void *) * script->mHal.info.exportedVariableCount);
-    }
-
-    pthread_mutex_unlock(&rsdgInitMutex);
+    script->mHal.drv = cs;
+    cs->populateScript(script);
     return true;
-
-error:
-
-    pthread_mutex_unlock(&rsdgInitMutex);
-    if (drv) {
-        delete drv->mCompilerContext;
-        delete drv->mCompilerDriver;
-        delete drv->mExecutable;
-        delete[] drv->mBoundAllocs;
-        free(drv);
-    }
-    script->mHal.drv = NULL;
-    return false;
-
 }
 
 bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
-    pthread_mutex_lock(&rsdgInitMutex);
-
-    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
-    if (drv == NULL) {
-        goto error;
+    RsdHal *dc = (RsdHal *)rsc->mHal.drv;
+    RsdCpuReference::CpuScript * cs = dc->mCpuRef->createIntrinsic(s, iid, e);
+    if (cs == NULL) {
+        return false;
     }
-    s->mHal.drv = drv;
-    drv->mIntrinsicID = iid;
-    drv->mIntrinsicData = rsdIntrinsic_Init(rsc, s, iid, &drv->mIntrinsicFuncs);
-    s->mHal.info.isThreadable = true;
-
-    pthread_mutex_unlock(&rsdgInitMutex);
+    s->mHal.drv = cs;
+    cs->populateScript(s);
     return true;
-
-error:
-    pthread_mutex_unlock(&rsdgInitMutex);
-    return false;
-}
-
-typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
-
-static void wc_xy(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
-    uint32_t sig = mtls->sig;
-
-#if defined(ARCH_ARM_RS_USE_CACHED_SCANLINE_WRITE)
-    unsigned char buf[1024 * 8];
-#endif
-
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd = yStart + mtls->mSliceSize;
-        yEnd = rsMin(yEnd, mtls->yEnd);
-        if (yEnd <= yStart) {
-            return;
-        }
-
-        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-#if defined(ARCH_ARM_RS_USE_CACHED_SCANLINE_WRITE)
-        if (mtls->fep.yStrideOut < sizeof(buf)) {
-            p.out = buf;
-            for (p.y = yStart; p.y < yEnd; p.y++) {
-                p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y);
-                fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-                memcpy(mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y), buf, mtls->fep.yStrideOut);
-            }
-        } else
-#endif
-            {
-            for (p.y = yStart; p.y < yEnd; p.y++) {
-                p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
-                        (mtls->fep.eStrideOut * mtls->xStart);
-                p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
-                       (mtls->fep.eStrideIn * mtls->xStart);
-                fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-            }
-        }
-    }
-}
-
-static void wc_x(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
-    uint32_t sig = mtls->sig;
-
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd = xStart + mtls->mSliceSize;
-        xEnd = rsMin(xEnd, mtls->xEnd);
-        if (xEnd <= xStart) {
-            return;
-        }
-
-        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
-        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-    }
-}
-
-void rsdScriptInvokeForEachMtlsSetup(const Context *rsc,
-                                     const Allocation * ain,
-                                     Allocation * aout,
-                                     const void * usr,
-                                     uint32_t usrLen,
-                                     const RsScriptCall *sc,
-                                     MTLaunchStruct *mtls) {
-
-    memset(mtls, 0, sizeof(MTLaunchStruct));
-
-    if (ain) {
-        mtls->fep.dimX = ain->getType()->getDimX();
-        mtls->fep.dimY = ain->getType()->getDimY();
-        mtls->fep.dimZ = ain->getType()->getDimZ();
-        //mtls->dimArray = ain->getType()->getDimArray();
-    } else if (aout) {
-        mtls->fep.dimX = aout->getType()->getDimX();
-        mtls->fep.dimY = aout->getType()->getDimY();
-        mtls->fep.dimZ = aout->getType()->getDimZ();
-        //mtls->dimArray = aout->getType()->getDimArray();
-    } else {
-        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
-
-    if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls->fep.dimX);
-        rsAssert(sc->xEnd <= mtls->fep.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls->fep.dimY);
-        rsAssert(sc->yEnd <= mtls->fep.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
-    }
-
-    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
-
-    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
-    Context *mrsc = (Context *)rsc;
-    mtls->rsc = mrsc;
-    mtls->ain = ain;
-    mtls->aout = aout;
-    mtls->fep.usr = usr;
-    mtls->fep.usrLen = usrLen;
-    mtls->mSliceSize = 10;
-    mtls->mSliceNum = 0;
-
-    mtls->fep.ptrIn = NULL;
-    mtls->fep.eStrideIn = 0;
-
-    if (ain) {
-        DrvAllocation *aindrv = (DrvAllocation *)ain->mHal.drv;
-        mtls->fep.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
-        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls->fep.yStrideIn = aindrv->lod[0].stride;
-    }
-
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        DrvAllocation *aoutdrv = (DrvAllocation *)aout->mHal.drv;
-        mtls->fep.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aoutdrv->lod[0].stride;
-    }
-}
-
-void rsdScriptLaunchThreads(const Context *rsc,
-                            bool isThreadable,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            uint32_t usrLen,
-                            const RsScriptCall *sc,
-                            MTLaunchStruct *mtls) {
-
-    Context *mrsc = (Context *)rsc;
-    RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
-
-    if ((dc->mWorkers.mCount >= 1) && isThreadable && !dc->mInForEach) {
-        const size_t targetByteChunk = 16 * 1024;
-        dc->mInForEach = true;
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
-
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
-
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
-            }
-
-            rsdLaunchThreads(mrsc, wc_xy, mtls);
-        } else {
-            uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
-
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
-
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
-            }
-
-            rsdLaunchThreads(mrsc, wc_x, mtls);
-        }
-        dc->mInForEach = false;
-
-        //ALOGE("launch 1");
-    } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
-
-        //ALOGE("launch 3");
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
-                                      mtls->fep.dimY * p.z + p.y;
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
-                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
-                           (mtls->fep.eStrideIn * mtls->xStart);
-                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-                }
-            }
-        }
-    }
 }
 
 void rsdScriptInvokeForEach(const Context *rsc,
@@ -417,237 +71,69 @@
                             uint32_t usrLen,
                             const RsScriptCall *sc) {
 
-    RsdHal * dc = (RsdHal *)rsc->mHal.drv;
-
-    MTLaunchStruct mtls;
-    rsdScriptInvokeForEachMtlsSetup(rsc, ain, aout, usr, usrLen, sc, &mtls);
-    mtls.script = s;
-    mtls.fep.slot = slot;
-
-    DrvScript *drv = (DrvScript *)s->mHal.drv;
-    if (drv->mIntrinsicID) {
-        mtls.kernel = (void (*)())drv->mIntrinsicFuncs.root;
-        mtls.fep.usr = drv->mIntrinsicData;
-    } else {
-        rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
-        mtls.kernel = reinterpret_cast<ForEachFunc_t>(
-                          drv->mExecutable->getExportForeachFuncAddrs()[slot]);
-        rsAssert(mtls.kernel != NULL);
-        mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
-    }
-
-
-    Script * oldTLS = setTLS(s);
-    rsdScriptLaunchThreads(rsc, s->mHal.info.isThreadable, ain, aout, usr, usrLen, sc, &mtls);
-    setTLS(oldTLS);
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->invokeForEach(slot, ain, aout, usr, usrLen, sc);
 }
 
 
-int rsdScriptInvokeRoot(const Context *dc, Script *script) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    Script * oldTLS = setTLS(script);
-    int ret = drv->mRoot();
-    setTLS(oldTLS);
-
-    return ret;
+int rsdScriptInvokeRoot(const Context *dc, Script *s) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    return cs->invokeRoot();
 }
 
-void rsdScriptInvokeInit(const Context *dc, Script *script) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    if (drv->mInit) {
-        drv->mInit();
-    }
+void rsdScriptInvokeInit(const Context *dc, Script *s) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->invokeInit();
 }
 
-void rsdScriptInvokeFreeChildren(const Context *dc, Script *script) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    if (drv->mFreeChildren) {
-        drv->mFreeChildren();
-    }
+void rsdScriptInvokeFreeChildren(const Context *dc, Script *s) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->invokeFreeChildren();
 }
 
-void rsdScriptInvokeFunction(const Context *dc, Script *script,
+void rsdScriptInvokeFunction(const Context *dc, Script *s,
                             uint32_t slot,
                             const void *params,
                             size_t paramLength) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-    //ALOGE("invoke %p %p %i %p %i", dc, script, slot, params, paramLength);
-
-    Script * oldTLS = setTLS(script);
-    reinterpret_cast<void (*)(const void *, uint32_t)>(
-        drv->mExecutable->getExportFuncAddrs()[slot])(params, paramLength);
-    setTLS(oldTLS);
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->invokeFunction(slot, params, paramLength);
 }
 
-void rsdScriptSetGlobalVar(const Context *dc, const Script *script,
+void rsdScriptSetGlobalVar(const Context *dc, const Script *s,
                            uint32_t slot, void *data, size_t dataLength) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-    //rsAssert(!script->mFieldIsObject[slot]);
-    //ALOGE("setGlobalVar %p %p %i %p %i", dc, script, slot, data, dataLength);
-
-    if (drv->mIntrinsicID) {
-        drv->mIntrinsicFuncs.setVar(dc, script, drv->mIntrinsicData, slot, data, dataLength);
-        return;
-    }
-
-    int32_t *destPtr = reinterpret_cast<int32_t *>(
-                          drv->mExecutable->getExportVarAddrs()[slot]);
-    if (!destPtr) {
-        //ALOGV("Calling setVar on slot = %i which is null", slot);
-        return;
-    }
-
-    memcpy(destPtr, data, dataLength);
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->setGlobalVar(slot, data, dataLength);
 }
 
-void rsdScriptSetGlobalVarWithElemDims(
-        const android::renderscript::Context *dc,
-        const android::renderscript::Script *script,
-        uint32_t slot, void *data, size_t dataLength,
-        const android::renderscript::Element *elem,
-        const size_t *dims, size_t dimLength) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    int32_t *destPtr = reinterpret_cast<int32_t *>(
-        drv->mExecutable->getExportVarAddrs()[slot]);
-    if (!destPtr) {
-        //ALOGV("Calling setVar on slot = %i which is null", slot);
-        return;
-    }
-
-    // We want to look at dimension in terms of integer components,
-    // but dimLength is given in terms of bytes.
-    dimLength /= sizeof(int);
-
-    // Only a single dimension is currently supported.
-    rsAssert(dimLength == 1);
-    if (dimLength == 1) {
-        // First do the increment loop.
-        size_t stride = elem->getSizeBytes();
-        char *cVal = reinterpret_cast<char *>(data);
-        for (size_t i = 0; i < dims[0]; i++) {
-            elem->incRefs(cVal);
-            cVal += stride;
-        }
-
-        // Decrement loop comes after (to prevent race conditions).
-        char *oldVal = reinterpret_cast<char *>(destPtr);
-        for (size_t i = 0; i < dims[0]; i++) {
-            elem->decRefs(oldVal);
-            oldVal += stride;
-        }
-    }
-
-    memcpy(destPtr, data, dataLength);
+void rsdScriptSetGlobalVarWithElemDims(const Context *dc, const Script *s,
+                                       uint32_t slot, void *data, size_t dataLength,
+                                       const android::renderscript::Element *elem,
+                                       const size_t *dims, size_t dimLength) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->setGlobalVarWithElemDims(slot, data, dataLength, elem, dims, dimLength);
 }
 
-void rsdScriptSetGlobalBind(const Context *dc, const Script *script, uint32_t slot, Allocation *data) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    //rsAssert(!script->mFieldIsObject[slot]);
-    //ALOGE("setGlobalBind %p %p %i %p", dc, script, slot, data);
-
-    rsAssert(!drv->mIntrinsicID);
-
-    int32_t *destPtr = reinterpret_cast<int32_t *>(
-                          drv->mExecutable->getExportVarAddrs()[slot]);
-    if (!destPtr) {
-        //ALOGV("Calling setVar on slot = %i which is null", slot);
-        return;
-    }
-
-    void *ptr = NULL;
-    drv->mBoundAllocs[slot] = data;
-    if(data) {
-        DrvAllocation *allocDrv = (DrvAllocation *)data->mHal.drv;
-        ptr = allocDrv->lod[0].mallocPtr;
-    }
-    memcpy(destPtr, &ptr, sizeof(void *));
+void rsdScriptSetGlobalBind(const Context *dc, const Script *s, uint32_t slot, Allocation *data) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->setGlobalBind(slot, data);
 }
 
-void rsdScriptSetGlobalObj(const Context *dc, const Script *script, uint32_t slot, ObjectBase *data) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-    //rsAssert(script->mFieldIsObject[slot]);
-    //ALOGE("setGlobalObj %p %p %i %p", dc, script, slot, data);
-
-    if (drv->mIntrinsicID) {
-        drv->mIntrinsicFuncs.setVarObj(dc, script, drv->mIntrinsicData, slot,
-                                       static_cast<Allocation *>(data));
-        return;
-    }
-
-    int32_t *destPtr = reinterpret_cast<int32_t *>(
-                          drv->mExecutable->getExportVarAddrs()[slot]);
-    if (!destPtr) {
-        //ALOGV("Calling setVar on slot = %i which is null", slot);
-        return;
-    }
-
-    rsrSetObject(dc, script, (ObjectBase **)destPtr, data);
+void rsdScriptSetGlobalObj(const Context *dc, const Script *s, uint32_t slot, ObjectBase *data) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    cs->setGlobalObj(slot, data);
 }
 
-void rsdScriptDestroy(const Context *dc, Script *script) {
-    DrvScript *drv = (DrvScript *)script->mHal.drv;
-
-    if (drv == NULL) {
-        return;
-    }
-
-    if (drv->mExecutable) {
-        Vector<void *>::const_iterator var_addr_iter =
-            drv->mExecutable->getExportVarAddrs().begin();
-        Vector<void *>::const_iterator var_addr_end =
-            drv->mExecutable->getExportVarAddrs().end();
-
-        bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_iter =
-            drv->mExecutable->getInfo().getObjectSlots().begin();
-        bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_end =
-            drv->mExecutable->getInfo().getObjectSlots().end();
-
-        while ((var_addr_iter != var_addr_end) &&
-               (is_object_iter != is_object_end)) {
-            // The field address can be NULL if the script-side has optimized
-            // the corresponding global variable away.
-            ObjectBase **obj_addr =
-                reinterpret_cast<ObjectBase **>(*var_addr_iter);
-            if (*is_object_iter) {
-                if (*var_addr_iter != NULL) {
-                    rsrClearObject(dc, script, obj_addr);
-                }
-            }
-            var_addr_iter++;
-            is_object_iter++;
-        }
-    }
-
-    delete drv->mCompilerContext;
-    delete drv->mCompilerDriver;
-    delete drv->mExecutable;
-    delete[] drv->mBoundAllocs;
-    free(drv);
-    script->mHal.drv = NULL;
+void rsdScriptDestroy(const Context *dc, Script *s) {
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
+    delete cs;
+    s->mHal.drv = NULL;
 }
 
+
 Allocation * rsdScriptGetAllocationForPointer(const android::renderscript::Context *dc,
                                               const android::renderscript::Script *sc,
                                               const void *ptr) {
-    DrvScript *drv = (DrvScript *)sc->mHal.drv;
-    if (!ptr) {
-        return NULL;
-    }
-
-    for (uint32_t ct=0; ct < sc->mHal.info.exportedVariableCount; ct++) {
-        Allocation *a = drv->mBoundAllocs[ct];
-        if (!a) continue;
-        DrvAllocation *adrv = (DrvAllocation *)a->mHal.drv;
-        if (adrv->lod[0].mallocPtr == ptr) {
-            return a;
-        }
-    }
-    ALOGE("rsGetAllocation, failed to find %p", ptr);
-    return NULL;
+    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)sc->mHal.drv;
+    return cs->getAllocationForPointer(ptr);
 }
 
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index 4a42eb5..4c65c2a 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -20,12 +20,6 @@
 #include <rs_hal.h>
 #include <rsRuntime.h>
 
-namespace bcc {
-    class BCCContext;
-    class RSCompilerDriver;
-    class RSExecutable;
-}
-
 bool rsdScriptInit(const android::renderscript::Context *, android::renderscript::ScriptC *,
                    char const *resName, char const *cacheDir,
                    uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags);
@@ -92,92 +86,4 @@
                         const void *);
 
 
-typedef void (*outer_foreach_t)(
-    const android::renderscript::RsForEachStubParamStruct *,
-    uint32_t x1, uint32_t x2,
-    uint32_t instep, uint32_t outstep);
-
-typedef struct RsdIntriniscFuncs_rec {
-
-    void (*setVarObj)(const android::renderscript::Context *dc,
-                      const android::renderscript::Script *script,
-                      void * intrinsicData,
-                      uint32_t slot, android::renderscript::Allocation *data);
-    void (*setVar)(const android::renderscript::Context *dc,
-                   const android::renderscript::Script *script,
-                   void * intrinsicData,
-                   uint32_t slot, void *data, size_t dataLength);
-    void (*root)(const android::renderscript::RsForEachStubParamStruct *,
-                 uint32_t x1, uint32_t x2, uint32_t instep, uint32_t outstep);
-
-    void (*destroy)(const android::renderscript::Context *dc,
-                    const android::renderscript::Script *script,
-                    void * intrinsicData);
-} RsdIntriniscFuncs_t;
-
-struct DrvScript {
-    RsScriptIntrinsicID mIntrinsicID;
-    int (*mRoot)();
-    int (*mRootExpand)();
-    void (*mInit)();
-    void (*mFreeChildren)();
-
-    bcc::BCCContext *mCompilerContext;
-    bcc::RSCompilerDriver *mCompilerDriver;
-    bcc::RSExecutable *mExecutable;
-
-    android::renderscript::Allocation **mBoundAllocs;
-    RsdIntriniscFuncs_t mIntrinsicFuncs;
-    void * mIntrinsicData;
-};
-
-typedef struct {
-    android::renderscript::RsForEachStubParamStruct fep;
-    uint32_t cpuIdx;
-
-} MTThreadStuct;
-
-typedef struct {
-    android::renderscript::RsForEachStubParamStruct fep;
-
-    android::renderscript::Context *rsc;
-    android::renderscript::Script *script;
-    ForEachFunc_t kernel;
-    uint32_t sig;
-    const android::renderscript::Allocation * ain;
-    android::renderscript::Allocation * aout;
-
-    uint32_t mSliceSize;
-    volatile int mSliceNum;
-
-    uint32_t xStart;
-    uint32_t xEnd;
-    uint32_t yStart;
-    uint32_t yEnd;
-    uint32_t zStart;
-    uint32_t zEnd;
-    uint32_t arrayStart;
-    uint32_t arrayEnd;
-} MTLaunchStruct;
-
-void rsdScriptLaunchThreads(const android::renderscript::Context *rsc,
-                            bool isThreadable,
-                            const android::renderscript::Allocation * ain,
-                            android::renderscript::Allocation * aout,
-                            const void * usr,
-                            uint32_t usrLen,
-                            const RsScriptCall *sc,
-                            MTLaunchStruct *mtls);
-
-void rsdScriptInvokeForEachMtlsSetup(const android::renderscript::Context *rsc,
-                                     const android::renderscript::Allocation * ain,
-                                     android::renderscript::Allocation * aout,
-                                     const void * usr,
-                                     uint32_t usrLen,
-                                     const RsScriptCall *sc,
-                                     MTLaunchStruct *mtls);
-
-
-
-
 #endif
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index caa5aa7..7f4060a 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "../cpu_ref/rsd_cpu.h"
+
 #include "rsdCore.h"
 #include "rsdAllocation.h"
 #include "rsdBcc.h"
@@ -154,71 +156,10 @@
 
 };
 
-pthread_key_t rsdgThreadTLSKey = 0;
-uint32_t rsdgThreadTLSKeyCount = 0;
-pthread_mutex_t rsdgInitMutex = PTHREAD_MUTEX_INITIALIZER;
+extern const RsdCpuReference::CpuSymbol * rsdLookupRuntimeStub(Context * pContext, char const* name);
 
-
-static void * HelperThreadProc(void *vrsc) {
-    Context *rsc = static_cast<Context *>(vrsc);
-    RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-
-
-    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
-
-    //ALOGV("RS helperThread starting %p idx=%i", rsc, idx);
-
-    dc->mWorkers.mLaunchSignals[idx].init();
-    dc->mWorkers.mNativeThreadId[idx] = gettid();
-
-    int status = pthread_setspecific(rsdgThreadTLSKey, &dc->mTlsStruct);
-    if (status) {
-        ALOGE("pthread_setspecific %i", status);
-    }
-
-#if 0
-    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
-    cpu_set_t cpuset;
-    memset(&cpuset, 0, sizeof(cpuset));
-    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
-    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
-              sizeof(cpuset), &cpuset);
-    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
-#endif
-
-    while (!dc->mExit) {
-        dc->mWorkers.mLaunchSignals[idx].wait();
-        if (dc->mWorkers.mLaunchCallback) {
-            // idx +1 is used because the calling thread is always worker 0.
-            dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
-        }
-        android_atomic_dec(&dc->mWorkers.mRunningCount);
-        dc->mWorkers.mCompleteSignal.set();
-    }
-
-    //ALOGV("RS helperThread exited %p idx=%i", rsc, idx);
-    return NULL;
-}
-
-void rsdLaunchThreads(Context *rsc, WorkerCallback_t cbk, void *data) {
-    RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-
-    dc->mWorkers.mLaunchData = data;
-    dc->mWorkers.mLaunchCallback = cbk;
-    android_atomic_release_store(dc->mWorkers.mCount, &dc->mWorkers.mRunningCount);
-    for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) {
-        dc->mWorkers.mLaunchSignals[ct].set();
-    }
-
-    // We use the calling thread as one of the workers so we can start without
-    // the delay of the thread wakeup.
-    if (dc->mWorkers.mLaunchCallback) {
-       dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, 0);
-    }
-
-    while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) {
-        dc->mWorkers.mCompleteSignal.wait();
-    }
+static RsdCpuReference::CpuScript * LookupScript(Context *, const Script *s) {
+    return (RsdCpuReference::CpuScript *)s->mHal.drv;
 }
 
 extern "C" bool rsdHalInit(RsContext c, uint32_t version_major,
@@ -233,76 +174,23 @@
     }
     rsc->mHal.drv = dc;
 
-    pthread_mutex_lock(&rsdgInitMutex);
-    if (!rsdgThreadTLSKeyCount) {
-        int status = pthread_key_create(&rsdgThreadTLSKey, NULL);
-        if (status) {
-            ALOGE("Failed to init thread tls key.");
-            pthread_mutex_unlock(&rsdgInitMutex);
-            return false;
-        }
-    }
-    rsdgThreadTLSKeyCount++;
-    pthread_mutex_unlock(&rsdgInitMutex);
-
-    dc->mTlsStruct.mContext = rsc;
-    dc->mTlsStruct.mScript = NULL;
-    int status = pthread_setspecific(rsdgThreadTLSKey, &dc->mTlsStruct);
-    if (status) {
-        ALOGE("pthread_setspecific %i", status);
-    }
-
-
-    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
-    if(rsc->props.mDebugMaxThreads) {
-        cpu = rsc->props.mDebugMaxThreads;
-    }
-    if (cpu < 2) {
-        cpu = 0;
-    }
-    ALOGV("%p Launching thread(s), CPUs %i", rsc, cpu);
-
-    // Subtract one from the cpu count because we also use the command thread as a worker.
-    dc->mWorkers.mCount = (uint32_t)(cpu - 1);
-    dc->mWorkers.mThreadId = (pthread_t *) calloc(dc->mWorkers.mCount, sizeof(pthread_t));
-    dc->mWorkers.mNativeThreadId = (pid_t *) calloc(dc->mWorkers.mCount, sizeof(pid_t));
-    dc->mWorkers.mLaunchSignals = new Signal[dc->mWorkers.mCount];
-    dc->mWorkers.mLaunchCallback = NULL;
-
-    dc->mWorkers.mCompleteSignal.init();
-
-    android_atomic_release_store(dc->mWorkers.mCount, &dc->mWorkers.mRunningCount);
-    android_atomic_release_store(0, &dc->mWorkers.mLaunchCount);
-
-    pthread_attr_t threadAttr;
-    status = pthread_attr_init(&threadAttr);
-    if (status) {
-        ALOGE("Failed to init thread attribute.");
+    dc->mCpuRef = RsdCpuReference::create((Context *)c, version_major, version_minor,
+                                          &rsdLookupRuntimeStub, &LookupScript);
+    if (!dc->mCpuRef) {
+        ALOGE("RsdCpuReference::create for driver hal failed.");
+        free(dc);
         return false;
     }
 
-    for (uint32_t ct=0; ct < dc->mWorkers.mCount; ct++) {
-        status = pthread_create(&dc->mWorkers.mThreadId[ct], &threadAttr, HelperThreadProc, rsc);
-        if (status) {
-            dc->mWorkers.mCount = ct;
-            ALOGE("Created fewer than expected number of RS threads.");
-            break;
-        }
-    }
-    while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) {
-        usleep(100);
-    }
-
-    pthread_attr_destroy(&threadAttr);
     return true;
 }
 
 
 void SetPriority(const Context *rsc, int32_t priority) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-    for (uint32_t ct=0; ct < dc->mWorkers.mCount; ct++) {
-        setpriority(PRIO_PROCESS, dc->mWorkers.mNativeThreadId[ct], priority);
-    }
+
+    dc->mCpuRef->setPriority(priority);
+
     if (dc->mHasGraphics) {
         rsdGLSetPriority(rsc, priority);
     }
@@ -310,27 +198,11 @@
 
 void Shutdown(Context *rsc) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-
-    dc->mExit = true;
-    dc->mWorkers.mLaunchData = NULL;
-    dc->mWorkers.mLaunchCallback = NULL;
-    android_atomic_release_store(dc->mWorkers.mCount, &dc->mWorkers.mRunningCount);
-    for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) {
-        dc->mWorkers.mLaunchSignals[ct].set();
-    }
-    void *res;
-    for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) {
-        pthread_join(dc->mWorkers.mThreadId[ct], &res);
-    }
-    rsAssert(android_atomic_acquire_load(&dc->mWorkers.mRunningCount) == 0);
-
-    // Global structure cleanup.
-    pthread_mutex_lock(&rsdgInitMutex);
-    --rsdgThreadTLSKeyCount;
-    if (!rsdgThreadTLSKeyCount) {
-        pthread_key_delete(rsdgThreadTLSKey);
-    }
-    pthread_mutex_unlock(&rsdgInitMutex);
-
+    // Destroy the CPU reference backend created in rsdHalInit.
+    delete dc->mCpuRef;
+    // Release the HAL state as well (rsdHalInit's failure path frees it with
+    // free()); otherwise every context teardown leaks the RsdHal struct.
+    free(dc);
+    rsc->mHal.drv = NULL;
 }
 
diff --git a/driver/rsdCore.h b/driver/rsdCore.h
index 92e7c7f..0a46460 100644
--- a/driver/rsdCore.h
+++ b/driver/rsdCore.h
@@ -19,6 +19,8 @@
 
 #include <rs_hal.h>
 
+#include "../cpu_ref/rsd_cpu.h"
+
 #include "rsMutex.h"
 #include "rsSignal.h"
 
@@ -28,12 +30,6 @@
 typedef void (* ForEachFunc_t)(void);
 typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
 
-typedef struct RsdSymbolTableRec {
-    const char * mName;
-    void * mPtr;
-    bool threadable;
-} RsdSymbolTable;
-
 typedef struct ScriptTLSStructRec {
     android::renderscript::Context * mContext;
     android::renderscript::Script * mScript;
@@ -43,33 +39,13 @@
     uint32_t version_major;
     uint32_t version_minor;
     bool mHasGraphics;
-    bool mInForEach;
-
-    struct Workers {
-        volatile int mRunningCount;
-        volatile int mLaunchCount;
-        uint32_t mCount;
-        pthread_t *mThreadId;
-        pid_t *mNativeThreadId;
-        android::renderscript::Signal mCompleteSignal;
-
-        android::renderscript::Signal *mLaunchSignals;
-        WorkerCallback_t mLaunchCallback;
-        void *mLaunchData;
-    };
-    Workers mWorkers;
-    bool mExit;
 
     ScriptTLSStruct mTlsStruct;
+    android::renderscript::RsdCpuReference *mCpuRef;
 
     RsdGL gl;
 } RsdHal;
 
-extern pthread_key_t rsdgThreadTLSKey;
-extern uint32_t rsdgThreadTLSKeyCount;
-extern pthread_mutex_t rsdgInitMutex;
-
-
 void rsdLaunchThreads(android::renderscript::Context *rsc, WorkerCallback_t cbk, void *data);
 
 #endif
diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp
deleted file mode 100644
index cfe0333..0000000
--- a/driver/rsdIntrinsicColorMatrix.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-struct ConvolveParams {
-    float fp[16];
-    short ip[16];
-    bool use3x3;
-    bool useDot;
-};
-
-static void ColorMatrix_SetVar(const Context *dc, const Script *script, void * intrinsicData,
-                               uint32_t slot, void *data, size_t dataLength) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-
-    rsAssert(slot == 0);
-    memcpy (cp->fp, data, dataLength);
-    for(int ct=0; ct < 16; ct++) {
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-
-    if ((cp->ip[3] == 0) && (cp->ip[7] == 0) && (cp->ip[11] == 0) &&
-        (cp->ip[12] == 0) && (cp->ip[13] == 0) && (cp->ip[14] == 0) &&
-        (cp->ip[15] == 255)) {
-        cp->use3x3 = true;
-
-        if ((cp->ip[0] == cp->ip[1]) && (cp->ip[0] == cp->ip[2]) &&
-            (cp->ip[4] == cp->ip[5]) && (cp->ip[4] == cp->ip[6]) &&
-            (cp->ip[8] == cp->ip[9]) && (cp->ip[8] == cp->ip[10])) {
-            cp->useDot = true;
-        }
-    }
-}
-
-extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
-
-static void One(const RsForEachStubParamStruct *p, uchar4 *out,
-                const uchar4 *py, const float* coeff) {
-    float4 i = convert_float4(py[0]);
-
-    float4 sum;
-    sum.x = i.x * coeff[0] +
-            i.y * coeff[4] +
-            i.z * coeff[8] +
-            i.w * coeff[12];
-    sum.y = i.x * coeff[1] +
-            i.y * coeff[5] +
-            i.z * coeff[9] +
-            i.w * coeff[13];
-    sum.z = i.x * coeff[2] +
-            i.y * coeff[6] +
-            i.z * coeff[10] +
-            i.w * coeff[14];
-    sum.w = i.x * coeff[3] +
-            i.y * coeff[7] +
-            i.z * coeff[11] +
-            i.w * coeff[15];
-
-    sum.x = sum.x < 0 ? 0 : (sum.x > 255 ? 255 : sum.x);
-    sum.y = sum.y < 0 ? 0 : (sum.y > 255 ? 255 : sum.y);
-    sum.z = sum.z < 0 ? 0 : (sum.z > 255 ? 255 : sum.z);
-    sum.w = sum.w < 0 ? 0 : (sum.w > 255 ? 255 : sum.w);
-
-    *out = convert_uchar4(sum);
-}
-
-static void ColorMatrix_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1) >> 2;
-        if(len > 0) {
-            if (cp->use3x3) {
-                if (cp->useDot) {
-                    rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
-                } else {
-                    rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
-                }
-            } else {
-                rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
-            }
-            x1 += len << 2;
-            out += len << 2;
-            in += len << 2;
-        }
-#endif
-
-        while(x1 != x2) {
-            One(p, out++, in++, cp->fp);
-            x1++;
-        }
-    }
-}
-
-void * rsdIntrinsic_InitColorMatrix(const android::renderscript::Context *dc,
-                                    android::renderscript::Script *script,
-                                    RsdIntriniscFuncs_t *funcs) {
-
-    script->mHal.info.exportedVariableCount = 1;
-    funcs->setVar = ColorMatrix_SetVar;
-    funcs->root = ColorMatrix_uchar4;
-
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    cp->fp[0] = 1.f;
-    cp->fp[5] = 1.f;
-    cp->fp[10] = 1.f;
-    cp->fp[15] = 1.f;
-    for(int ct=0; ct < 16; ct++) {
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-    return cp;
-}
-
-
diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp
deleted file mode 100644
index 55f4360..0000000
--- a/driver/rsdIntrinsicConvolve3x3.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-struct ConvolveParams {
-    float fp[16];
-    short ip[16];
-    ObjectBaseRef<Allocation> alloc;
-};
-
-static void Convolve3x3_Bind(const Context *dc, const Script *script,
-                             void * intrinsicData, uint32_t slot, Allocation *data) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-    rsAssert(slot == 1);
-    cp->alloc.set(data);
-}
-
-static void Convolve3x3_SetVar(const Context *dc, const Script *script, void * intrinsicData,
-                               uint32_t slot, void *data, size_t dataLength) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-
-    rsAssert(slot == 0);
-    memcpy (cp->fp, data, dataLength);
-    for(int ct=0; ct < 9; ct++) {
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-}
-
-extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, const void *y2, const short *coef, uint32_t count);
-
-
-static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
-                        const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
-                        const float* coeff) {
-
-    uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX);
-
-    float4 px = convert_float4(py0[x1]) * coeff[0] +
-                convert_float4(py0[x]) * coeff[1] +
-                convert_float4(py0[x2]) * coeff[2] +
-                convert_float4(py1[x1]) * coeff[3] +
-                convert_float4(py1[x]) * coeff[4] +
-                convert_float4(py1[x2]) * coeff[5] +
-                convert_float4(py2[x1]) * coeff[6] +
-                convert_float4(py2[x]) * coeff[7] +
-                convert_float4(py2[x2]) * coeff[8];
-
-    px = clamp(px, 0.f, 255.f);
-    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
-    *out = o;
-}
-
-static void Convolve3x3_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
-
-    if (!cp->alloc.get()) {
-        ALOGE("Convolve3x3 executed without input, skipping");
-        return;
-    }
-    DrvAllocation *din = (DrvAllocation *)cp->alloc->mHal.drv;
-    const uchar *pin = (const uchar *)din->lod[0].mallocPtr;
-
-    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
-    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
-    const uchar4 *py0 = (const uchar4 *)(pin + din->lod[0].stride * y2);
-    const uchar4 *py1 = (const uchar4 *)(pin + din->lod[0].stride * p->y);
-    const uchar4 *py2 = (const uchar4 *)(pin + din->lod[0].stride * y1);
-
-    uchar4 *out = (uchar4 *)p->out;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-    if(x1 == 0) {
-        ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);
-        x1 ++;
-        out++;
-    }
-
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 1;
-        if(len > 0) {
-            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
-            x1 += len << 1;
-            out += len << 1;
-        }
-#endif
-
-        while(x1 != x2) {
-            ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
-            out++;
-            x1++;
-        }
-    }
-}
-
-void * rsdIntrinsic_InitConvolve3x3(const android::renderscript::Context *dc,
-                                    android::renderscript::Script *script,
-                                    RsdIntriniscFuncs_t *funcs) {
-
-    script->mHal.info.exportedVariableCount = 2;
-    funcs->setVarObj = Convolve3x3_Bind;
-    funcs->setVar = Convolve3x3_SetVar;
-    funcs->root = Convolve3x3_uchar4;
-
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    for(int ct=0; ct < 9; ct++) {
-        cp->fp[ct] = 1.f / 9.f;
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-    return cp;
-}
-
-
diff --git a/driver/rsdIntrinsicConvolve5x5.cpp b/driver/rsdIntrinsicConvolve5x5.cpp
deleted file mode 100644
index fc6b029..0000000
--- a/driver/rsdIntrinsicConvolve5x5.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-struct ConvolveParams {
-    float fp[28];
-    short ip[28];
-    ObjectBaseRef<Allocation> alloc;
-};
-
-static void Convolve5x5_Bind(const Context *dc, const Script *script,
-                             void * intrinsicData, uint32_t slot, Allocation *data) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-    rsAssert(slot == 1);
-    cp->alloc.set(data);
-}
-
-static void Convolve5x5_SetVar(const Context *dc, const Script *script, void * intrinsicData,
-                               uint32_t slot, void *data, size_t dataLength) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-
-    rsAssert(slot == 0);
-    memcpy (cp->fp, data, dataLength);
-    for(int ct=0; ct < 25; ct++) {
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-}
-
-
-static void One(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
-                const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
-                const float* coeff) {
-
-    uint32_t x0 = rsMax((int32_t)x-2, 0);
-    uint32_t x1 = rsMax((int32_t)x-1, 0);
-    uint32_t x2 = x;
-    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
-    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
-
-    float4 px = convert_float4(py0[x0]) * coeff[0] +
-                convert_float4(py0[x1]) * coeff[1] +
-                convert_float4(py0[x2]) * coeff[2] +
-                convert_float4(py0[x3]) * coeff[3] +
-                convert_float4(py0[x4]) * coeff[4] +
-
-                convert_float4(py1[x0]) * coeff[5] +
-                convert_float4(py1[x1]) * coeff[6] +
-                convert_float4(py1[x2]) * coeff[7] +
-                convert_float4(py1[x3]) * coeff[8] +
-                convert_float4(py1[x4]) * coeff[9] +
-
-                convert_float4(py2[x0]) * coeff[10] +
-                convert_float4(py2[x1]) * coeff[11] +
-                convert_float4(py2[x2]) * coeff[12] +
-                convert_float4(py2[x3]) * coeff[13] +
-                convert_float4(py2[x4]) * coeff[14] +
-
-                convert_float4(py3[x0]) * coeff[15] +
-                convert_float4(py3[x1]) * coeff[16] +
-                convert_float4(py3[x2]) * coeff[17] +
-                convert_float4(py3[x3]) * coeff[18] +
-                convert_float4(py3[x4]) * coeff[19] +
-
-                convert_float4(py4[x0]) * coeff[20] +
-                convert_float4(py4[x1]) * coeff[21] +
-                convert_float4(py4[x2]) * coeff[22] +
-                convert_float4(py4[x3]) * coeff[23] +
-                convert_float4(py4[x4]) * coeff[24];
-
-    px = clamp(px, 0.f, 255.f);
-    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
-    *out = o;
-}
-
-extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
-                                          const void *y2, const void *y3, const void *y4,
-                                          const short *coef, uint32_t count);
-
-static void Convolve5x5_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
-    if (!cp->alloc.get()) {
-        ALOGE("Convolve5x5 executed without input, skipping");
-        return;
-    }
-    DrvAllocation *din = (DrvAllocation *)cp->alloc->mHal.drv;
-    const uchar *pin = (const uchar *)din->lod[0].mallocPtr;
-
-    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
-    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
-    uint32_t y2 = p->y;
-    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
-    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
-
-    const uchar4 *py0 = (const uchar4 *)(pin + din->lod[0].stride * y0);
-    const uchar4 *py1 = (const uchar4 *)(pin + din->lod[0].stride * y1);
-    const uchar4 *py2 = (const uchar4 *)(pin + din->lod[0].stride * y2);
-    const uchar4 *py3 = (const uchar4 *)(pin + din->lod[0].stride * y3);
-    const uchar4 *py4 = (const uchar4 *)(pin + din->lod[0].stride * y4);
-
-    uchar4 *out = (uchar4 *)p->out;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    while((x1 < x2) && (x1 < 2)) {
-        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
-        out++;
-        x1++;
-    }
-
-#if defined(ARCH_ARM_HAVE_NEON)
-    if((x1 + 3) < x2) {
-        uint32_t len = (x2 - x1 - 3) >> 1;
-        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
-        out += len << 1;
-        x1 += len << 1;
-    }
-#endif
-
-    while(x1 < x2) {
-        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
-        out++;
-        x1++;
-    }
-}
-
-void * rsdIntrinsic_InitConvolve5x5(const android::renderscript::Context *dc,
-                                    android::renderscript::Script *script,
-                                    RsdIntriniscFuncs_t *funcs) {
-
-    script->mHal.info.exportedVariableCount = 2;
-    funcs->setVarObj = Convolve5x5_Bind;
-    funcs->setVar = Convolve5x5_SetVar;
-    funcs->root = Convolve5x5_uchar4;
-
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    for(int ct=0; ct < 25; ct++) {
-        cp->fp[ct] = 1.f / 25.f;
-        cp->ip[ct] = (short)(cp->fp[ct] * 255.f + 0.5f);
-    }
-    return cp;
-}
-
-
diff --git a/driver/rsdIntrinsicLUT.cpp b/driver/rsdIntrinsicLUT.cpp
deleted file mode 100644
index 818a132..0000000
--- a/driver/rsdIntrinsicLUT.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-struct ConvolveParams {
-    ObjectBaseRef<Allocation> lut;
-};
-
-static void LUT_Bind(const Context *dc, const Script *script,
-                             void * intrinsicData, uint32_t slot, Allocation *data) {
-    ConvolveParams *cp = (ConvolveParams *)intrinsicData;
-    rsAssert(slot == 0);
-    cp->lut.set(data);
-}
-
-static void LUT_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
-    ConvolveParams *cp = (ConvolveParams *)p->usr;
-    uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    DrvAllocation *din = (DrvAllocation *)cp->lut->mHal.drv;
-    const uchar *tr = (const uchar *)din->lod[0].mallocPtr;
-    const uchar *tg = &tr[256];
-    const uchar *tb = &tg[256];
-    const uchar *ta = &tb[256];
-
-    while (x1 < x2) {
-        uchar4 p = *in;
-        uchar4 o = {tr[p.x], tg[p.y], tb[p.z], ta[p.w]};
-        *out = o;
-        in++;
-        out++;
-        x1++;
-    }
-}
-
-void * rsdIntrinsic_InitLUT(const android::renderscript::Context *dc,
-                                    android::renderscript::Script *script,
-                                    RsdIntriniscFuncs_t *funcs) {
-
-    script->mHal.info.exportedVariableCount = 1;
-    funcs->setVarObj = LUT_Bind;
-    funcs->root = LUT_uchar4;
-    ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams));
-    return cp;
-}
-
-
diff --git a/driver/rsdIntrinsicYuvToRGB.cpp b/driver/rsdIntrinsicYuvToRGB.cpp
deleted file mode 100644
index b3fb059..0000000
--- a/driver/rsdIntrinsicYuvToRGB.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-#include "rsdIntrinsicInlines.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-struct YuvParams {
-    ObjectBaseRef<Allocation> alloc;
-};
-
-static void YuvToRGB_Bind(const Context *dc, const Script *script,
-                             void * intrinsicData, uint32_t slot, Allocation *data) {
-    YuvParams *cp = (YuvParams *)intrinsicData;
-    rsAssert(slot == 0);
-    cp->alloc.set(data);
-}
-
-
-
-static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    short Y = ((short)y) - 16;
-    short U = ((short)u) - 128;
-    short V = ((short)v) - 128;
-
-    short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    if(p.r < 0) {
-        p.r = 0;
-    }
-    if(p.r > 255) {
-        p.r = 255;
-    }
-    if(p.g < 0) {
-        p.g = 0;
-    }
-    if(p.g > 255) {
-        p.g = 255;
-    }
-    if(p.b < 0) {
-        p.b = 0;
-    }
-    if(p.b > 255) {
-        p.b = 255;
-    }
-
-    return (uchar4){p.r, p.g, p.b, p.a};
-}
-
-
-static short YuvCoeff[] = {
-    298, 409, -100, 516,   -208, 255, 0, 0,
-    16, 16, 16, 16,        16, 16, 16, 16,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    298, 298, 298, 298, 298, 298, 298, 298,
-    255, 255, 255, 255, 255, 255, 255, 255
-
-
-};
-
-extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
-
-static void YuvToRGB_uchar4(const RsForEachStubParamStruct *p,
-                                    uint32_t xstart, uint32_t xend,
-                                    uint32_t instep, uint32_t outstep) {
-    YuvParams *cp = (YuvParams *)p->usr;
-    if (!cp->alloc.get()) {
-        ALOGE("YuvToRGB executed without input, skipping");
-        return;
-    }
-    DrvAllocation *din = (DrvAllocation *)cp->alloc->mHal.drv;
-    const uchar *pin = (const uchar *)din->lod[0].mallocPtr;
-
-    const uchar *Y = pin + (p->y * p->dimX);
-    const uchar *uv = pin + (p->dimX * p->dimY);
-    uv += (p->y>>1) * p->dimX;
-
-    uchar4 *out = (uchar4 *)p->out;
-    uint32_t x1 = xstart;
-    uint32_t x2 = xend;
-
-    if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
-        int32_t len = (x2 - x1 - 1) >> 3;
-        if(len > 0) {
-            rsdIntrinsicYuv_K(out, Y, uv, len, YuvCoeff);
-            x1 += len << 3;
-            out += len << 3;
-        }
-#endif
-
-       // ALOGE("y %i  %i  %i", p->y, x1, x2);
-        while(x1 < x2) {
-            uchar u = uv[(x1 & 0xffffe) + 1];
-            uchar v = uv[(x1 & 0xffffe) + 0];
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
-            *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
-            out++;
-            x1++;
-        }
-    }
-}
-
-void * rsdIntrinsic_InitYuvToRGB(const android::renderscript::Context *dc,
-                                 android::renderscript::Script *script,
-                                 RsdIntriniscFuncs_t *funcs) {
-
-    script->mHal.info.exportedVariableCount = 1;
-    funcs->setVarObj = YuvToRGB_Bind;
-    funcs->root = YuvToRGB_uchar4;
-    YuvParams *cp = (YuvParams *)calloc(1, sizeof(YuvParams));
-    return cp;
-}
-
-
diff --git a/driver/rsdIntrinsics.cpp b/driver/rsdIntrinsics.cpp
deleted file mode 100644
index 0f747fa..0000000
--- a/driver/rsdIntrinsics.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rsdCore.h"
-#include "rsdIntrinsics.h"
-#include "rsdAllocation.h"
-
-using namespace android;
-using namespace android::renderscript;
-
-void * rsdIntrinsic_InitBlur(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitConvolve3x3(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitConvolve5x5(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitColorMatrix(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitLUT(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitYuvToRGB(const Context *, Script *, RsdIntriniscFuncs_t *);
-void * rsdIntrinsic_InitBlend(const Context *, Script *, RsdIntriniscFuncs_t *);
-
-static void SetVarObj(const Context *, const Script *, void *, uint32_t, Allocation *) {
-    rsAssert(!"Intrinsic_SetVarObj unexpectedly called");
-}
-
-static void SetVar(const Context *, const Script *, void *, uint32_t, void *, size_t) {
-    rsAssert(!"Intrinsic_Bind unexpectedly called");
-}
-
-static void Destroy(const Context *dc, const Script *script, void * intrinsicData) {
-    free(intrinsicData);
-}
-
-void * rsdIntrinsic_Init(const android::renderscript::Context *dc,
-                       android::renderscript::Script *script,
-                       RsScriptIntrinsicID iid,
-                       RsdIntriniscFuncs_t *funcs) {
-
-    funcs->setVarObj = SetVarObj;
-    funcs->setVar = SetVar;
-    funcs->destroy = Destroy;
-
-    switch(iid) {
-    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
-        return rsdIntrinsic_InitConvolve3x3(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
-        return rsdIntrinsic_InitConvolve5x5(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
-        return rsdIntrinsic_InitColorMatrix(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_LUT:
-        return rsdIntrinsic_InitLUT(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_BLUR:
-        return rsdIntrinsic_InitBlur(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
-        return rsdIntrinsic_InitYuvToRGB(dc, script, funcs);
-    case RS_SCRIPT_INTRINSIC_ID_BLEND:
-        return rsdIntrinsic_InitBlend(dc, script, funcs);
-
-    default:
-        return NULL;
-    }
-    return NULL;
-}
-
-
-
diff --git a/driver/rsdIntrinsics.h b/driver/rsdIntrinsics.h
deleted file mode 100644
index 221a81a..0000000
--- a/driver/rsdIntrinsics.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef RSD_INTRINSICS_H
-#define RSD_INTRINSICS_H
-
-#include <rs_hal.h>
-#include "rsdBcc.h"
-
-void * rsdIntrinsic_Init(const android::renderscript::Context *dc,
-                         android::renderscript::Script *script,
-                         RsScriptIntrinsicID id, RsdIntriniscFuncs_t *funcs);
-
-#endif // RSD_INTRINSICS_H
-
diff --git a/driver/rsdMeshObj.cpp b/driver/rsdMeshObj.cpp
index 92e02be..e8df21f 100644
--- a/driver/rsdMeshObj.cpp
+++ b/driver/rsdMeshObj.cpp
@@ -151,7 +151,7 @@
             mAttribs[ct].ptr = NULL;
         } else {
             mAttribs[ct].buffer = 0;
-            mAttribs[ct].ptr = (const uint8_t*)alloc->mHal.drvState.mallocPtrLOD0;
+            mAttribs[ct].ptr = (const uint8_t*)alloc->mHal.drvState.lod[0].mallocPtr;
         }
     }
 
@@ -172,7 +172,7 @@
         } else {
             RSD_CALL_GL(glBindBuffer, GL_ELEMENT_ARRAY_BUFFER, 0);
             RSD_CALL_GL(glDrawElements, mGLPrimitives[primIndex], len, GL_UNSIGNED_SHORT,
-                        idxAlloc->mHal.drvState.mallocPtrLOD0);
+                        idxAlloc->mHal.drvState.lod[0].mallocPtr);
         }
     } else {
         RSD_CALL_GL(glDrawArrays, mGLPrimitives[primIndex], start, len);
diff --git a/driver/rsdRuntime.h b/driver/rsdRuntime.h
deleted file mode 100644
index dc84032..0000000
--- a/driver/rsdRuntime.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef RSD_RUNTIME_STUBS_H
-#define RSD_RUNTIME_STUBS_H
-
-#include <rs_hal.h>
-
-#include "rsMutex.h"
-
-const RsdSymbolTable * rsdLookupSymbolMath(const char *sym);
-
-void* rsdLookupRuntimeStub(void* pContext, char const* name);
-
-#endif
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index 9bd1396..5141c9f 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,6 @@
 #include "rsdCore.h"
 #include "rsdBcc.h"
 
-#include "rsdRuntime.h"
 #include "rsdPath.h"
 #include "rsdAllocation.h"
 #include "rsdShaderCache.h"
@@ -36,11 +35,6 @@
 using namespace android;
 using namespace android::renderscript;
 
-#define GET_TLS()  ScriptTLSStruct * tls = \
-    (ScriptTLSStruct *)pthread_getspecific(rsdgThreadTLSKey); \
-    Context * rsc = tls->mContext; \
-    ScriptC * sc = (ScriptC *) tls->mScript
-
 typedef float float2 __attribute__((ext_vector_type(2)));
 typedef float float3 __attribute__((ext_vector_type(3)));
 typedef float float4 __attribute__((ext_vector_type(4)));
@@ -76,13 +70,13 @@
 
 
 static void SC_AllocationSyncAll2(Allocation *a, RsAllocationUsageType source) {
-    GET_TLS();
-    rsrAllocationSyncAll(rsc, sc, a, source);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrAllocationSyncAll(rsc, a, source);
 }
 
 static void SC_AllocationSyncAll(Allocation *a) {
-    GET_TLS();
-    rsrAllocationSyncAll(rsc, sc, a, RS_ALLOCATION_USAGE_SCRIPT);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrAllocationSyncAll(rsc, a, RS_ALLOCATION_USAGE_SCRIPT);
 }
 
 static void SC_AllocationCopy1DRange(Allocation *dstAlloc,
@@ -91,7 +85,7 @@
                                      uint32_t count,
                                      Allocation *srcAlloc,
                                      uint32_t srcOff, uint32_t srcMip) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsrAllocationCopy1DRange(rsc, dstAlloc, dstOff, dstMip, count,
                              srcAlloc, srcOff, srcMip);
 }
@@ -103,7 +97,7 @@
                                      Allocation *srcAlloc,
                                      uint32_t srcXoff, uint32_t srcYoff,
                                      uint32_t srcMip, uint32_t srcFace) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsrAllocationCopy2DRange(rsc, dstAlloc,
                              dstXoff, dstYoff, dstMip, dstFace,
                              width, height,
@@ -112,13 +106,13 @@
 }
 
 static void SC_AllocationIoSend(Allocation *alloc) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsdAllocationIoSend(rsc, alloc);
 }
 
 
 static void SC_AllocationIoReceive(Allocation *alloc) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsdAllocationIoReceive(rsc, alloc);
 }
 
@@ -129,68 +123,68 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static void SC_BindTexture(ProgramFragment *pf, uint32_t slot, Allocation *a) {
-    GET_TLS();
-    rsrBindTexture(rsc, sc, pf, slot, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindTexture(rsc, pf, slot, a);
 }
 
 static void SC_BindVertexConstant(ProgramVertex *pv, uint32_t slot, Allocation *a) {
-    GET_TLS();
-    rsrBindConstant(rsc, sc, pv, slot, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindConstant(rsc, pv, slot, a);
 }
 
 static void SC_BindFragmentConstant(ProgramFragment *pf, uint32_t slot, Allocation *a) {
-    GET_TLS();
-    rsrBindConstant(rsc, sc, pf, slot, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindConstant(rsc, pf, slot, a);
 }
 
 static void SC_BindSampler(ProgramFragment *pf, uint32_t slot, Sampler *s) {
-    GET_TLS();
-    rsrBindSampler(rsc, sc, pf, slot, s);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindSampler(rsc, pf, slot, s);
 }
 
 static void SC_BindProgramStore(ProgramStore *ps) {
-    GET_TLS();
-    rsrBindProgramStore(rsc, sc, ps);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindProgramStore(rsc, ps);
 }
 
 static void SC_BindProgramFragment(ProgramFragment *pf) {
-    GET_TLS();
-    rsrBindProgramFragment(rsc, sc, pf);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindProgramFragment(rsc, pf);
 }
 
 static void SC_BindProgramVertex(ProgramVertex *pv) {
-    GET_TLS();
-    rsrBindProgramVertex(rsc, sc, pv);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindProgramVertex(rsc, pv);
 }
 
 static void SC_BindProgramRaster(ProgramRaster *pr) {
-    GET_TLS();
-    rsrBindProgramRaster(rsc, sc, pr);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindProgramRaster(rsc, pr);
 }
 
 static void SC_BindFrameBufferObjectColorTarget(Allocation *a, uint32_t slot) {
-    GET_TLS();
-    rsrBindFrameBufferObjectColorTarget(rsc, sc, a, slot);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindFrameBufferObjectColorTarget(rsc, a, slot);
 }
 
 static void SC_BindFrameBufferObjectDepthTarget(Allocation *a) {
-    GET_TLS();
-    rsrBindFrameBufferObjectDepthTarget(rsc, sc, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindFrameBufferObjectDepthTarget(rsc, a);
 }
 
 static void SC_ClearFrameBufferObjectColorTarget(uint32_t slot) {
-    GET_TLS();
-    rsrClearFrameBufferObjectColorTarget(rsc, sc, slot);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrClearFrameBufferObjectColorTarget(rsc, slot);
 }
 
 static void SC_ClearFrameBufferObjectDepthTarget(Context *, Script *) {
-    GET_TLS();
-    rsrClearFrameBufferObjectDepthTarget(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrClearFrameBufferObjectDepthTarget(rsc);
 }
 
 static void SC_ClearFrameBufferObjectTargets(Context *, Script *) {
-    GET_TLS();
-    rsrClearFrameBufferObjectTargets(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrClearFrameBufferObjectTargets(rsc);
 }
 
 
@@ -199,28 +193,28 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static void SC_VpLoadProjectionMatrix(const rsc_Matrix *m) {
-    GET_TLS();
-    rsrVpLoadProjectionMatrix(rsc, sc, m);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrVpLoadProjectionMatrix(rsc, m);
 }
 
 static void SC_VpLoadModelMatrix(const rsc_Matrix *m) {
-    GET_TLS();
-    rsrVpLoadModelMatrix(rsc, sc, m);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrVpLoadModelMatrix(rsc, m);
 }
 
 static void SC_VpLoadTextureMatrix(const rsc_Matrix *m) {
-    GET_TLS();
-    rsrVpLoadTextureMatrix(rsc, sc, m);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrVpLoadTextureMatrix(rsc, m);
 }
 
 static void SC_PfConstantColor(ProgramFragment *pf, float r, float g, float b, float a) {
-    GET_TLS();
-    rsrPfConstantColor(rsc, sc, pf, r, g, b, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrPfConstantColor(rsc, pf, r, g, b, a);
 }
 
 static void SC_VpGetProjectionMatrix(rsc_Matrix *m) {
-    GET_TLS();
-    rsrVpGetProjectionMatrix(rsc, sc, m);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrVpGetProjectionMatrix(rsc, m);
 }
 
 
@@ -232,7 +226,7 @@
                                  float x2, float y2, float z2, float u2, float v2,
                                  float x3, float y3, float z3, float u3, float v3,
                                  float x4, float y4, float z4, float u4, float v4) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
 
     if (!rsc->setupCheck()) {
         return;
@@ -266,7 +260,6 @@
                         float x2, float y2, float z2,
                         float x3, float y3, float z3,
                         float x4, float y4, float z4) {
-    GET_TLS();
     SC_DrawQuadTexCoords(x1, y1, z1, 0, 1,
                          x2, y2, z2, 1, 1,
                          x3, y3, z3, 1, 0,
@@ -274,7 +267,7 @@
 }
 
 static void SC_DrawSpriteScreenspace(float x, float y, float z, float w, float h) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
 
     ObjectBaseRef<const ProgramVertex> tmp(rsc->getProgramVertex());
     rsc->setProgramVertex(rsc->getDefaultProgramVertex());
@@ -292,38 +285,34 @@
 }
 
 static void SC_DrawRect(float x1, float y1, float x2, float y2, float z) {
-    GET_TLS();
-
     SC_DrawQuad(x1, y2, z, x2, y2, z, x2, y1, z, x1, y1, z);
-
 }
 
 static void SC_DrawPath(Path *p) {
-    GET_TLS();
-    //rsrDrawPath(rsc, sc, p);
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsdPathDraw(rsc, p);
 }
 
 static void SC_DrawMesh(Mesh *m) {
-    GET_TLS();
-    rsrDrawMesh(rsc, sc, m);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrDrawMesh(rsc, m);
 }
 
 static void SC_DrawMeshPrimitive(Mesh *m, uint32_t primIndex) {
-    GET_TLS();
-    rsrDrawMeshPrimitive(rsc, sc, m, primIndex);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrDrawMeshPrimitive(rsc, m, primIndex);
 }
 
 static void SC_DrawMeshPrimitiveRange(Mesh *m, uint32_t primIndex, uint32_t start, uint32_t len) {
-    GET_TLS();
-    rsrDrawMeshPrimitiveRange(rsc, sc, m, primIndex, start, len);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrDrawMeshPrimitiveRange(rsc, m, primIndex, start, len);
 }
 
 static void SC_MeshComputeBoundingBox(Mesh *m,
                                float *minX, float *minY, float *minZ,
                                float *maxX, float *maxY, float *maxZ) {
-    GET_TLS();
-    rsrMeshComputeBoundingBox(rsc, sc, m, minX, minY, minZ, maxX, maxY, maxZ);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrMeshComputeBoundingBox(rsc, m, minX, minY, minZ, maxX, maxY, maxZ);
 }
 
 
@@ -334,67 +323,67 @@
 
 
 static void SC_Color(float r, float g, float b, float a) {
-    GET_TLS();
-    rsrColor(rsc, sc, r, g, b, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrColor(rsc, r, g, b, a);
 }
 
 static void SC_Finish() {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
     rsdGLFinish(rsc);
 }
 
 static void SC_ClearColor(float r, float g, float b, float a) {
-    GET_TLS();
-    rsrPrepareClear(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrPrepareClear(rsc);
     rsdGLClearColor(rsc, r, g, b, a);
 }
 
 static void SC_ClearDepth(float v) {
-    GET_TLS();
-    rsrPrepareClear(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrPrepareClear(rsc);
     rsdGLClearDepth(rsc, v);
 }
 
 static uint32_t SC_GetWidth() {
-    GET_TLS();
-    return rsrGetWidth(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrGetWidth(rsc);
 }
 
 static uint32_t SC_GetHeight() {
-    GET_TLS();
-    return rsrGetHeight(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrGetHeight(rsc);
 }
 
 static void SC_DrawTextAlloc(Allocation *a, int x, int y) {
-    GET_TLS();
-    rsrDrawTextAlloc(rsc, sc, a, x, y);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrDrawTextAlloc(rsc, a, x, y);
 }
 
 static void SC_DrawText(const char *text, int x, int y) {
-    GET_TLS();
-    rsrDrawText(rsc, sc, text, x, y);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrDrawText(rsc, text, x, y);
 }
 
 static void SC_MeasureTextAlloc(Allocation *a,
                          int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
-    GET_TLS();
-    rsrMeasureTextAlloc(rsc, sc, a, left, right, top, bottom);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrMeasureTextAlloc(rsc, a, left, right, top, bottom);
 }
 
 static void SC_MeasureText(const char *text,
                     int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
-    GET_TLS();
-    rsrMeasureText(rsc, sc, text, left, right, top, bottom);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrMeasureText(rsc, text, left, right, top, bottom);
 }
 
 static void SC_BindFont(Font *f) {
-    GET_TLS();
-    rsrBindFont(rsc, sc, f);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrBindFont(rsc, f);
 }
 
 static void SC_FontColor(float r, float g, float b, float a) {
-    GET_TLS();
-    rsrFontColor(rsc, sc, r, g, b, a);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrFontColor(rsc, r, g, b, a);
 }
 
 
@@ -404,41 +393,42 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static void SC_SetObject(ObjectBase **dst, ObjectBase * src) {
-    GET_TLS();
-    rsrSetObject(rsc, sc, dst, src);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrSetObject(rsc, dst, src);
 }
 
 static void SC_ClearObject(ObjectBase **dst) {
-    GET_TLS();
-    rsrClearObject(rsc, sc, dst);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrClearObject(rsc, dst);
 }
 
 static bool SC_IsObject(const ObjectBase *src) {
-    GET_TLS();
-    return rsrIsObject(rsc, sc, src);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrIsObject(rsc, src);
 }
 
 
 
 
 static const Allocation * SC_GetAllocation(const void *ptr) {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
+    const Script *sc = RsdCpuReference::getTlsScript();
     return rsdScriptGetAllocationForPointer(rsc, sc, ptr);
 }
 
 static void SC_ForEach_SAA(Script *target,
                             Allocation *in,
                             Allocation *out) {
-    GET_TLS();
-    rsrForEach(rsc, sc, target, in, out, NULL, 0, NULL);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrForEach(rsc, target, in, out, NULL, 0, NULL);
 }
 
 static void SC_ForEach_SAAU(Script *target,
                             Allocation *in,
                             Allocation *out,
                             const void *usr) {
-    GET_TLS();
-    rsrForEach(rsc, sc, target, in, out, usr, 0, NULL);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrForEach(rsc, target, in, out, usr, 0, NULL);
 }
 
 static void SC_ForEach_SAAUS(Script *target,
@@ -446,8 +436,8 @@
                              Allocation *out,
                              const void *usr,
                              const RsScriptCall *call) {
-    GET_TLS();
-    rsrForEach(rsc, sc, target, in, out, usr, 0, call);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrForEach(rsc, target, in, out, usr, 0, call);
 }
 
 static void SC_ForEach_SAAUL(Script *target,
@@ -455,8 +445,8 @@
                              Allocation *out,
                              const void *usr,
                              uint32_t usrLen) {
-    GET_TLS();
-    rsrForEach(rsc, sc, target, in, out, usr, usrLen, NULL);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrForEach(rsc, target, in, out, usr, usrLen, NULL);
 }
 
 static void SC_ForEach_SAAULS(Script *target,
@@ -465,8 +455,8 @@
                               const void *usr,
                               uint32_t usrLen,
                               const RsScriptCall *call) {
-    GET_TLS();
-    rsrForEach(rsc, sc, target, in, out, usr, usrLen, call);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    rsrForEach(rsc, target, in, out, usr, usrLen, call);
 }
 
 
@@ -476,28 +466,29 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static float SC_GetDt() {
-    GET_TLS();
+    Context *rsc = RsdCpuReference::getTlsContext();
+    const Script *sc = RsdCpuReference::getTlsScript();
     return rsrGetDt(rsc, sc);
 }
 
 time_t SC_Time(time_t *timer) {
-    GET_TLS();
-    return rsrTime(rsc, sc, timer);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrTime(rsc, timer);
 }
 
 tm* SC_LocalTime(tm *local, time_t *timer) {
-    GET_TLS();
-    return rsrLocalTime(rsc, sc, local, timer);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrLocalTime(rsc, local, timer);
 }
 
 int64_t SC_UptimeMillis() {
-    GET_TLS();
-    return rsrUptimeMillis(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrUptimeMillis(rsc);
 }
 
 int64_t SC_UptimeNanos() {
-    GET_TLS();
-    return rsrUptimeNanos(rsc, sc);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrUptimeNanos(rsc);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -505,179 +496,25 @@
 //////////////////////////////////////////////////////////////////////////////
 
 static uint32_t SC_ToClient2(int cmdID, void *data, int len) {
-    GET_TLS();
-    return rsrToClient(rsc, sc, cmdID, data, len);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrToClient(rsc, cmdID, data, len);
 }
 
 static uint32_t SC_ToClient(int cmdID) {
-    GET_TLS();
-    return rsrToClient(rsc, sc, cmdID, NULL, 0);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrToClient(rsc, cmdID, NULL, 0);
 }
 
 static uint32_t SC_ToClientBlocking2(int cmdID, void *data, int len) {
-    GET_TLS();
-    return rsrToClientBlocking(rsc, sc, cmdID, data, len);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrToClientBlocking(rsc, cmdID, data, len);
 }
 
 static uint32_t SC_ToClientBlocking(int cmdID) {
-    GET_TLS();
-    return rsrToClientBlocking(rsc, sc, cmdID, NULL, 0);
+    Context *rsc = RsdCpuReference::getTlsContext();
+    return rsrToClientBlocking(rsc, cmdID, NULL, 0);
 }
 
-int SC_divsi3(int a, int b) {
-    return a / b;
-}
-
-int SC_modsi3(int a, int b) {
-    return a % b;
-}
-
-unsigned int SC_udivsi3(unsigned int a, unsigned int b) {
-    return a / b;
-}
-
-unsigned int SC_umodsi3(unsigned int a, unsigned int b) {
-    return a % b;
-}
-
-static void SC_debugF(const char *s, float f) {
-    ALOGD("%s %f, 0x%08x", s, f, *((int *) (&f)));
-}
-static void SC_debugFv2(const char *s, float f1, float f2) {
-    ALOGD("%s {%f, %f}", s, f1, f2);
-}
-static void SC_debugFv3(const char *s, float f1, float f2, float f3) {
-    ALOGD("%s {%f, %f, %f}", s, f1, f2, f3);
-}
-static void SC_debugFv4(const char *s, float f1, float f2, float f3, float f4) {
-    ALOGD("%s {%f, %f, %f, %f}", s, f1, f2, f3, f4);
-}
-static void SC_debugF2(const char *s, float2 f) {
-    ALOGD("%s {%f, %f}", s, f.x, f.y);
-}
-static void SC_debugF3(const char *s, float3 f) {
-    ALOGD("%s {%f, %f, %f}", s, f.x, f.y, f.z);
-}
-static void SC_debugF4(const char *s, float4 f) {
-    ALOGD("%s {%f, %f, %f, %f}", s, f.x, f.y, f.z, f.w);
-}
-static void SC_debugD(const char *s, double d) {
-    ALOGD("%s %f, 0x%08llx", s, d, *((long long *) (&d)));
-}
-static void SC_debugFM4v4(const char *s, const float *f) {
-    ALOGD("%s {%f, %f, %f, %f", s, f[0], f[4], f[8], f[12]);
-    ALOGD("%s  %f, %f, %f, %f", s, f[1], f[5], f[9], f[13]);
-    ALOGD("%s  %f, %f, %f, %f", s, f[2], f[6], f[10], f[14]);
-    ALOGD("%s  %f, %f, %f, %f}", s, f[3], f[7], f[11], f[15]);
-}
-static void SC_debugFM3v3(const char *s, const float *f) {
-    ALOGD("%s {%f, %f, %f", s, f[0], f[3], f[6]);
-    ALOGD("%s  %f, %f, %f", s, f[1], f[4], f[7]);
-    ALOGD("%s  %f, %f, %f}",s, f[2], f[5], f[8]);
-}
-static void SC_debugFM2v2(const char *s, const float *f) {
-    ALOGD("%s {%f, %f", s, f[0], f[2]);
-    ALOGD("%s  %f, %f}",s, f[1], f[3]);
-}
-static void SC_debugI8(const char *s, char c) {
-    ALOGD("%s %hhd  0x%hhx", s, c, (unsigned char)c);
-}
-static void SC_debugC2(const char *s, char2 c) {
-    ALOGD("%s {%hhd, %hhd}  0x%hhx 0x%hhx", s, c.x, c.y, (unsigned char)c.x, (unsigned char)c.y);
-}
-static void SC_debugC3(const char *s, char3 c) {
-    ALOGD("%s {%hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z);
-}
-static void SC_debugC4(const char *s, char4 c) {
-    ALOGD("%s {%hhd, %hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z, (unsigned char)c.w);
-}
-static void SC_debugU8(const char *s, unsigned char c) {
-    ALOGD("%s %hhu  0x%hhx", s, c, c);
-}
-static void SC_debugUC2(const char *s, uchar2 c) {
-    ALOGD("%s {%hhu, %hhu}  0x%hhx 0x%hhx", s, c.x, c.y, c.x, c.y);
-}
-static void SC_debugUC3(const char *s, uchar3 c) {
-    ALOGD("%s {%hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.x, c.y, c.z);
-}
-static void SC_debugUC4(const char *s, uchar4 c) {
-    ALOGD("%s {%hhu, %hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
-}
-static void SC_debugI16(const char *s, short c) {
-    ALOGD("%s %hd  0x%hx", s, c, c);
-}
-static void SC_debugS2(const char *s, short2 c) {
-    ALOGD("%s {%hd, %hd}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
-}
-static void SC_debugS3(const char *s, short3 c) {
-    ALOGD("%s {%hd, %hd, %hd}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
-}
-static void SC_debugS4(const char *s, short4 c) {
-    ALOGD("%s {%hd, %hd, %hd, %hd}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
-}
-static void SC_debugU16(const char *s, unsigned short c) {
-    ALOGD("%s %hu  0x%hx", s, c, c);
-}
-static void SC_debugUS2(const char *s, ushort2 c) {
-    ALOGD("%s {%hu, %hu}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
-}
-static void SC_debugUS3(const char *s, ushort3 c) {
-    ALOGD("%s {%hu, %hu, %hu}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
-}
-static void SC_debugUS4(const char *s, ushort4 c) {
-    ALOGD("%s {%hu, %hu, %hu, %hu}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
-}
-static void SC_debugI32(const char *s, int32_t i) {
-    ALOGD("%s %d  0x%x", s, i, i);
-}
-static void SC_debugI2(const char *s, int2 i) {
-    ALOGD("%s {%d, %d}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
-}
-static void SC_debugI3(const char *s, int3 i) {
-    ALOGD("%s {%d, %d, %d}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
-}
-static void SC_debugI4(const char *s, int4 i) {
-    ALOGD("%s {%d, %d, %d, %d}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
-}
-static void SC_debugU32(const char *s, uint32_t i) {
-    ALOGD("%s %u  0x%x", s, i, i);
-}
-static void SC_debugUI2(const char *s, uint2 i) {
-    ALOGD("%s {%u, %u}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
-}
-static void SC_debugUI3(const char *s, uint3 i) {
-    ALOGD("%s {%u, %u, %u}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
-}
-static void SC_debugUI4(const char *s, uint4 i) {
-    ALOGD("%s {%u, %u, %u, %u}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
-}
-static void SC_debugLL64(const char *s, long long ll) {
-    ALOGD("%s %lld  0x%llx", s, ll, ll);
-}
-static void SC_debugL2(const char *s, long2 ll) {
-    ALOGD("%s {%lld, %lld}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
-}
-static void SC_debugL3(const char *s, long3 ll) {
-    ALOGD("%s {%lld, %lld, %lld}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
-}
-static void SC_debugL4(const char *s, long4 ll) {
-    ALOGD("%s {%lld, %lld, %lld, %lld}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
-}
-static void SC_debugULL64(const char *s, unsigned long long ll) {
-    ALOGD("%s %llu  0x%llx", s, ll, ll);
-}
-static void SC_debugUL2(const char *s, ulong2 ll) {
-    ALOGD("%s {%llu, %llu}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
-}
-static void SC_debugUL3(const char *s, ulong3 ll) {
-    ALOGD("%s {%llu, %llu, %llu}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
-}
-static void SC_debugUL4(const char *s, ulong4 ll) {
-    ALOGD("%s {%llu, %llu, %llu, %llu}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
-}
-static void SC_debugP(const char *s, const void *p) {
-    ALOGD("%s %p", s, p);
-}
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -701,10 +538,7 @@
 //                 ::= f  # float
 //                 ::= d  # double
 
-static RsdSymbolTable gSyms[] = {
-    { "memset", (void *)&memset, true },
-    { "memcpy", (void *)&memcpy, true },
-
+static RsdCpuReference::CpuSymbol gSyms[] = {
     // Refcounting
     { "_Z11rsSetObjectP10rs_elementS_", (void *)&SC_SetObject, true },
     { "_Z13rsClearObjectP10rs_element", (void *)&SC_ClearObject, true },
@@ -839,86 +673,24 @@
     { "_Z5colorffff", (void *)&SC_Color, false },
     { "_Z9rsgFinishv", (void *)&SC_Finish, false },
 
-    // Debug
-    { "_Z7rsDebugPKcf", (void *)&SC_debugF, true },
-    { "_Z7rsDebugPKcff", (void *)&SC_debugFv2, true },
-    { "_Z7rsDebugPKcfff", (void *)&SC_debugFv3, true },
-    { "_Z7rsDebugPKcffff", (void *)&SC_debugFv4, true },
-    { "_Z7rsDebugPKcDv2_f", (void *)&SC_debugF2, true },
-    { "_Z7rsDebugPKcDv3_f", (void *)&SC_debugF3, true },
-    { "_Z7rsDebugPKcDv4_f", (void *)&SC_debugF4, true },
-    { "_Z7rsDebugPKcd", (void *)&SC_debugD, true },
-    { "_Z7rsDebugPKcPK12rs_matrix4x4", (void *)&SC_debugFM4v4, true },
-    { "_Z7rsDebugPKcPK12rs_matrix3x3", (void *)&SC_debugFM3v3, true },
-    { "_Z7rsDebugPKcPK12rs_matrix2x2", (void *)&SC_debugFM2v2, true },
-    { "_Z7rsDebugPKcc", (void *)&SC_debugI8, true },
-    { "_Z7rsDebugPKcDv2_c", (void *)&SC_debugC2, true },
-    { "_Z7rsDebugPKcDv3_c", (void *)&SC_debugC3, true },
-    { "_Z7rsDebugPKcDv4_c", (void *)&SC_debugC4, true },
-    { "_Z7rsDebugPKch", (void *)&SC_debugU8, true },
-    { "_Z7rsDebugPKcDv2_h", (void *)&SC_debugUC2, true },
-    { "_Z7rsDebugPKcDv3_h", (void *)&SC_debugUC3, true },
-    { "_Z7rsDebugPKcDv4_h", (void *)&SC_debugUC4, true },
-    { "_Z7rsDebugPKcs", (void *)&SC_debugI16, true },
-    { "_Z7rsDebugPKcDv2_s", (void *)&SC_debugS2, true },
-    { "_Z7rsDebugPKcDv3_s", (void *)&SC_debugS3, true },
-    { "_Z7rsDebugPKcDv4_s", (void *)&SC_debugS4, true },
-    { "_Z7rsDebugPKct", (void *)&SC_debugU16, true },
-    { "_Z7rsDebugPKcDv2_t", (void *)&SC_debugUS2, true },
-    { "_Z7rsDebugPKcDv3_t", (void *)&SC_debugUS3, true },
-    { "_Z7rsDebugPKcDv4_t", (void *)&SC_debugUS4, true },
-    { "_Z7rsDebugPKci", (void *)&SC_debugI32, true },
-    { "_Z7rsDebugPKcDv2_i", (void *)&SC_debugI2, true },
-    { "_Z7rsDebugPKcDv3_i", (void *)&SC_debugI3, true },
-    { "_Z7rsDebugPKcDv4_i", (void *)&SC_debugI4, true },
-    { "_Z7rsDebugPKcj", (void *)&SC_debugU32, true },
-    { "_Z7rsDebugPKcDv2_j", (void *)&SC_debugUI2, true },
-    { "_Z7rsDebugPKcDv3_j", (void *)&SC_debugUI3, true },
-    { "_Z7rsDebugPKcDv4_j", (void *)&SC_debugUI4, true },
-    // Both "long" and "unsigned long" need to be redirected to their
-    // 64-bit counterparts, since we have hacked Slang to use 64-bit
-    // for "long" on Arm (to be similar to Java).
-    { "_Z7rsDebugPKcl", (void *)&SC_debugLL64, true },
-    { "_Z7rsDebugPKcDv2_l", (void *)&SC_debugL2, true },
-    { "_Z7rsDebugPKcDv3_l", (void *)&SC_debugL3, true },
-    { "_Z7rsDebugPKcDv4_l", (void *)&SC_debugL4, true },
-    { "_Z7rsDebugPKcm", (void *)&SC_debugULL64, true },
-    { "_Z7rsDebugPKcDv2_m", (void *)&SC_debugUL2, true },
-    { "_Z7rsDebugPKcDv3_m", (void *)&SC_debugUL3, true },
-    { "_Z7rsDebugPKcDv4_m", (void *)&SC_debugUL4, true },
-    { "_Z7rsDebugPKcx", (void *)&SC_debugLL64, true },
-    { "_Z7rsDebugPKcDv2_x", (void *)&SC_debugL2, true },
-    { "_Z7rsDebugPKcDv3_x", (void *)&SC_debugL3, true },
-    { "_Z7rsDebugPKcDv4_x", (void *)&SC_debugL4, true },
-    { "_Z7rsDebugPKcy", (void *)&SC_debugULL64, true },
-    { "_Z7rsDebugPKcDv2_y", (void *)&SC_debugUL2, true },
-    { "_Z7rsDebugPKcDv3_y", (void *)&SC_debugUL3, true },
-    { "_Z7rsDebugPKcDv4_y", (void *)&SC_debugUL4, true },
-    { "_Z7rsDebugPKcPKv", (void *)&SC_debugP, true },
-
     { NULL, NULL, false }
 };
 
 
-void* rsdLookupRuntimeStub(void* pContext, char const* name) {
+extern const RsdCpuReference::CpuSymbol * rsdLookupRuntimeStub(Context * pContext, char const* name) {
     ScriptC *s = (ScriptC *)pContext;
-    RsdSymbolTable *syms = gSyms;
-    const RsdSymbolTable *sym = rsdLookupSymbolMath(name);
+    const RsdCpuReference::CpuSymbol *syms = gSyms;
+    const RsdCpuReference::CpuSymbol *sym = NULL;
 
     if (!sym) {
-        while (syms->mPtr) {
-            if (!strcmp(syms->mName, name)) {
-                sym = syms;
+        while (syms->fnPtr) {
+            if (!strcmp(syms->name, name)) {
+                return syms;
             }
             syms++;
         }
     }
 
-    if (sym) {
-        s->mHal.info.isThreadable &= sym->threadable;
-        return sym->mPtr;
-    }
-    ALOGE("ScriptC sym lookup failed for %s", name);
     return NULL;
 }
 
diff --git a/driver/rsdScriptGroup.cpp b/driver/rsdScriptGroup.cpp
index f4f0f1c..ef802a2 100644
--- a/driver/rsdScriptGroup.cpp
+++ b/driver/rsdScriptGroup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 The Android Open Source Project
+ * Copyright (C) 2011-2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,8 @@
  */
 
 #include "rsdCore.h"
+#include "../cpu_ref/rsd_cpu.h"
 
-#include <bcc/BCCContext.h>
-#include <bcc/Renderscript/RSCompilerDriver.h>
-#include <bcc/Renderscript/RSExecutable.h>
-#include <bcc/Renderscript/RSInfo.h>
 
 #include "rsScript.h"
 #include "rsScriptGroup.h"
@@ -31,236 +28,29 @@
 using namespace android::renderscript;
 
 
-bool rsdScriptGroupInit(const android::renderscript::Context *rsc,
-                        const android::renderscript::ScriptGroup *sg) {
-    return true;
+bool rsdScriptGroupInit(const Context *rsc, ScriptGroup *sg) {
+    RsdHal *dc = (RsdHal *)rsc->mHal.drv;
+
+    sg->mHal.drv = dc->mCpuRef->createScriptGroup(sg);
+    return sg->mHal.drv != NULL;
 }
 
-void rsdScriptGroupSetInput(const android::renderscript::Context *rsc,
-                            const android::renderscript::ScriptGroup *sg,
-                            const android::renderscript::ScriptKernelID *kid,
-                            android::renderscript::Allocation *) {
+void rsdScriptGroupSetInput(const Context *rsc, const ScriptGroup *sg,
+                            const ScriptKernelID *kid, Allocation *) {
 }
 
-void rsdScriptGroupSetOutput(const android::renderscript::Context *rsc,
-                             const android::renderscript::ScriptGroup *sg,
-                             const android::renderscript::ScriptKernelID *kid,
-                             android::renderscript::Allocation *) {
+void rsdScriptGroupSetOutput(const Context *rsc, const ScriptGroup *sg,
+                             const ScriptKernelID *kid, Allocation *) {
 }
 
-struct ScriptList {
-    size_t count;
-    Allocation *const* ins;
-    bool const* inExts;
-    Allocation *const* outs;
-    bool const* outExts;
-    const void *const* usrPtrs;
-    size_t const *usrSizes;
-    uint32_t const *sigs;
-    const void *const* fnPtrs;
-
-    const ScriptKernelID *const* kernels;
-};
-
-typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
-                                      uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep);
-
-static void ScriptGroupRoot(const RsForEachStubParamStruct *p,
-                            uint32_t xstart, uint32_t xend,
-                            uint32_t instep, uint32_t outstep) {
-
-    const ScriptList *sl = (const ScriptList *)p->usr;
-    RsForEachStubParamStruct *mp = (RsForEachStubParamStruct *)p;
-    const void *oldUsr = p->usr;
-
-    for(size_t ct=0; ct < sl->count; ct++) {
-        ScriptGroupRootFunc_t func;
-        func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
-        mp->usr = sl->usrPtrs[ct];
-
-        mp->ptrIn = NULL;
-        mp->in = NULL;
-        mp->ptrOut = NULL;
-        mp->out = NULL;
-
-        if (sl->ins[ct]) {
-            DrvAllocation *drv = (DrvAllocation *)sl->ins[ct]->mHal.drv;
-            mp->ptrIn = (const uint8_t *)drv->lod[0].mallocPtr;
-            mp->in = mp->ptrIn;
-            if (sl->inExts[ct]) {
-                mp->in = mp->ptrIn + drv->lod[0].stride * p->y;
-            } else {
-                if (drv->lod[0].dimY > p->lid) {
-                    mp->in = mp->ptrIn + drv->lod[0].stride * p->lid;
-                }
-            }
-        }
-
-        if (sl->outs[ct]) {
-            DrvAllocation *drv = (DrvAllocation *)sl->outs[ct]->mHal.drv;
-            mp->ptrOut = (uint8_t *)drv->lod[0].mallocPtr;
-            mp->out = mp->ptrOut;
-            if (sl->outExts[ct]) {
-                mp->out = mp->ptrOut + drv->lod[0].stride * p->y;
-            } else {
-                if (drv->lod[0].dimY > p->lid) {
-                    mp->out = mp->ptrOut + drv->lod[0].stride * p->lid;
-                }
-            }
-        }
-
-        //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
-        func(p, xstart, xend, instep, outstep);
-    }
-    //ALOGE("script group root");
-
-    //ConvolveParams *cp = (ConvolveParams *)p->usr;
-
-    mp->usr = oldUsr;
+void rsdScriptGroupExecute(const Context *rsc, const ScriptGroup *sg) {
+    RsdCpuReference::CpuScriptGroup *sgi = (RsdCpuReference::CpuScriptGroup *)sg->mHal.drv;
+    sgi->execute();
 }
 
-
-void rsdScriptGroupExecute(const android::renderscript::Context *rsc,
-                           const android::renderscript::ScriptGroup *sg) {
-
-    Vector<Allocation *> ins;
-    Vector<bool> inExts;
-    Vector<Allocation *> outs;
-    Vector<bool> outExts;
-    Vector<const ScriptKernelID *> kernels;
-    bool fieldDep = false;
-
-    for (size_t ct=0; ct < sg->mNodes.size(); ct++) {
-        ScriptGroup::Node *n = sg->mNodes[ct];
-        Script *s = n->mKernels[0]->mScript;
-
-        //ALOGE("node %i, order %i, in %i out %i", (int)ct, n->mOrder, (int)n->mInputs.size(), (int)n->mOutputs.size());
-
-        for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
-            if (n->mInputs[ct2]->mDstField.get() && n->mInputs[ct2]->mDstField->mScript) {
-                //ALOGE("field %p %zu", n->mInputs[ct2]->mDstField->mScript, n->mInputs[ct2]->mDstField->mSlot);
-                s->setVarObj(n->mInputs[ct2]->mDstField->mSlot, n->mInputs[ct2]->mAlloc.get());
-            }
-        }
-
-        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
-            const ScriptKernelID *k = n->mKernels[ct2];
-            Allocation *ain = NULL;
-            Allocation *aout = NULL;
-            bool inExt = false;
-            bool outExt = false;
-
-            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
-                if (n->mInputs[ct3]->mDstKernel.get() == k) {
-                    ain = n->mInputs[ct3]->mAlloc.get();
-                    //ALOGE(" link in %p", ain);
-                }
-            }
-            for (size_t ct3=0; ct3 < sg->mInputs.size(); ct3++) {
-                if (sg->mInputs[ct3]->mKernel == k) {
-                    ain = sg->mInputs[ct3]->mAlloc.get();
-                    inExt = true;
-                    //ALOGE(" io in %p", ain);
-                }
-            }
-
-            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
-                if (n->mOutputs[ct3]->mSource.get() == k) {
-                    aout = n->mOutputs[ct3]->mAlloc.get();
-                    if(n->mOutputs[ct3]->mDstField.get() != NULL) {
-                        fieldDep = true;
-                    }
-                    //ALOGE(" link out %p", aout);
-                }
-            }
-            for (size_t ct3=0; ct3 < sg->mOutputs.size(); ct3++) {
-                if (sg->mOutputs[ct3]->mKernel == k) {
-                    aout = sg->mOutputs[ct3]->mAlloc.get();
-                    outExt = true;
-                    //ALOGE(" io out %p", aout);
-                }
-            }
-
-            if ((k->mHasKernelOutput == (aout != NULL)) &&
-                (k->mHasKernelInput == (ain != NULL))) {
-                ins.add(ain);
-                inExts.add(inExt);
-                outs.add(aout);
-                outExts.add(outExt);
-                kernels.add(k);
-            }
-        }
-
-    }
-
-    RsdHal * dc = (RsdHal *)rsc->mHal.drv;
-    MTLaunchStruct mtls;
-
-    if(fieldDep) {
-        for (size_t ct=0; ct < ins.size(); ct++) {
-            Script *s = kernels[ct]->mScript;
-            DrvScript *drv = (DrvScript *)s->mHal.drv;
-            uint32_t slot = kernels[ct]->mSlot;
-
-            rsdScriptInvokeForEachMtlsSetup(rsc, ins[ct], outs[ct], NULL, 0, NULL, &mtls);
-            mtls.script = s;
-
-            if (drv->mIntrinsicID) {
-                mtls.kernel = (void (*)())drv->mIntrinsicFuncs.root;
-                mtls.fep.usr = drv->mIntrinsicData;
-            } else {
-                mtls.kernel = reinterpret_cast<ForEachFunc_t>(
-                                  drv->mExecutable->getExportForeachFuncAddrs()[slot]);
-                rsAssert(mtls.kernel != NULL);
-                mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
-            }
-
-            rsdScriptLaunchThreads(rsc, s->mHal.info.isThreadable, ins[ct], outs[ct],
-                                   NULL, 0, NULL, &mtls);
-        }
-    } else {
-        ScriptList sl;
-        sl.ins = ins.array();
-        sl.outs = outs.array();
-        sl.kernels = kernels.array();
-        sl.count = kernels.size();
-
-        Vector<const void *> usrPtrs;
-        Vector<const void *> fnPtrs;
-        Vector<uint32_t> sigs;
-        for (size_t ct=0; ct < kernels.size(); ct++) {
-            Script *s = kernels[ct]->mScript;
-            DrvScript *drv = (DrvScript *)s->mHal.drv;
-
-            if (drv->mIntrinsicID) {
-                fnPtrs.add((void *)drv->mIntrinsicFuncs.root);
-                usrPtrs.add(drv->mIntrinsicData);
-                sigs.add(0);
-            } else {
-                int slot = kernels[ct]->mSlot;
-                fnPtrs.add((void *)drv->mExecutable->getExportForeachFuncAddrs()[slot]);
-                usrPtrs.add(NULL);
-                sigs.add(drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second);
-            }
-        }
-        sl.sigs = sigs.array();
-        sl.usrPtrs = usrPtrs.array();
-        sl.fnPtrs = fnPtrs.array();
-        sl.inExts = inExts.array();
-        sl.outExts = outExts.array();
-
-        rsdScriptInvokeForEachMtlsSetup(rsc, ins[0], outs[0], NULL, 0, NULL, &mtls);
-        mtls.script = NULL;
-        mtls.kernel = (void (*)())&ScriptGroupRoot;
-        mtls.fep.usr = &sl;
-        rsdScriptLaunchThreads(rsc, true, ins[0], outs[0], NULL, 0, NULL, &mtls);
-    }
-
-}
-
-void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
-                           const android::renderscript::ScriptGroup *sg) {
+void rsdScriptGroupDestroy(const Context *rsc, const ScriptGroup *sg) {
+    RsdCpuReference::CpuScriptGroup *sgi = (RsdCpuReference::CpuScriptGroup *)sg->mHal.drv;
+    delete sgi;
 }
 
 
diff --git a/driver/rsdScriptGroup.h b/driver/rsdScriptGroup.h
index a817aef..ee8cd69 100644
--- a/driver/rsdScriptGroup.h
+++ b/driver/rsdScriptGroup.h
@@ -20,7 +20,7 @@
 #include <rs_hal.h>
 
 bool rsdScriptGroupInit(const android::renderscript::Context *rsc,
-                        const android::renderscript::ScriptGroup *sg);
+                        android::renderscript::ScriptGroup *sg);
 void rsdScriptGroupSetInput(const android::renderscript::Context *rsc,
                             const android::renderscript::ScriptGroup *sg,
                             const android::renderscript::ScriptKernelID *kid,
diff --git a/driver/rsdShader.cpp b/driver/rsdShader.cpp
index 3654090..0361844 100644
--- a/driver/rsdShader.cpp
+++ b/driver/rsdShader.cpp
@@ -346,9 +346,9 @@
                 rsAssert(0);
             }
         }
-        ALOGE("Element size %u data=%p", elementSize, fd);
+        ALOGV("Element size %u data=%p", elementSize, fd);
         fd += elementSize;
-        ALOGE("New data=%p", fd);
+        ALOGV("New data=%p", fd);
     }
 }
 
@@ -524,8 +524,7 @@
             continue;
         }
 
-        DrvAllocation *adrv = (DrvAllocation *)alloc->mHal.drv;
-        const uint8_t *data = static_cast<const uint8_t *>(adrv->lod[0].mallocPtr);
+        const uint8_t *data = static_cast<const uint8_t *>(alloc->mHal.drvState.lod[0].mallocPtr);
         const Element *e = mRSProgram->mHal.state.constantTypes[ct]->getElement();
         for (uint32_t field=0; field < e->mHal.state.fieldsCount; field++) {
             const Element *f = e->mHal.state.fields[field];
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index b1247d7..79a4808 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -235,7 +235,7 @@
     }
 
     ALOGV("%s allocation ptr=%p  mUsageFlags=0x04%x, mMipmapControl=0x%04x",
-         prefix, mHal.drvState.mallocPtrLOD0, mHal.state.usageFlags, mHal.state.mipmapControl);
+         prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags, mHal.state.mipmapControl);
 }
 
 uint32_t Allocation::getPackedSize() const {
diff --git a/rsAllocation.h b/rsAllocation.h
index c6b918f..de79cba 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -70,9 +70,18 @@
         State state;
 
         struct DrvState {
-            mutable void * mallocPtrLOD0;
-            mutable uint32_t strideLOD0;
-        } drvState;
+            struct LodState {
+                void * mallocPtr;
+                size_t stride;
+                uint32_t dimX;
+                uint32_t dimY;
+                uint32_t dimZ;
+            } lod[android::renderscript::Allocation::MAX_LOD];
+            size_t faceOffset;
+            uint32_t lodCount;
+            uint32_t faceCount;
+        };
+        mutable DrvState drvState;
 
     };
     Hal mHal;
diff --git a/rsRuntime.h b/rsRuntime.h
index 7a1d5e2..3a20cb0 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -29,40 +29,40 @@
 // Context
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrBindTexture(Context *, Script *, ProgramFragment *, uint32_t slot, Allocation *);
-void rsrBindConstant(Context *, Script *, ProgramFragment *, uint32_t slot, Allocation *);
-void rsrBindConstant(Context *, Script *, ProgramVertex*, uint32_t slot, Allocation *);
-void rsrBindSampler(Context *, Script *, ProgramFragment *, uint32_t slot, Sampler *);
-void rsrBindProgramStore(Context *, Script *, ProgramStore *);
-void rsrBindProgramFragment(Context *, Script *, ProgramFragment *);
-void rsrBindProgramVertex(Context *, Script *, ProgramVertex *);
-void rsrBindProgramRaster(Context *, Script *, ProgramRaster *);
-void rsrBindFrameBufferObjectColorTarget(Context *, Script *, Allocation *, uint32_t slot);
-void rsrBindFrameBufferObjectDepthTarget(Context *, Script *, Allocation *);
-void rsrClearFrameBufferObjectColorTarget(Context *, Script *, uint32_t slot);
-void rsrClearFrameBufferObjectDepthTarget(Context *, Script *);
-void rsrClearFrameBufferObjectTargets(Context *, Script *);
+void rsrBindTexture(Context *, ProgramFragment *, uint32_t slot, Allocation *);
+void rsrBindConstant(Context *, ProgramFragment *, uint32_t slot, Allocation *);
+void rsrBindConstant(Context *, ProgramVertex*, uint32_t slot, Allocation *);
+void rsrBindSampler(Context *, ProgramFragment *, uint32_t slot, Sampler *);
+void rsrBindProgramStore(Context *, ProgramStore *);
+void rsrBindProgramFragment(Context *, ProgramFragment *);
+void rsrBindProgramVertex(Context *, ProgramVertex *);
+void rsrBindProgramRaster(Context *, ProgramRaster *);
+void rsrBindFrameBufferObjectColorTarget(Context *, Allocation *, uint32_t slot);
+void rsrBindFrameBufferObjectDepthTarget(Context *, Allocation *);
+void rsrClearFrameBufferObjectColorTarget(Context *, uint32_t slot);
+void rsrClearFrameBufferObjectDepthTarget(Context *);
+void rsrClearFrameBufferObjectTargets(Context *);
 
 //////////////////////////////////////////////////////////////////////////////
 // VP
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrVpLoadProjectionMatrix(Context *, Script *, const rsc_Matrix *m);
-void rsrVpLoadModelMatrix(Context *, Script *, const rsc_Matrix *m);
-void rsrVpLoadTextureMatrix(Context *, Script *, const rsc_Matrix *m);
-void rsrPfConstantColor(Context *, Script *, ProgramFragment *, float r, float g, float b, float a);
-void rsrVpGetProjectionMatrix(Context *, Script *, rsc_Matrix *m);
+void rsrVpLoadProjectionMatrix(Context *, const rsc_Matrix *m);
+void rsrVpLoadModelMatrix(Context *, const rsc_Matrix *m);
+void rsrVpLoadTextureMatrix(Context *, const rsc_Matrix *m);
+void rsrPfConstantColor(Context *, ProgramFragment *, float r, float g, float b, float a);
+void rsrVpGetProjectionMatrix(Context *, rsc_Matrix *m);
 
 //////////////////////////////////////////////////////////////////////////////
 // Drawing
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrDrawPath(Context *, Script *, Path *);
-void rsrDrawMesh(Context *, Script *, Mesh *);
-void rsrDrawMeshPrimitive(Context *, Script *, Mesh *, uint32_t primIndex);
-void rsrDrawMeshPrimitiveRange(Context *, Script *, Mesh *,
+void rsrDrawPath(Context *, Path *);
+void rsrDrawMesh(Context *, Mesh *);
+void rsrDrawMeshPrimitive(Context *, Mesh *, uint32_t primIndex);
+void rsrDrawMeshPrimitiveRange(Context *, Mesh *,
                                uint32_t primIndex, uint32_t start, uint32_t len);
-void rsrMeshComputeBoundingBox(Context *, Script *, Mesh *,
+void rsrMeshComputeBoundingBox(Context *, Mesh *,
                                float *minX, float *minY, float *minZ,
                                float *maxX, float *maxY, float *maxZ);
 
@@ -72,8 +72,7 @@
 //////////////////////////////////////////////////////////////////////////////
 
 
-void rsrColor(Context *, Script *, float r, float g, float b, float a);
-void rsrAllocationSyncAll(Context *, Script *, Allocation *);
+void rsrColor(Context *, float r, float g, float b, float a);
 
 void rsrAllocationCopy1DRange(Context *, Allocation *dstAlloc,
                               uint32_t dstOff,
@@ -89,44 +88,44 @@
                               uint32_t srcXoff, uint32_t srcYoff,
                               uint32_t srcMip, uint32_t srcFace);
 
-void rsrPrepareClear(Context *, Script *);
-uint32_t rsrGetWidth(Context *, Script *);
-uint32_t rsrGetHeight(Context *, Script *);
-void rsrDrawTextAlloc(Context *, Script *, Allocation *, int x, int y);
-void rsrDrawText(Context *, Script *, const char *text, int x, int y);
-void rsrSetMetrics(Context *, Script *, Font::Rect *metrics,
+void rsrPrepareClear(Context *);
+uint32_t rsrGetWidth(Context *);
+uint32_t rsrGetHeight(Context *);
+void rsrDrawTextAlloc(Context *, Allocation *, int x, int y);
+void rsrDrawText(Context *, const char *text, int x, int y);
+void rsrSetMetrics(Context *, Font::Rect *metrics,
                    int32_t *left, int32_t *right, int32_t *top, int32_t *bottom);
-void rsrMeasureTextAlloc(Context *, Script *, Allocation *,
+void rsrMeasureTextAlloc(Context *, Allocation *,
                          int32_t *left, int32_t *right, int32_t *top, int32_t *bottom);
-void rsrMeasureText(Context *, Script *, const char *text,
+void rsrMeasureText(Context *, const char *text,
                     int32_t *left, int32_t *right, int32_t *top, int32_t *bottom);
-void rsrBindFont(Context *, Script *, Font *);
-void rsrFontColor(Context *, Script *, float r, float g, float b, float a);
+void rsrBindFont(Context *, Font *);
+void rsrFontColor(Context *, float r, float g, float b, float a);
 
 //////////////////////////////////////////////////////////////////////////////
 // Time routines
 //////////////////////////////////////////////////////////////////////////////
 
-float rsrGetDt(Context *, Script *);
-time_t rsrTime(Context *, Script *, time_t *timer);
-tm* rsrLocalTime(Context *, Script *, tm *local, time_t *timer);
-int64_t rsrUptimeMillis(Context *, Script *);
-int64_t rsrUptimeNanos(Context *, Script *);
+float rsrGetDt(Context *, const Script *sc);
+time_t rsrTime(Context *, time_t *timer);
+tm* rsrLocalTime(Context *, tm *local, time_t *timer);
+int64_t rsrUptimeMillis(Context *);
+int64_t rsrUptimeNanos(Context *);
 
 //////////////////////////////////////////////////////////////////////////////
 // Message routines
 //////////////////////////////////////////////////////////////////////////////
 
-uint32_t rsrToClient(Context *, Script *, int cmdID, void *data, int len);
-uint32_t rsrToClientBlocking(Context *, Script *, int cmdID, void *data, int len);
+uint32_t rsrToClient(Context *, int cmdID, void *data, int len);
+uint32_t rsrToClientBlocking(Context *, int cmdID, void *data, int len);
 
 //////////////////////////////////////////////////////////////////////////////
 //
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrSetObject(const Context *, const Script *, ObjectBase **dst, ObjectBase * src);
-void rsrClearObject(const Context *, const Script *, ObjectBase **dst);
-bool rsrIsObject(const Context *, const Script *, const ObjectBase *src);
+void rsrSetObject(const Context *, ObjectBase **dst, ObjectBase * src);
+void rsrClearObject(const Context *, ObjectBase **dst);
+bool rsrIsObject(const Context *, const ObjectBase *src);
 
 void rsrAllocationIncRefs(const Context *, const Allocation *, void *ptr,
                           size_t elementCount, size_t startOffset);
@@ -134,14 +133,10 @@
                           size_t elementCount, size_t startOffset);
 
 
-uint32_t rsrToClient(Context *, Script *, int cmdID, void *data, int len);
-uint32_t rsrToClientBlocking(Context *, Script *, int cmdID, void *data, int len);
-
-void rsrAllocationMarkDirty(Context *, Script *, RsAllocation a);
-void rsrAllocationSyncAll(Context *, Script *, Allocation *a, RsAllocationUsageType source);
+void rsrAllocationSyncAll(Context *, Allocation *a, RsAllocationUsageType source);
 
 
-void rsrForEach(Context *, Script *, Script *target,
+void rsrForEach(Context *, Script *target,
                 Allocation *in,
                 Allocation *out,
                 const void *usr,
diff --git a/rsScript.h b/rsScript.h
index 6339f49..8afd3bc 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -72,7 +72,6 @@
             char const **exportedPragmaValueList;
 
             int (* root)();
-            bool isThreadable;
         };
         DriverInfo info;
     };
@@ -85,7 +84,7 @@
 
     struct Enviroment_t {
         int64_t mStartTimeMillis;
-        int64_t mLastDtTime;
+        mutable int64_t mLastDtTime;
 
         ObjectBaseRef<ProgramVertex> mVertex;
         ObjectBaseRef<ProgramFragment> mFragment;
diff --git a/rsScriptC.h b/rsScriptC.h
index 4ef2c4b..75fb0f4 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -60,8 +60,7 @@
 //protected:
     void setupScript(Context *);
     void setupGLState(Context *);
-    Script * setTLS(Script *);
-  private:
+private:
 #ifndef ANDROID_RS_SERIALIZE
     bcinfo::BitcodeTranslator *BT;
 #endif
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index ac3dd12..e8c9d1d 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -79,11 +79,11 @@
 // Time routines
 //////////////////////////////////////////////////////////////////////////////
 
-time_t rsrTime(Context *rsc, Script *sc, time_t *timer) {
+time_t rsrTime(Context *rsc, time_t *timer) {
     return time(timer);
 }
 
-tm* rsrLocalTime(Context *rsc, Script *sc, tm *local, time_t *timer) {
+tm* rsrLocalTime(Context *rsc, tm *local, time_t *timer) {
     if (!local) {
       return NULL;
     }
@@ -97,15 +97,15 @@
     return local;
 }
 
-int64_t rsrUptimeMillis(Context *rsc, Script *sc) {
+int64_t rsrUptimeMillis(Context *rsc) {
     return nanoseconds_to_milliseconds(systemTime(SYSTEM_TIME_MONOTONIC));
 }
 
-int64_t rsrUptimeNanos(Context *rsc, Script *sc) {
+int64_t rsrUptimeNanos(Context *rsc) {
     return systemTime(SYSTEM_TIME_MONOTONIC);
 }
 
-float rsrGetDt(Context *rsc, Script *sc) {
+float rsrGetDt(Context *rsc, const Script *sc) {
     int64_t l = sc->mEnviroment.mLastDtTime;
     sc->mEnviroment.mLastDtTime = systemTime(SYSTEM_TIME_MONOTONIC);
     return ((float)(sc->mEnviroment.mLastDtTime - l)) / 1.0e9;
@@ -115,7 +115,7 @@
 //
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrSetObject(const Context *rsc, const Script *sc, ObjectBase **dst, ObjectBase * src) {
+void rsrSetObject(const Context *rsc, ObjectBase **dst, ObjectBase * src) {
     //ALOGE("rsiSetObject  %p,%p  %p", vdst, *vdst, vsrc);
     if (src) {
         CHECK_OBJ(src);
@@ -128,7 +128,7 @@
     *dst = src;
 }
 
-void rsrClearObject(const Context *rsc, const Script *sc, ObjectBase **dst) {
+void rsrClearObject(const Context *rsc, ObjectBase **dst) {
     //ALOGE("rsiClearObject  %p,%p", vdst, *vdst);
     if (dst[0]) {
         CHECK_OBJ(dst[0]);
@@ -137,23 +137,23 @@
     *dst = NULL;
 }
 
-bool rsrIsObject(const Context *rsc, const Script *sc, const ObjectBase *src) {
+bool rsrIsObject(const Context *rsc, const ObjectBase *src) {
     return src != NULL;
 }
 
 
-uint32_t rsrToClient(Context *rsc, Script *sc, int cmdID, void *data, int len) {
+uint32_t rsrToClient(Context *rsc, int cmdID, void *data, int len) {
     //ALOGE("SC_toClient %i %i %i", cmdID, len);
     return rsc->sendMessageToClient(data, RS_MESSAGE_TO_CLIENT_USER, cmdID, len, false);
 }
 
-uint32_t rsrToClientBlocking(Context *rsc, Script *sc, int cmdID, void *data, int len) {
+uint32_t rsrToClientBlocking(Context *rsc, int cmdID, void *data, int len) {
     //ALOGE("SC_toClientBlocking %i %i", cmdID, len);
     return rsc->sendMessageToClient(data, RS_MESSAGE_TO_CLIENT_USER, cmdID, len, true);
 }
 
 
-void rsrForEach(Context *rsc, Script *sc,
+void rsrForEach(Context *rsc,
                 Script *target,
                 Allocation *in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
@@ -161,7 +161,7 @@
     target->runForEach(rsc, /* root slot */ 0, in, out, usr, usrBytes, call);
 }
 
-void rsrAllocationSyncAll(Context *rsc, Script *sc, Allocation *a, RsAllocationUsageType usage) {
+void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
     a->syncAll(rsc, usage);
 }
 
diff --git a/rsScriptC_LibGL.cpp b/rsScriptC_LibGL.cpp
index 63fb53e..279ddb2 100644
--- a/rsScriptC_LibGL.cpp
+++ b/rsScriptC_LibGL.cpp
@@ -46,73 +46,73 @@
 // Context
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrBindTexture(Context *rsc, Script *sc, ProgramFragment *pf, uint32_t slot, Allocation *a) {
+void rsrBindTexture(Context *rsc, ProgramFragment *pf, uint32_t slot, Allocation *a) {
     CHECK_OBJ_OR_NULL(a);
     CHECK_OBJ(pf);
     pf->bindTexture(rsc, slot, a);
 }
 
-void rsrBindConstant(Context *rsc, Script *sc, ProgramFragment *pf, uint32_t slot, Allocation *a) {
+void rsrBindConstant(Context *rsc, ProgramFragment *pf, uint32_t slot, Allocation *a) {
     CHECK_OBJ_OR_NULL(a);
     CHECK_OBJ(pf);
     pf->bindAllocation(rsc, a, slot);
 }
 
-void rsrBindConstant(Context *rsc, Script *sc, ProgramVertex *pv, uint32_t slot, Allocation *a) {
+void rsrBindConstant(Context *rsc, ProgramVertex *pv, uint32_t slot, Allocation *a) {
     CHECK_OBJ_OR_NULL(a);
     CHECK_OBJ(pv);
     pv->bindAllocation(rsc, a, slot);
 }
 
-void rsrBindSampler(Context *rsc, Script *sc, ProgramFragment *pf, uint32_t slot, Sampler *s) {
+void rsrBindSampler(Context *rsc, ProgramFragment *pf, uint32_t slot, Sampler *s) {
     CHECK_OBJ_OR_NULL(vs);
     CHECK_OBJ(vpf);
     pf->bindSampler(rsc, slot, s);
 }
 
-void rsrBindProgramStore(Context *rsc, Script *sc, ProgramStore *ps) {
+void rsrBindProgramStore(Context *rsc, ProgramStore *ps) {
     CHECK_OBJ_OR_NULL(ps);
     rsc->setProgramStore(ps);
 }
 
-void rsrBindProgramFragment(Context *rsc, Script *sc, ProgramFragment *pf) {
+void rsrBindProgramFragment(Context *rsc, ProgramFragment *pf) {
     CHECK_OBJ_OR_NULL(pf);
     rsc->setProgramFragment(pf);
 }
 
-void rsrBindProgramVertex(Context *rsc, Script *sc, ProgramVertex *pv) {
+void rsrBindProgramVertex(Context *rsc, ProgramVertex *pv) {
     CHECK_OBJ_OR_NULL(pv);
     rsc->setProgramVertex(pv);
 }
 
-void rsrBindProgramRaster(Context *rsc, Script *sc, ProgramRaster *pr) {
+void rsrBindProgramRaster(Context *rsc, ProgramRaster *pr) {
     CHECK_OBJ_OR_NULL(pr);
     rsc->setProgramRaster(pr);
 }
 
-void rsrBindFrameBufferObjectColorTarget(Context *rsc, Script *sc, Allocation *a, uint32_t slot) {
+void rsrBindFrameBufferObjectColorTarget(Context *rsc, Allocation *a, uint32_t slot) {
     CHECK_OBJ(va);
     rsc->mFBOCache.bindColorTarget(rsc, a, slot);
     rsc->mStateVertex.updateSize(rsc);
 }
 
-void rsrBindFrameBufferObjectDepthTarget(Context *rsc, Script *sc, Allocation *a) {
+void rsrBindFrameBufferObjectDepthTarget(Context *rsc, Allocation *a) {
     CHECK_OBJ(va);
     rsc->mFBOCache.bindDepthTarget(rsc, a);
     rsc->mStateVertex.updateSize(rsc);
 }
 
-void rsrClearFrameBufferObjectColorTarget(Context *rsc, Script *sc, uint32_t slot) {
+void rsrClearFrameBufferObjectColorTarget(Context *rsc, uint32_t slot) {
     rsc->mFBOCache.bindColorTarget(rsc, NULL, slot);
     rsc->mStateVertex.updateSize(rsc);
 }
 
-void rsrClearFrameBufferObjectDepthTarget(Context *rsc, Script *sc) {
+void rsrClearFrameBufferObjectDepthTarget(Context *rsc) {
     rsc->mFBOCache.bindDepthTarget(rsc, NULL);
     rsc->mStateVertex.updateSize(rsc);
 }
 
-void rsrClearFrameBufferObjectTargets(Context *rsc, Script *sc) {
+void rsrClearFrameBufferObjectTargets(Context *rsc) {
     rsc->mFBOCache.resetAll(rsc);
     rsc->mStateVertex.updateSize(rsc);
 }
@@ -121,25 +121,25 @@
 // VP
 //////////////////////////////////////////////////////////////////////////////
 
-void rsrVpLoadProjectionMatrix(Context *rsc, Script *sc, const rsc_Matrix *m) {
+void rsrVpLoadProjectionMatrix(Context *rsc, const rsc_Matrix *m) {
     rsc->getProgramVertex()->setProjectionMatrix(rsc, m);
 }
 
-void rsrVpLoadModelMatrix(Context *rsc, Script *sc, const rsc_Matrix *m) {
+void rsrVpLoadModelMatrix(Context *rsc, const rsc_Matrix *m) {
     rsc->getProgramVertex()->setModelviewMatrix(rsc, m);
 }
 
-void rsrVpLoadTextureMatrix(Context *rsc, Script *sc, const rsc_Matrix *m) {
+void rsrVpLoadTextureMatrix(Context *rsc, const rsc_Matrix *m) {
     rsc->getProgramVertex()->setTextureMatrix(rsc, m);
 }
 
-void rsrPfConstantColor(Context *rsc, Script *sc, ProgramFragment *pf,
+void rsrPfConstantColor(Context *rsc, ProgramFragment *pf,
                         float r, float g, float b, float a) {
     CHECK_OBJ(pf);
     pf->setConstantColor(rsc, r, g, b, a);
 }
 
-void rsrVpGetProjectionMatrix(Context *rsc, Script *sc, rsc_Matrix *m) {
+void rsrVpGetProjectionMatrix(Context *rsc, rsc_Matrix *m) {
     rsc->getProgramVertex()->getProjectionMatrix(rsc, m);
 }
 
@@ -148,7 +148,7 @@
 //////////////////////////////////////////////////////////////////////////////
 
 
-void rsrDrawPath(Context *rsc, Script *sc, Path *sm) {
+void rsrDrawPath(Context *rsc, Path *sm) {
     CHECK_OBJ(sm);
     if (!rsc->setupCheck()) {
         return;
@@ -156,7 +156,7 @@
     sm->render(rsc);
 }
 
-void rsrDrawMesh(Context *rsc, Script *sc, Mesh *sm) {
+void rsrDrawMesh(Context *rsc, Mesh *sm) {
     CHECK_OBJ(sm);
     if (!rsc->setupCheck()) {
         return;
@@ -164,7 +164,7 @@
     sm->render(rsc);
 }
 
-void rsrDrawMeshPrimitive(Context *rsc, Script *sc, Mesh *sm, uint32_t primIndex) {
+void rsrDrawMeshPrimitive(Context *rsc, Mesh *sm, uint32_t primIndex) {
     CHECK_OBJ(sm);
     if (!rsc->setupCheck()) {
         return;
@@ -172,7 +172,7 @@
     sm->renderPrimitive(rsc, primIndex);
 }
 
-void rsrDrawMeshPrimitiveRange(Context *rsc, Script *sc, Mesh *sm, uint32_t primIndex,
+void rsrDrawMeshPrimitiveRange(Context *rsc, Mesh *sm, uint32_t primIndex,
                                uint32_t start, uint32_t len) {
     CHECK_OBJ(sm);
     if (!rsc->setupCheck()) {
@@ -181,7 +181,7 @@
     sm->renderPrimitiveRange(rsc, primIndex, start, len);
 }
 
-void rsrMeshComputeBoundingBox(Context *rsc, Script *sc, Mesh *sm,
+void rsrMeshComputeBoundingBox(Context *rsc, Mesh *sm,
                                float *minX, float *minY, float *minZ,
                                float *maxX, float *maxY, float *maxZ) {
     CHECK_OBJ(sm);
@@ -200,32 +200,32 @@
 //////////////////////////////////////////////////////////////////////////////
 
 
-void rsrColor(Context *rsc, Script *sc, float r, float g, float b, float a) {
+void rsrColor(Context *rsc, float r, float g, float b, float a) {
     ProgramFragment *pf = rsc->getProgramFragment();
     pf->setConstantColor(rsc, r, g, b, a);
 }
 
-void rsrPrepareClear(Context *rsc, Script *sc) {
+void rsrPrepareClear(Context *rsc) {
     rsc->mFBOCache.setup(rsc);
     rsc->setupProgramStore();
 }
 
-uint32_t rsrGetWidth(Context *rsc, Script *sc) {
+uint32_t rsrGetWidth(Context *rsc) {
     return rsc->getWidth();
 }
 
-uint32_t rsrGetHeight(Context *rsc, Script *sc) {
+uint32_t rsrGetHeight(Context *rsc) {
     return rsc->getHeight();
 }
 
-void rsrDrawTextAlloc(Context *rsc, Script *sc, Allocation *a, int x, int y) {
+void rsrDrawTextAlloc(Context *rsc, Allocation *a, int x, int y) {
     const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
     size_t allocSize = a->getType()->getSizeBytes();
     rsc->mStateFont.renderText(text, allocSize, x, y);
     rsc->mHal.funcs.allocation.unlock1D(rsc, a);
 }
 
-void rsrDrawText(Context *rsc, Script *sc, const char *text, int x, int y) {
+void rsrDrawText(Context *rsc, const char *text, int x, int y) {
     size_t textLen = strlen(text);
     rsc->mStateFont.renderText(text, textLen, x, y);
 }
@@ -246,7 +246,7 @@
     }
 }
 
-void rsrMeasureTextAlloc(Context *rsc, Script *sc, Allocation *a,
+void rsrMeasureTextAlloc(Context *rsc, Allocation *a,
                          int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
     CHECK_OBJ(a);
     const char *text = (const char *)rsc->mHal.funcs.allocation.lock1D(rsc, a);
@@ -257,7 +257,7 @@
     rsc->mHal.funcs.allocation.unlock1D(rsc, a);
 }
 
-void rsrMeasureText(Context *rsc, Script *sc, const char *text,
+void rsrMeasureText(Context *rsc, const char *text,
                     int32_t *left, int32_t *right, int32_t *top, int32_t *bottom) {
     size_t textLen = strlen(text);
     Font::Rect metrics;
@@ -265,12 +265,12 @@
     SetMetrics(&metrics, left, right, top, bottom);
 }
 
-void rsrBindFont(Context *rsc, Script *sc, Font *font) {
+void rsrBindFont(Context *rsc, Font *font) {
     CHECK_OBJ(font);
     rsi_ContextBindFont(rsc, font);
 }
 
-void rsrFontColor(Context *rsc, Script *sc, float r, float g, float b, float a) {
+void rsrFontColor(Context *rsc, float r, float g, float b, float a) {
     rsc->mStateFont.setFontColor(r, g, b, a);
 }
 
diff --git a/rs_hal.h b/rs_hal.h
index f172fbf..877fd96 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -265,7 +265,7 @@
     } framebuffer;
 
     struct {
-        bool (*init)(const Context *rsc, const ScriptGroup *sg);
+        bool (*init)(const Context *rsc, ScriptGroup *sg);
         void (*setInput)(const Context *rsc, const ScriptGroup *sg,
                          const ScriptKernelID *kid, Allocation *);
         void (*setOutput)(const Context *rsc, const ScriptGroup *sg,