Separate CPU driver impl from reference driver.

Change-Id: Ifb484edda665959b81d7b1f890d108bfa20a535d
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
new file mode 100644
index 0000000..062a916
--- /dev/null
+++ b/cpu_ref/Android.mk
@@ -0,0 +1,51 @@
+
+LOCAL_PATH:=$(call my-dir)
+# Compiler flags for this module; the two conditionals below append to them.
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable
+ifeq ($(TARGET_BUILD_PDK), true)
+  rs_base_CFLAGS += -D__RS_PDK__
+endif
+# When OVERRIDE_RS_DRIVER is set, its value is handed to the sources as a macro.
+ifneq ($(OVERRIDE_RS_DRIVER),)
+  rs_base_CFLAGS += -DOVERRIDE_RS_DRIVER=$(OVERRIDE_RS_DRIVER)
+endif
+# libRSCpuRef: shared library built with clang from the sources listed below.
+include $(CLEAR_VARS)
+LOCAL_CLANG := true
+LOCAL_MODULE := libRSCpuRef
+
+LOCAL_SRC_FILES:= \
+	rsCpuCore.cpp \
+	rsCpuScript.cpp \
+	rsCpuRuntimeMath.cpp \
+	rsCpuRuntimeStubs.cpp \
+	rsCpuScriptGroup.cpp \
+	rsCpuIntrinsic.cpp \
+	rsCpuIntrinsicBlend.cpp \
+	rsCpuIntrinsicBlur.cpp \
+	rsCpuIntrinsicColorMatrix.cpp \
+	rsCpuIntrinsicConvolve3x3.cpp \
+	rsCpuIntrinsicConvolve5x5.cpp \
+	rsCpuIntrinsicLUT.cpp \
+	rsCpuIntrinsicYuvToRGB.cpp
+# The NEON assembly source and its define are added only when ARCH_ARM_HAVE_NEON is true.
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+    LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
+    LOCAL_SRC_FILES+= \
+        rsCpuIntrinsics_neon.S
+endif
+# Shared libraries this module links against.
+LOCAL_SHARED_LIBRARIES += libRS libcutils libutils libsync
+LOCAL_SHARED_LIBRARIES += libbcc libbcinfo
+# Header search paths: the libbcc public headers and the RenderScript tree.
+LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
+LOCAL_C_INCLUDES += frameworks/rs
+# Apply the flags assembled at the top of this file.
+LOCAL_CFLAGS += $(rs_base_CFLAGS)
+
+LOCAL_LDLIBS := -lpthread -ldl
+LOCAL_MODULE_TAGS := optional
+
+include $(BUILD_SHARED_LIBRARY)
+
+
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
new file mode 100644
index 0000000..29539da
--- /dev/null
+++ b/cpu_ref/rsCpuCore.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsCpuScriptGroup.h"
+
+#include <malloc.h>
+#include "rsContext.h"
+
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <sched.h>
+#include <cutils/properties.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include "utils/StopWatch.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+typedef void (*outer_foreach_t)(  // Kernel entry point: called with the params, an x range and two strides.
+    const android::renderscript::RsForEachStubParamStruct *,
+    uint32_t x1, uint32_t x2,
+    uint32_t instep, uint32_t outstep);
+// File-level state shared by every RsdCpuReferenceImpl. The key is created in init() when the
+// count is zero and deleted in the destructor when it drops back to zero; gInitMutex guards both.
+static pthread_key_t gThreadTLSKey = 0;
+static uint32_t gThreadTLSKeyCount = 0;
+static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;
+
+RsdCpuReference::~RsdCpuReference() {  // Empty: this destructor performs no work of its own.
+}
+
+RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
+                                          uint32_t version_minor, sym_lookup_t lfn,
+                                          script_lookup_t slfn) {
+    // Factory: builds the implementation and runs init(); NULL when either step fails.
+    RsdCpuReferenceImpl *impl = new RsdCpuReferenceImpl(rsc);
+    if (impl == NULL) {
+        return NULL;
+    }
+    if (impl->init(version_major, version_minor, lfn, slfn)) {
+        return impl;
+    }
+    delete impl;
+    return NULL;
+}
+
+
+Context * RsdCpuReference::getTlsContext() {
+    void *slot = pthread_getspecific(gThreadTLSKey);  // the calling thread's ScriptTLSStruct
+    return static_cast<ScriptTLSStruct *>(slot)->mContext;
+}
+
+const Script * RsdCpuReference::getTlsScript() {
+    void *slot = pthread_getspecific(gThreadTLSKey);  // the calling thread's ScriptTLSStruct
+    return static_cast<ScriptTLSStruct *>(slot)->mScript;
+}
+
+
+////////////////////////////////////////////////////////////
+///
+
+// Records the owning context and clears all launch state; init() does the real setup.
+RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc)
+        : mRSC(rsc),
+          version_major(0),
+          version_minor(0),
+          mInForEach(false),
+          mExit(false) {
+    // The worker bookkeeping and the TLS record both start out all-zero.
+    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
+    memset(&mWorkers, 0, sizeof(mWorkers));
+}
+
+
+void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
+    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
+    // pthread entry point for one helper thread; vrsc is the RsdCpuReferenceImpl passed by init().
+    // The value returned by the atomic increment is used as this thread's slot in the worker arrays.
+    uint32_t idx = (uint32_t)android_atomic_inc(&dc->mWorkers.mLaunchCount);
+
+    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
+
+    dc->mWorkers.mLaunchSignals[idx].init();
+    dc->mWorkers.mNativeThreadId[idx] = gettid();
+
+    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));  // NOTE(review): clears the record shared by all threads, including the mContext set in init() — confirm intended
+    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
+    if (status) {
+        ALOGE("pthread_setspecific %i", status);
+    }
+
+#if 0
+    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
+    cpu_set_t cpuset;
+    memset(&cpuset, 0, sizeof(cpuset));
+    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
+    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
+              sizeof(cpuset), &cpuset);
+    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
+#endif
+    // Service loop: wait for a launch signal, run the callback if one is set, then report completion.
+    while (!dc->mExit) {
+        dc->mWorkers.mLaunchSignals[idx].wait();
+        if (dc->mWorkers.mLaunchCallback) {
+           // idx +1 is used because the calling thread is always worker 0.
+           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
+        }
+        android_atomic_dec(&dc->mWorkers.mRunningCount);
+        dc->mWorkers.mCompleteSignal.set();
+    }
+
+    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
+    return NULL;
+}
+
+void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
+    mWorkers.mLaunchData = data;  // Published to the helpers before they are signalled.
+    mWorkers.mLaunchCallback = cbk;
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);  // One decrement expected per helper.
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+
+    // We use the calling thread as one of the workers so we can start without
+    // the delay of the thread wakeup.
+    if (mWorkers.mLaunchCallback) {
+       mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
+    }
+    // Block until every helper has decremented the running count back to zero.
+    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+        mWorkers.mCompleteSignal.wait();
+    }
+}
+
+
+void RsdCpuReferenceImpl::lockMutex() {  // Acquires the file-level gInitMutex.
+    pthread_mutex_lock(&gInitMutex);
+}
+
+void RsdCpuReferenceImpl::unlockMutex() {  // Releases the file-level gInitMutex.
+    pthread_mutex_unlock(&gInitMutex);
+}
+
+// Sets up the shared TLS key, binds the calling thread's TLS record and starts the helper
+// threads. False means the TLS key or the thread attributes could not be created.
+bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
+                               sym_lookup_t lfn, script_lookup_t slfn) {
+    mSymLookupFn = lfn;
+    mScriptLookupFn = slfn;
+    // The TLS key is shared by every instance and reference counted under gInitMutex.
+    lockMutex();
+    if (!gThreadTLSKeyCount) {
+        int status = pthread_key_create(&gThreadTLSKey, NULL);
+        if (status) {
+            ALOGE("Failed to init thread tls key.");
+            unlockMutex();
+            return false;  // NOTE(review): the destructor still decrements gThreadTLSKeyCount afterwards
+        }
+    }
+    gThreadTLSKeyCount++;
+    unlockMutex();
+
+    mTlsStruct.mContext = mRSC;
+    mTlsStruct.mScript = NULL;
+    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
+    if (status) {
+        ALOGE("pthread_setspecific %i", status);
+    }
+    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    if(mRSC->props.mDebugMaxThreads) {
+        cpu = mRSC->props.mDebugMaxThreads;
+    }
+    if (cpu < 2) {
+        mWorkers.mCount = 0;
+        return true;
+    }
+
+    // Attributes come first: if this fails no worker state exists yet, and mCount stays 0
+    // so the destructor has no threads to signal or join.
+    pthread_attr_t threadAttr;
+    status = pthread_attr_init(&threadAttr);
+    if (status) {
+        ALOGE("Failed to init thread attribute.");
+        mWorkers.mCount = 0;
+        return false;
+    }
+
+    // Subtract one from the cpu count because we also use the command thread as a worker.
+    mWorkers.mCount = (uint32_t)(cpu - 1);
+    ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount);
+    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
+    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
+    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
+    mWorkers.mLaunchCallback = NULL;
+    mWorkers.mCompleteSignal.init();
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    android_atomic_release_store(0, &mWorkers.mLaunchCount);
+
+    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
+        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
+        if (status) {
+            // Unstarted threads never decrement mRunningCount; drop them so the wait ends.
+            android_atomic_add((int32_t)ct - (int32_t)mWorkers.mCount, &mWorkers.mRunningCount);
+            mWorkers.mCount = ct;
+            ALOGE("Created fewer than expected number of RS threads.");
+            break;
+        }
+    }
+    while (android_atomic_acquire_load(&mWorkers.mRunningCount) != 0) {
+        usleep(100);
+    }
+    pthread_attr_destroy(&threadAttr);
+    return true;
+}
+
+
+void RsdCpuReferenceImpl::setPriority(int32_t priority) {
+    const pid_t *tid = mWorkers.mNativeThreadId;  // one recorded gettid() value per helper
+    const pid_t *const end = tid + mWorkers.mCount;
+    for (; tid != end; ++tid) setpriority(PRIO_PROCESS, *tid, priority);
+}
+
+RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
+    mExit = true;  // Helpers leave their loop once woken with no callback; they are joined below.
+    mWorkers.mLaunchData = NULL;
+    mWorkers.mLaunchCallback = NULL;
+    android_atomic_release_store(mWorkers.mCount, &mWorkers.mRunningCount);
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        pthread_join(mWorkers.mThreadId[ct], NULL);
+    }
+    rsAssert(android_atomic_acquire_load(&mWorkers.mRunningCount) == 0);
+    free(mWorkers.mThreadId);  // No helper is left running; release what init() allocated.
+    free(mWorkers.mNativeThreadId);
+    delete[] mWorkers.mLaunchSignals;
+    // Global structure cleanup.
+    lockMutex();
+    --gThreadTLSKeyCount;
+    if (!gThreadTLSKeyCount) {
+        pthread_key_delete(gThreadTLSKey);
+    }
+    unlockMutex();
+}
+
+typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);  // NOTE(review): no use of rs_t is visible in this file — confirm it is still needed
+
+// Worker callback for 2D launches: claims slices of rows from the shared slice counter and
+// runs the kernel on every row of each slice until no rows remain. idx is stored in p.lid.
+static void wc_xy(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = static_cast<MTLaunchStruct *>(usr);
+    RsForEachStubParamStruct p;
+    memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
+
+    const outer_foreach_t fn = reinterpret_cast<outer_foreach_t>(mtls->kernel);
+    for (;;) {
+        const uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        const uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        const uint32_t yEnd = rsMin(yStart + mtls->mSliceSize, mtls->yEnd);
+        if (yEnd <= yStart) {
+            return;  // Past the last row: nothing left to claim.
+        }
+
+        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+        for (p.y = yStart; p.y < yEnd; p.y++) {
+            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
+                    (mtls->fep.eStrideOut * mtls->xStart);
+            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
+                   (mtls->fep.eStrideIn * mtls->xStart);
+            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+        }
+    }
+}
+
+// Worker callback for 1D launches: claims slices of cells along X from the shared slice
+// counter and runs the kernel once per slice until the range is used up. idx becomes p.lid.
+static void wc_x(void *usr, uint32_t idx) {
+    MTLaunchStruct *mtls = static_cast<MTLaunchStruct *>(usr);
+    RsForEachStubParamStruct p;
+    memcpy(&p, &mtls->fep, sizeof(p));
+    p.lid = idx;
+
+    const outer_foreach_t fn = reinterpret_cast<outer_foreach_t>(mtls->kernel);
+    for (;;) {
+        const uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        const uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
+        const uint32_t xEnd = rsMin(xStart + mtls->mSliceSize, mtls->xEnd);
+        if (xEnd <= xStart) {
+            return;  // Past the end of the range: nothing left to claim.
+        }
+
+        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
+        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+
+        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    }
+}
+
+// Chooses how many rows (2D) or cells (1D) a worker claims per atomic slice request.
+//
+//   dim        extent of the dimension being split
+//   workers    number of helper threads; the calling thread is counted on top of these
+//   strideOut  byte stride of the output along that dimension, 0 if there is none
+//   strideIn   byte stride of the input along that dimension, used when strideOut is 0
+//
+// Returns a value >= 1.
+static uint32_t rsdPickSliceSize(uint32_t dim, uint32_t workers,
+                                 size_t strideOut, size_t strideIn) {
+    const size_t targetByteChunk = 16 * 1024;
+    uint32_t sliceSize = dim / ((workers + 1) * 4);
+
+    // This chooses our slice size to rate limit atomic ops to
+    // one per 16k bytes of reads/writes.
+    const size_t stride = strideOut ? strideOut : strideIn;
+    if (stride) {
+        sliceSize = rsMin(sliceSize, (uint32_t)(targetByteChunk / stride));
+    }
+
+    // A launch with neither stride set has no byte budget to apply; dividing by the zero
+    // stride would fault, so the split by dimension is kept as is.
+    return (sliceSize < 1) ? 1 : sliceSize;
+}
+
+// Runs the kernel in mtls over its launch bounds. When helper threads exist, the launch is
+// threadable and mInForEach is clear, the work is sliced across the worker pool (by rows
+// when dimY > 1, otherwise by cells along X); otherwise it runs serially on the calling
+// thread over the array, z and y ranges. ain, aout and sc are not used here.
+void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
+                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
+
+    //android::StopWatch kernel_time("kernel time");
+
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        // While mInForEach is set, any further launch on this object takes the serial path.
+        mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            mtls->mSliceSize = rsdPickSliceSize(mtls->fep.dimY, mWorkers.mCount,
+                                                mtls->fep.yStrideOut, mtls->fep.yStrideIn);
+            launchThreads(wc_xy, mtls);
+        } else {
+            mtls->mSliceSize = rsdPickSliceSize(mtls->fep.dimX, mWorkers.mCount,
+                                                mtls->fep.eStrideOut, mtls->fep.eStrideIn);
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+
+        //ALOGE("launch 1");
+    } else {
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+
+        //ALOGE("launch 3");
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    // Row index in yStride units, counted across z and array slices.
+                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
+                                      mtls->fep.dimY * p.z + p.y;
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
+                            (mtls->fep.eStrideOut * mtls->xStart);
+                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
+                           (mtls->fep.eStrideIn * mtls->xStart);
+                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+                }
+            }
+        }
+    }
+}
+
+// Installs sc as the script implementation in the calling thread's TLS record, refreshes
+// the record's context and script pointers, and returns the implementation it replaced.
+RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
+    void *slot = pthread_getspecific(gThreadTLSKey);
+    ScriptTLSStruct *tls = static_cast<ScriptTLSStruct *>(slot);
+    rsAssert(tls);
+
+    RsdCpuScriptImpl *previous = tls->mImpl;
+    tls->mContext = mRSC;
+    tls->mImpl = sc;
+    // A NULL sc leaves the record without a current script.
+    tls->mScript = (sc != NULL) ? sc->getScript() : NULL;
+    return previous;
+}
+
+const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
+    return mSymLookupFn(mRSC, name);  // Delegates to the lookup callback stored by init().
+}
+
+
+RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
+                                    char const *resName, char const *cacheDir,
+                                    uint8_t const *bitcode, size_t bitcodeSize,
+                                    uint32_t flags) {
+    // Builds the CPU script object for s and initializes it; NULL when init() fails.
+    RsdCpuScriptImpl *impl = new RsdCpuScriptImpl(this, s);
+    if (impl->init(resName, cacheDir, bitcode, bitcodeSize, flags)) {
+        return impl;
+    }
+    delete impl;
+    return NULL;
+}
+
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);  // Factories called by createIntrinsic(); presumably defined in the per-intrinsic sources — verify.
+extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s);
+
+// Maps an intrinsic id to the factory that builds its CPU implementation. An id with no
+// case trips rsAssert and yields NULL. The element argument is not consulted here.
+RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
+                                    RsScriptIntrinsicID iid, Element *e) {
+    switch (iid) {
+    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
+        return rsdIntrinsic_Convolve3x3(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
+        return rsdIntrinsic_ColorMatrix(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_LUT:
+        return rsdIntrinsic_LUT(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
+        return rsdIntrinsic_Convolve5x5(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_BLUR:
+        return rsdIntrinsic_Blur(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
+        return rsdIntrinsic_YuvToRGB(this, s);
+
+    case RS_SCRIPT_INTRINSIC_ID_BLEND:
+        return rsdIntrinsic_Blend(this, s);
+
+    default:
+        rsAssert(0);
+        break;
+    }
+    // Only reached for an id that has no case above.
+    return NULL;
+}
+
+RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
+    CpuScriptGroupImpl *group = new CpuScriptGroupImpl(this, sg);
+    if (group->init()) {
+        return group;  // initialized successfully
+    }
+    delete group;  // init() failed; there is nothing to hand back
+    return NULL;
+}
+
+
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
new file mode 100644
index 0000000..4883591
--- /dev/null
+++ b/cpu_ref/rsCpuCore.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_CORE_H
+#define RSD_CPU_CORE_H
+
+#include "rsd_cpu.h"
+#include "rsSignal.h"
+#include "rsContext.h"
+#include "rsElement.h"
+#include "rsScriptC.h"
+
+namespace bcc {  // Forward declarations only; no use of these types is visible in this header.
+    class BCCContext;
+    class RSCompilerDriver;
+    class RSExecutable;
+}
+
+namespace android {
+namespace renderscript {
+
+
+typedef void (* InvokeFunc_t)(void);
+typedef void (* ForEachFunc_t)(void);  // Generic kernel pointer type; MTLaunchStruct::kernel is cast to the real signature before the call.
+typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);  // Per-worker callback: usr is the launch data, idx the worker index (0 = calling thread).
+// Forward declarations for the pointer members used by the structs below.
+class RsdCpuScriptImpl;
+class RsdCpuReferenceImpl;
+
+typedef struct ScriptTLSStructRec {  // Record each thread registers under the driver's pthread TLS key.
+    android::renderscript::Context * mContext;  // Returned by RsdCpuReference::getTlsContext().
+    const android::renderscript::Script * mScript;  // Returned by RsdCpuReference::getTlsScript().
+    RsdCpuScriptImpl *mImpl;  // Swapped by RsdCpuReferenceImpl::setTLS().
+} ScriptTLSStruct;
+
+typedef struct {  // Everything one forEach launch needs; shared by all workers of that launch.
+    RsForEachStubParamStruct fep;  // Template parameters; each worker copies it before editing.
+    // Owning driver object and the script being launched.
+    RsdCpuReferenceImpl *rsc;
+    RsdCpuScriptImpl *script;
+    // Kernel entry point (cast to outer_foreach_t before the call) and launch allocations.
+    ForEachFunc_t kernel;
+    uint32_t sig;
+    const Allocation * ain;
+    Allocation * aout;
+    // Slice bookkeeping: workers atomically increment mSliceNum to claim mSliceSize units.
+    uint32_t mSliceSize;
+    volatile int mSliceNum;
+    bool isThreadable;  // When false the launch runs serially on the calling thread.
+    // Launch bounds; each loop runs from Start while the index is below End.
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+} MTLaunchStruct;
+
+
+
+
+class RsdCpuReferenceImpl : public RsdCpuReference {  // Concrete CPU driver: owns the helper threads and creates script objects.
+public:
+    virtual ~RsdCpuReferenceImpl();
+    RsdCpuReferenceImpl(Context *);
+    // Lock/unlock the file-level mutex that guards the shared TLS key bookkeeping.
+    void lockMutex();
+    void unlockMutex();
+    // init() creates the TLS key if needed and starts the helper threads; false on failure.
+    bool init(uint32_t version_major, uint32_t version_minor, sym_lookup_t, script_lookup_t);
+    virtual void setPriority(int32_t priority);
+    virtual void launchThreads(WorkerCallback_t cbk, void *data);  // Runs cbk on every worker and waits for all of them.
+    static void * helperThreadProc(void *vrsc);  // pthread entry point of a helper thread.
+    RsdCpuScriptImpl * setTLS(RsdCpuScriptImpl *sc);  // Returns the previously installed implementation.
+
+    Context * getContext() {return mRSC;}
+    // Launches the kernel described by mtls, threaded when possible, otherwise serially.
+    void launchThreads(const Allocation * ain, Allocation * aout,
+                       const RsScriptCall *sc, MTLaunchStruct *mtls);
+    // Factory methods; createScript and createScriptGroup return NULL when init() of the new object fails.
+    virtual CpuScript * createScript(const ScriptC *s,
+                                     char const *resName, char const *cacheDir,
+                                     uint8_t const *bitcode, size_t bitcodeSize,
+                                     uint32_t flags);
+    virtual CpuScript * createIntrinsic(const Script *s,
+                                        RsScriptIntrinsicID iid, Element *e);
+    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg);
+    // Both lookups forward to the callbacks handed to init().
+    const RsdCpuReference::CpuSymbol *symLookup(const char *);
+
+    RsdCpuReference::CpuScript * lookupScript(const Script *s) {
+        return mScriptLookupFn(mRSC, s);
+    }
+
+
+protected:
+    Context *mRSC;
+    uint32_t version_major;  // Set to 0 by the constructor; init() does not store its arguments here.
+    uint32_t version_minor;
+    //bool mHasGraphics;
+    bool mInForEach;  // True while a threaded launch is running.
+    // Helper-thread bookkeeping; the arrays are sized by mCount in init().
+    struct Workers {
+        volatile int mRunningCount;  // Helpers that still have to report completion.
+        volatile int mLaunchCount;  // Incremented by each helper to obtain its slot index.
+        uint32_t mCount;  // Number of helper threads (the calling thread is not included).
+        pthread_t *mThreadId;
+        pid_t *mNativeThreadId;  // gettid() of each helper, used by setPriority().
+        Signal mCompleteSignal;
+        Signal *mLaunchSignals;
+        WorkerCallback_t mLaunchCallback;
+        void *mLaunchData;
+    };
+    Workers mWorkers;
+    bool mExit;  // Set by the destructor so the helper loops terminate.
+    sym_lookup_t mSymLookupFn;
+    script_lookup_t mScriptLookupFn;
+    // The single TLS record that every thread of this driver registers.
+    ScriptTLSStruct mTlsStruct;
+};
+
+
+}
+}
+
+#endif
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
new file mode 100644
index 0000000..a4eef21
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+RsdCpuScriptIntrinsic::RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s,
+                                             RsScriptIntrinsicID iid)
+        : RsdCpuScriptImpl(ctx, s) {
+    mID = iid;  // which intrinsic this object implements
+    mRootPtr = NULL;  // no kernel until one is assigned; never leave the pointer indeterminate
+}
+
+RsdCpuScriptIntrinsic::~RsdCpuScriptIntrinsic() {  // Empty: this destructor performs no work of its own.
+}
+
+void RsdCpuScriptIntrinsic::invokeFunction(uint32_t slot, const void *params, size_t paramLength) {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::invokeFunction");
+}
+
+int RsdCpuScriptIntrinsic::invokeRoot() {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::invokeRoot");
+    return 0;
+}
+
+void RsdCpuScriptIntrinsic::invokeInit() {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::invokeInit");
+}
+
+void RsdCpuScriptIntrinsic::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::setGlobalVar");
+}
+
+void RsdCpuScriptIntrinsic::setGlobalVarWithElemDims(uint32_t slot, const void *data,
+                                                     size_t dataLength, const Element *e,
+                                                     const size_t *dims, size_t dimLength) {
+    static const char kMessage[] = "Unexpected RsdCpuScriptIntrinsic::setGlobalVarWithElemDims";
+    mCtx->getContext()->setError(RS_ERROR_FATAL_DRIVER, kMessage);  // reported, nothing is stored
+}
+
+void RsdCpuScriptIntrinsic::setGlobalBind(uint32_t slot, Allocation *data) {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::setGlobalBind");
+}
+
+void RsdCpuScriptIntrinsic::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    Context *rsc = mCtx->getContext();  // the call is only reported as a fatal driver error
+    rsc->setError(RS_ERROR_FATAL_DRIVER, "Unexpected RsdCpuScriptIntrinsic::setGlobalObj");
+}
+
+void RsdCpuScriptIntrinsic::invokeFreeChildren() {  // Deliberate no-op: unlike its siblings it raises no error.
+}
+
+
+// Runs the intrinsic's kernel over the launch described by ain/aout/sc. The usr pointer
+// handed to the kernel is this object rather than the caller's usr data.
+void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
+                                          const Allocation * ain,
+                                          Allocation * aout,
+                                          const void * usr,
+                                          uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+    MTLaunchStruct launch;
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &launch);
+    // Same script/slot/kernel/usr wiring that forEachKernelSetup() performs; the qualified
+    // call keeps it bound to this class's version.
+    RsdCpuScriptIntrinsic::forEachKernelSetup(slot, &launch);
+
+    // Make this script current in TLS for the duration of the launch, then restore.
+    RsdCpuScriptImpl *previous = mCtx->setTLS(this);
+    mCtx->launchThreads(ain, aout, sc, &launch);
+    mCtx->setTLS(previous);
+}
+
+void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+    MTLaunchStruct &launch = *mtls;  // wire the launch to this intrinsic
+    launch.kernel = reinterpret_cast<ForEachFunc_t>(mRootPtr);
+    launch.script = this;
+    launch.fep.usr = this;
+    launch.fep.slot = slot;
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
new file mode 100644
index 0000000..1756115
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_SCRIPT_INTRINSIC_H
+#define RSD_CPU_SCRIPT_INTRINSIC_H
+
+#include "rsCpuScript.h"
+
+
+namespace android {
+namespace renderscript {
+
+
+class RsdCpuScriptIntrinsic : public RsdCpuScriptImpl {  // Common base for the built-in intrinsic scripts.
+public:
+    virtual void populateScript(Script *) = 0;  // Pure virtual: every concrete intrinsic must provide it.
+    // Except for invokeForEach, forEachKernelSetup and invokeFreeChildren, the overrides below raise RS_ERROR_FATAL_DRIVER.
+    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual int invokeRoot();
+    virtual void invokeForEach(uint32_t slot,
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);  // Points mtls at this object and mRootPtr.
+    virtual void invokeInit();
+    virtual void invokeFreeChildren();  // No-op.
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                  const Element *e, const size_t *dims, size_t dimLength);
+    virtual void setGlobalBind(uint32_t slot, Allocation *data);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsic();
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, RsScriptIntrinsicID iid);
+
+protected:
+    RsScriptIntrinsicID mID;  // Intrinsic id given to the constructor.
+    outer_foreach_t mRootPtr;  // Kernel used by launches; not set by this class's constructor — presumably assigned by subclasses, verify.
+
+};
+
+
+
+}
+}
+
+#endif
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
new file mode 100644
index 0000000..57286d5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the blend intrinsic.  The blend mode is not stored in
+// the object: kernel() selects it from p->slot on every call.
+class RsdCpuScriptIntrinsicBlend : public RsdCpuScriptIntrinsic {
+public:
+    // Reports that this intrinsic exports no variables.
+    virtual void populateScript(Script *);
+
+    virtual ~RsdCpuScriptIntrinsicBlend();
+    RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Blends the pixels [xstart, xend) of p->in into p->out.
+    static void kernel(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+// Blend modes.  The value is the kernel slot number that
+// RsdCpuScriptIntrinsicBlend::kernel() switches on.  Only BLEND_CLEAR through
+// BLEND_XOR, BLEND_MULTIPLY, BLEND_ADD and BLEND_SUBTRACT are implemented
+// there; every other mode logs an error and asserts.
+enum {
+    BLEND_CLEAR = 0,
+    BLEND_SRC = 1,
+    BLEND_DST = 2,
+    BLEND_SRC_OVER = 3,
+    BLEND_DST_OVER = 4,
+    BLEND_SRC_IN = 5,
+    BLEND_DST_IN = 6,
+    BLEND_SRC_OUT = 7,
+    BLEND_DST_OUT = 8,
+    BLEND_SRC_ATOP = 9,
+    BLEND_DST_ATOP = 10,
+    BLEND_XOR = 11,
+
+    BLEND_NORMAL = 12,
+    BLEND_AVERAGE = 13,
+    BLEND_MULTIPLY = 14,
+    BLEND_SCREEN = 15,
+    BLEND_DARKEN = 16,
+    BLEND_LIGHTEN = 17,
+    BLEND_OVERLAY = 18,
+    BLEND_HARDLIGHT = 19,
+    BLEND_SOFTLIGHT = 20,
+    BLEND_DIFFERENCE = 21,
+    BLEND_NEGATION = 22,
+    BLEND_EXCLUSION = 23,
+    BLEND_COLOR_DODGE = 24,
+    BLEND_INVERSE_COLOR_DODGE = 25,
+    BLEND_SOFT_DODGE = 26,
+    BLEND_COLOR_BURN = 27,
+    BLEND_INVERSE_COLOR_BURN = 28,
+    BLEND_SOFT_BURN = 29,
+    BLEND_REFLECT = 30,
+    BLEND_GLOW = 31,
+    BLEND_FREEZE = 32,
+    BLEND_HEAT = 33,
+    BLEND_ADD = 34,
+    BLEND_SUBTRACT = 35,
+    BLEND_STAMP = 36,
+    BLEND_RED = 37,
+    BLEND_GREEN = 38,
+    BLEND_BLUE = 39,
+    BLEND_HUE = 40,
+    BLEND_SATURATION = 41,
+    BLEND_COLOR = 42,
+    BLEND_LUMINOSITY = 43
+};
+
+// Externally defined blend kernels, only called when ARCH_ARM_HAVE_NEON is
+// defined.  Each blends `src` into `dst` in place; `count8` is the number of
+// 8-pixel groups to process (callers pass (x2 - x1) >> 3).
+// NOTE(review): presumably implemented in rsCpuIntrinsics_neon.S -- confirm.
+extern "C" void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
+extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
+
+//#undef ARCH_ARM_HAVE_NEON
+
+// Blends the uchar4 pixels [xstart, xend) of p->in (source) into p->out
+// (destination), in place.  The blend mode is taken from p->slot (one of the
+// BLEND_* values).  Unimplemented modes log an error and assert without
+// touching the output.
+//
+// When ARCH_ARM_HAVE_NEON is defined, runs longer than 8 pixels are first
+// handed to the matching external kernel in groups of 8; the scalar loop
+// after it finishes whatever is left.
+//
+// All channel arithmetic is 8-bit fixed point: products are scaled back with
+// a shift by 8.
+void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
+                                        uint32_t xstart, uint32_t xend,
+                                        uint32_t instep, uint32_t outstep) {
+    // instep/outstep can be ignored--sizeof(uchar4) known at compile time
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    switch (p->slot) {
+    case BLEND_CLEAR:
+        for (;x1 < x2; x1++, out++) {
+            *out = 0;
+        }
+        break;
+    case BLEND_SRC:
+        for (;x1 < x2; x1++, out++, in++) {
+          *out = *in;
+        }
+        break;
+    //BLEND_DST is a NOP
+    case BLEND_DST:
+        break;
+    case BLEND_SRC_OVER:
+        // dst = src + dst * (1 - src.a)
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendSrcOver_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            in_s = in_s + ((out_s * (short4)(255 - in_s.a)) >> (short4)8);
+            *out = convert_uchar4(in_s);
+        }
+        break;
+    case BLEND_DST_OVER:
+        // dst = dst + src * (1 - dst.a)
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendDstOver_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            in_s = out_s + ((in_s * (short4)(255 - out_s.a)) >> (short4)8);
+            *out = convert_uchar4(in_s);
+        }
+        break;
+    case BLEND_SRC_IN:
+        // dst = src * dst.a
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendSrcIn_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            in_s = (in_s * out->a) >> (short4)8;
+            *out = convert_uchar4(in_s);
+        }
+        break;
+    case BLEND_DST_IN:
+        // dst = dst * src.a
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendDstIn_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 out_s = convert_short4(*out);
+            out_s = (out_s * in->a) >> (short4)8;
+            *out = convert_uchar4(out_s);
+        }
+        break;
+    case BLEND_SRC_OUT:
+        // dst = src * (1 - dst.a)
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendSrcOut_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            in_s = (in_s * (short4)(255 - out->a)) >> (short4)8;
+            *out = convert_uchar4(in_s);
+        }
+        break;
+    case BLEND_DST_OUT:
+        // dst = dst * (1 - src.a)
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendDstOut_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 out_s = convert_short4(*out);
+            out_s = (out_s * (short4)(255 - in->a)) >> (short4)8;
+            *out = convert_uchar4(out_s);
+        }
+        break;
+    case BLEND_SRC_ATOP:
+        // dst.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a); dst.a is kept
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendSrcAtop_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            out_s.rgb = ((in_s.rgb * out_s.a) +
+              (out_s.rgb * ((short3)255 - (short3)in_s.a))) >> (short3)8;
+            *out = convert_uchar4(out_s);
+        }
+        break;
+    case BLEND_DST_ATOP:
+        // dst.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a); dst.a = src.a
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendDstAtop_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            short4 in_s = convert_short4(*in);
+            short4 out_s = convert_short4(*out);
+            out_s.rgb = ((out_s.rgb * in_s.a) +
+              (in_s.rgb * ((short3)255 - (short3)out_s.a))) >> (short3)8;
+            // The result alpha of DST_ATOP is the source alpha; without this
+            // the destination alpha was left in place.
+            // NOTE(review): confirm rsdIntrinsicBlendDstAtop_K agrees.
+            out_s.a = in_s.a;
+            *out = convert_uchar4(out_s);
+        }
+        break;
+    case BLEND_XOR:
+        // Bitwise exclusive-or of the two pixels
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendXor_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            *out = *in ^ *out;
+        }
+        break;
+    case BLEND_NORMAL:
+        ALOGE("Called unimplemented blend intrinsic BLEND_NORMAL");
+        rsAssert(false);
+        break;
+    case BLEND_AVERAGE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_AVERAGE");
+        rsAssert(false);
+        break;
+    case BLEND_MULTIPLY:
+        // dst = src * dst
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendMultiply_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+          *out = convert_uchar4((convert_short4(*in) * convert_short4(*out))
+                                >> (short4)8);
+        }
+        break;
+    case BLEND_SCREEN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_SCREEN");
+        rsAssert(false);
+        break;
+    case BLEND_DARKEN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_DARKEN");
+        rsAssert(false);
+        break;
+    case BLEND_LIGHTEN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_LIGHTEN");
+        rsAssert(false);
+        break;
+    case BLEND_OVERLAY:
+        ALOGE("Called unimplemented blend intrinsic BLEND_OVERLAY");
+        rsAssert(false);
+        break;
+    case BLEND_HARDLIGHT:
+        ALOGE("Called unimplemented blend intrinsic BLEND_HARDLIGHT");
+        rsAssert(false);
+        break;
+    case BLEND_SOFTLIGHT:
+        ALOGE("Called unimplemented blend intrinsic BLEND_SOFTLIGHT");
+        rsAssert(false);
+        break;
+    case BLEND_DIFFERENCE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_DIFFERENCE");
+        rsAssert(false);
+        break;
+    case BLEND_NEGATION:
+        ALOGE("Called unimplemented blend intrinsic BLEND_NEGATION");
+        rsAssert(false);
+        break;
+    case BLEND_EXCLUSION:
+        ALOGE("Called unimplemented blend intrinsic BLEND_EXCLUSION");
+        rsAssert(false);
+        break;
+    case BLEND_COLOR_DODGE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_COLOR_DODGE");
+        rsAssert(false);
+        break;
+    case BLEND_INVERSE_COLOR_DODGE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_INVERSE_COLOR_DODGE");
+        rsAssert(false);
+        break;
+    case BLEND_SOFT_DODGE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_SOFT_DODGE");
+        rsAssert(false);
+        break;
+    case BLEND_COLOR_BURN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_COLOR_BURN");
+        rsAssert(false);
+        break;
+    case BLEND_INVERSE_COLOR_BURN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_INVERSE_COLOR_BURN");
+        rsAssert(false);
+        break;
+    case BLEND_SOFT_BURN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_SOFT_BURN");
+        rsAssert(false);
+        break;
+    case BLEND_REFLECT:
+        ALOGE("Called unimplemented blend intrinsic BLEND_REFLECT");
+        rsAssert(false);
+        break;
+    case BLEND_GLOW:
+        ALOGE("Called unimplemented blend intrinsic BLEND_GLOW");
+        rsAssert(false);
+        break;
+    case BLEND_FREEZE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_FREEZE");
+        rsAssert(false);
+        break;
+    case BLEND_HEAT:
+        ALOGE("Called unimplemented blend intrinsic BLEND_HEAT");
+        rsAssert(false);
+        break;
+    case BLEND_ADD:
+        // dst = min(dst + src, 255) per channel
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendAdd_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            uint32_t iR = in->r, iG = in->g, iB = in->b, iA = in->a,
+                oR = out->r, oG = out->g, oB = out->b, oA = out->a;
+            out->r = (oR + iR) > 255 ? 255 : oR + iR;
+            out->g = (oG + iG) > 255 ? 255 : oG + iG;
+            out->b = (oB + iB) > 255 ? 255 : oB + iB;
+            out->a = (oA + iA) > 255 ? 255 : oA + iA;
+        }
+        break;
+    case BLEND_SUBTRACT:
+        // dst = max(dst - src, 0) per channel
+#if defined(ARCH_ARM_HAVE_NEON)
+        if((x1 + 8) < x2) {
+            uint32_t len = (x2 - x1) >> 3;
+            rsdIntrinsicBlendSub_K(out, in, len);
+            x1 += len << 3;
+            out += len << 3;
+            in += len << 3;
+        }
+#endif
+        for (;x1 < x2; x1++, out++, in++) {
+            int32_t iR = in->r, iG = in->g, iB = in->b, iA = in->a,
+                oR = out->r, oG = out->g, oB = out->b, oA = out->a;
+            out->r = (oR - iR) < 0 ? 0 : oR - iR;
+            out->g = (oG - iG) < 0 ? 0 : oG - iG;
+            out->b = (oB - iB) < 0 ? 0 : oB - iB;
+            out->a = (oA - iA) < 0 ? 0 : oA - iA;
+        }
+        break;
+    case BLEND_STAMP:
+        ALOGE("Called unimplemented blend intrinsic BLEND_STAMP");
+        rsAssert(false);
+        break;
+    case BLEND_RED:
+        ALOGE("Called unimplemented blend intrinsic BLEND_RED");
+        rsAssert(false);
+        break;
+    case BLEND_GREEN:
+        ALOGE("Called unimplemented blend intrinsic BLEND_GREEN");
+        rsAssert(false);
+        break;
+    case BLEND_BLUE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_BLUE");
+        rsAssert(false);
+        break;
+    case BLEND_HUE:
+        ALOGE("Called unimplemented blend intrinsic BLEND_HUE");
+        rsAssert(false);
+        break;
+    case BLEND_SATURATION:
+        ALOGE("Called unimplemented blend intrinsic BLEND_SATURATION");
+        rsAssert(false);
+        break;
+    case BLEND_COLOR:
+        ALOGE("Called unimplemented blend intrinsic BLEND_COLOR");
+        rsAssert(false);
+        break;
+    case BLEND_LUMINOSITY:
+        ALOGE("Called unimplemented blend intrinsic BLEND_LUMINOSITY");
+        rsAssert(false);
+        break;
+
+    default:
+        ALOGE("Called unimplemented value %d", p->slot);
+        rsAssert(false);
+        break;
+
+    }
+}
+
+
+// Registers the blend kernel as this script's kernel entry point.
+RsdCpuScriptIntrinsicBlend::RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLEND) {
+    mRootPtr = &RsdCpuScriptIntrinsicBlend::kernel;
+}
+
+// The blend intrinsic declares no data members, so there is nothing to do here.
+RsdCpuScriptIntrinsicBlend::~RsdCpuScriptIntrinsicBlend() {}
+
+// The blend intrinsic takes no parameters, so it exports no variables.
+void RsdCpuScriptIntrinsicBlend::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 0;
+}
+
+// Factory for the blend intrinsic.  Returns a heap-allocated object created
+// with new.
+RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicBlend *impl = new RsdCpuScriptIntrinsicBlend(ctx, s);
+    return impl;
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
new file mode 100644
index 0000000..48363d1
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the blur intrinsic: a separable Gaussian blur that
+// reads from a bound input allocation and writes the kernel's output row.
+class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    // Drops the reference to the input allocation.
+    virtual void invokeFreeChildren();
+
+    // Slot 0: blur radius (a float); recomputes the weights.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    // Slot 1: input allocation.
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicBlur();
+    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Normalized Gaussian weights; entries [0 .. 2 * iradius] are valid.
+    float fp[104];
+    // Integer counterpart of fp, written by ComputeGaussianWeights().
+    short ip[104];
+    // Radius as set by the caller.
+    float radius;
+    // Integer radius derived from `radius` by ComputeGaussianWeights().
+    int iradius;
+    // Input image; the kernel skips its work while this is unset.
+    ObjectBaseRef<Allocation> alloc;
+
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+    void ComputeGaussianWeights();
+};
+
+}
+}
+
+
+// Rebuilds the 1-D Gaussian kernel for the current `radius`.
+//
+// Updates `iradius`, fills fp[0 .. 2 * iradius] with weights that sum to one,
+// and fills the same range of ip[] with those weights scaled by 32768.
+//
+// NOTE(review): fp and ip hold 104 entries, so this only stays in bounds while
+// iradius <= 51; nothing here enforces that.  A single weight of exactly 1.0
+// (iradius == 0) would also not fit in a short after scaling.
+void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
+    // Compute gaussian weights for the blur
+    // e is the euler's number
+    float e = 2.718281828459045f;
+    float pi = 3.1415926535897932f;
+    // g(x) = ( 1 / ( sqrt( 2 * pi ) * sigma ) ) * e ^ ( -x^2 / ( 2 * sigma^2 ) )
+    // x is of the form [-radius .. 0 .. radius]
+    // and sigma varies with radius.
+    // Based on some experimental radius values and sigma's
+    // we approximately fit sigma = f(radius) as
+    // sigma = radius * 0.4  + 0.6
+    // The larger the radius gets, the more our gaussian blur
+    // will resemble a box blur since with large sigma
+    // the gaussian curve begins to lose its shape
+    float sigma = 0.4f * radius + 0.6f;
+
+    // Now compute the coefficients. We will store some redundant values to save
+    // some math during the blur calculations precompute some values
+    float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
+    float coeff2 = - 1.0f / (2.0f * sigma * sigma);
+
+    float normalizeFactor = 0.0f;
+    float floatR = 0.0f;
+    int r;
+    iradius = (float)ceil(radius) + 0.5f;
+    for (r = -iradius; r <= iradius; r ++) {
+        floatR = (float)r;
+        fp[r + iradius] = coeff1 * powf(e, floatR * floatR * coeff2);
+        normalizeFactor += fp[r + iradius];
+    }
+
+    //Now we need to normalize the weights because all our coefficients need to add up to one
+    normalizeFactor = 1.0f / normalizeFactor;
+    for (r = -iradius; r <= iradius; r ++) {
+        fp[r + iradius] *= normalizeFactor;
+        // Derive the integer weight from the normalized float weight.  This
+        // used to read ip[] itself, which has not been written at this point.
+        ip[r + iradius] = (short)(fp[r + iradius] * 32768);
+    }
+}
+
+// Binds the allocation the blur reads from.  Slot 1 is the only slot expected.
+void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+    Allocation *input = static_cast<Allocation *>(data);
+    alloc.set(input);
+}
+
+// Sets the blur radius from the first float in `data` and rebuilds the
+// weights.  Slot 0 is the only slot expected; `dataLength` is not examined.
+void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    rsAssert(slot == 0);
+    const float *value = (const float *)data;
+    radius = *value;
+    ComputeGaussianWeights();
+}
+
+
+
+// Vertical blur of one pixel with edge clamping.
+//
+// Accumulates the weighted uchar4 pixels of column `x` over rows
+// y - iradius .. y + iradius, clamping each row to [0, p->dimY - 1], and
+// stores the float4 result in *out.  `ptrIn` is the start of the image,
+// `iStride` the distance between rows in bytes, and `gPtr` the first of
+// 2 * iradius + 1 weights.
+static void OneV(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+                 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
+    const uchar *column = ptrIn + x*4;
+    const int lastRow = (int)(p->dimY - 1);
+
+    float4 acc = 0;
+    for (int r = -iradius; r <= iradius; r++, gPtr++) {
+        const int row = rsMin(rsMax((y + r), 0), lastRow);
+        const uchar4 *src = (const uchar4 *)&column[row * iStride];
+        acc += convert_float4(src[0]) * gPtr[0];
+    }
+
+    out->xyzw = acc;
+}
+
+// Externally defined vertical (VF) and horizontal (HF) blur kernels, only
+// called when ARCH_ARM_HAVE_NEON is defined.  `rct` is the number of weights
+// in `gptr`; x1/x2 bound the pixel range to process.
+// NOTE(review): how these index dst/pin from x1 is not visible here -- confirm
+// against the assembly before changing callers.
+extern "C" void rsdIntrinsicBlurVF_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int x2);
+extern "C" void rsdIntrinsicBlurHF_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int x2);
+
+// Vertical blur of the pixels [x1, x2) without edge clamping.
+//
+// `ptrIn` points at column 0 of the first of `ct` consecutive input rows
+// (`iStride` bytes apart) and `gPtr` at the `ct` weights.  Results are stored
+// consecutively starting at out[0], one float4 per pixel.
+static void OneVF(float4 *out,
+                  const uchar *ptrIn, int iStride, const float* gPtr, int ct,
+                  int x1, int x2) {
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    {
+        // The external kernel handles an even number of pixels.
+        int t = (x2 - x1);
+        t &= ~1;
+        if(t) {
+            rsdIntrinsicBlurVF_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
+        }
+        x1 += t;
+        // Skip the results just produced so the leftover pixel below does not
+        // overwrite out[0].
+        // NOTE(review): assumes the kernel stores its t results starting at
+        // `out` -- confirm against the assembly.
+        out += t;
+    }
+#endif
+
+    while(x2 > x1) {
+        // Start at column x1 (4 bytes per pixel); previously every pixel was
+        // computed from column 0.
+        const uchar *pi = ptrIn + x1 * 4;
+        float4 blurredPixel = 0;
+        const float* gp = gPtr;
+
+        for (int r = 0; r < ct; r++) {
+            float4 pf = convert_float4(((const uchar4 *)pi)[0]);
+            blurredPixel += pf * gp[0];
+            pi += iStride;
+            gp++;
+        }
+        out->xyzw = blurredPixel;
+        x1++;
+        out++;
+    }
+}
+
+// Horizontal blur of one pixel with edge clamping.
+//
+// Accumulates the weighted float4 entries of `ptrIn` at columns
+// x - iradius .. x + iradius, clamping each column to [0, p->dimX - 1], and
+// stores the result in *out converted to uchar4.  `gPtr` is the first of
+// 2 * iradius + 1 weights.
+static void OneH(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+                const float4 *ptrIn, const float* gPtr, int iradius) {
+    const int lastColumn = (int)(p->dimX - 1);
+
+    float4 acc = 0;
+    for (int r = -iradius; r <= iradius; r++, gPtr++) {
+        const int column = rsMin(rsMax((x + r), 0), lastColumn);
+        acc += ptrIn[column] * gPtr[0];
+    }
+
+    out->xyzw = convert_uchar4(acc);
+}
+
+
+// Blurs the pixels [xstart, xend) of row p->y into p->out.
+//
+// Works in two passes: a vertical pass that writes float4 results into a
+// scratch row on the stack, then a horizontal pass over that scratch row that
+// produces the uchar4 output.  Rows or columns close to an image edge go
+// through the clamping helpers (OneV / OneH); the interior uses the faster
+// non-clamping code.
+//
+// Does nothing (apart from logging) when no input allocation is bound or when
+// the row does not fit in the scratch buffer.
+//
+// NOTE(review): the vertical pass stores its results starting at buf[0] while
+// the horizontal pass indexes buf by absolute column, which only agree when
+// xstart is 0 -- confirm that launches always start at column 0.
+void RsdCpuScriptIntrinsicBlur::kernel(const RsForEachStubParamStruct *p,
+                                       uint32_t xstart, uint32_t xend,
+                                       uint32_t instep, uint32_t outstep) {
+    // Capacity of the scratch row, in pixels (4 floats each).
+    static const uint32_t kScratchPixels = 2048;
+    float buf[4 * kScratchPixels];
+    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("Blur executed without input, skipping");
+        return;
+    }
+    if (xend <= xstart) {
+        // Empty range, nothing to blur.
+        return;
+    }
+
+    // The vertical pass writes (xend - xstart) pixels into buf and the
+    // horizontal pass reads columns below min(xend + iradius, dimX) from it.
+    // Refuse ranges that do not fit instead of overrunning the stack buffer.
+    uint32_t readEnd = xend + (uint32_t)cp->iradius;
+    if (readEnd > p->dimX) {
+        readEnd = p->dimX;
+    }
+    if (((xend - xstart) > kScratchPixels) || (readEnd > kScratchPixels)) {
+        ALOGE("Blur row too wide for scratch buffer, skipping");
+        return;
+    }
+
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    // Vertical pass.
+    float4 *fout = (float4 *)buf;
+    int y = p->y;
+    if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) {
+        // Every row touched by the filter is inside the image: no clamping.
+        const uchar *pi = pin + (y - cp->iradius) * stride;
+        OneVF(fout, pi, stride, cp->fp, cp->iradius * 2 + 1, x1, x2);
+    } else {
+        while(x2 > x1) {
+            OneV(p, fout, x1, y, pin, stride, cp->fp, cp->iradius);
+            fout++;
+            x1++;
+        }
+    }
+
+    // Horizontal pass: the columns near the left edge need clamping.
+    x1 = xstart;
+    while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
+        OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+#if defined(ARCH_ARM_HAVE_NEON)
+    if ((x1 + cp->iradius) < x2) {
+        rsdIntrinsicBlurHF_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, x2 - cp->iradius);
+        out += (x2 - cp->iradius) - x1;
+        x1 = x2 - cp->iradius;
+    }
+#endif
+    // Remaining columns (all of them when the NEON path is not compiled in).
+    while(x2 > x1) {
+        OneH(p, out, x1, (float4 *)buf, cp->fp, cp->iradius);
+        out++;
+        x1++;
+    }
+
+}
+
+// Starts with a radius of 5, precomputes the matching weights and registers
+// the blur kernel as this script's kernel entry point.
+RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_BLUR) {
+    radius = 5.0f;
+    mRootPtr = &RsdCpuScriptIntrinsicBlur::kernel;
+    ComputeGaussianWeights();
+}
+
+// No explicit cleanup is performed here.
+RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {}
+
+// Two exported variables: the setters in this file accept slot 0 (radius) and
+// slot 1 (input allocation).
+void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+
+// Drops this script's reference to the input allocation.
+void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+// Factory for the blur intrinsic.  Returns a heap-allocated object created
+// with new.
+RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicBlur *impl = new RsdCpuScriptIntrinsicBlur(ctx, s);
+    return impl;
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
new file mode 100644
index 0000000..8f3196d
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the color matrix intrinsic: multiplies every uchar4
+// pixel by a 4x4 matrix and clamps the result to [0, 255].
+class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+
+    // Slot 0: the matrix, as floats.  Also selects which kernel variant runs.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+
+    virtual ~RsdCpuScriptIntrinsicColorMatrix();
+    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // The matrix as supplied by the caller.
+    float fp[16];
+    // The same coefficients scaled by 255 and rounded.
+    short ip[16];
+
+    // Kernel variants; setGlobalVar() picks one based on the matrix contents.
+    static void kernel4x4(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+    static void kernel3x3(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+    static void kernelDot(const RsForEachStubParamStruct *p,
+                          uint32_t xstart, uint32_t xend,
+                          uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+// Installs a new color matrix and picks the kernel variant to run.
+//
+// `data` holds up to 16 floats that are copied into fp; anything beyond the
+// size of fp is ignored (and flagged by an assert) rather than written past
+// the end of the array.  ip receives the coefficients scaled by 255 and
+// rounded.  Slot 0 is the only slot expected.
+//
+// Kernel selection, based on the integer coefficients:
+//   - kernel4x4 by default;
+//   - kernel3x3 when entries 3, 7, 11, 12, 13, 14 are zero and entry 15 is
+//     255, i.e. alpha neither feeds nor is fed by the other channels;
+//   - kernelDot when, in addition, the first three entries of each of the
+//     first three groups of four are equal (all color outputs identical).
+void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    rsAssert(slot == 0);
+    rsAssert(dataLength <= sizeof(fp));
+    // Never copy more than the matrix can hold.
+    const size_t copyLength = (dataLength < sizeof(fp)) ? dataLength : sizeof(fp);
+    memcpy (fp, data, copyLength);
+    for(int ct=0; ct < 16; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+
+    mRootPtr = &kernel4x4;
+    if ((ip[3] == 0) && (ip[7] == 0) && (ip[11] == 0) &&
+        (ip[12] == 0) && (ip[13] == 0) && (ip[14] == 0) && (ip[15] == 255)) {
+        mRootPtr = &kernel3x3;
+
+        if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
+            (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
+            (ip[8] == ip[9]) && (ip[8] == ip[10])) {
+            mRootPtr = &kernelDot;
+        }
+    }
+}
+
+// Externally defined color matrix kernels, only called when
+// ARCH_ARM_HAVE_NEON is defined.  `coef` is the integer matrix (ip) and
+// `count` the number of 4-pixel groups (callers pass (x2 - x1) >> 2).
+extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
+extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
+
+// Applies the full 4x4 float matrix `coeff` to the single pixel *py and
+// stores the clamped result in *out.
+//
+// Output channel k is the sum over input channels j of
+// input[j] * coeff[4 * j + k]; each channel is then limited to [0, 255]
+// before the conversion back to uchar4.
+static void One(const RsForEachStubParamStruct *p, uchar4 *out,
+                const uchar4 *py, const float* coeff) {
+    const float4 px = convert_float4(py[0]);
+
+    float4 acc;
+    acc.x = px.x * coeff[0] + px.y * coeff[4] + px.z * coeff[8] + px.w * coeff[12];
+    acc.y = px.x * coeff[1] + px.y * coeff[5] + px.z * coeff[9] + px.w * coeff[13];
+    acc.z = px.x * coeff[2] + px.y * coeff[6] + px.z * coeff[10] + px.w * coeff[14];
+    acc.w = px.x * coeff[3] + px.y * coeff[7] + px.z * coeff[11] + px.w * coeff[15];
+
+    // Limit every channel to the range a uchar can represent.
+    if (acc.x < 0) { acc.x = 0; } else if (acc.x > 255) { acc.x = 255; }
+    if (acc.y < 0) { acc.y = 0; } else if (acc.y > 255) { acc.y = 255; }
+    if (acc.z < 0) { acc.z = 0; } else if (acc.z > 255) { acc.z = 255; }
+    if (acc.w < 0) { acc.w = 0; } else if (acc.w > 255) { acc.w = 255; }
+
+    *out = convert_uchar4(acc);
+}
+
+// General kernel: applies the matrix to the pixels [xstart, xend) of p->in
+// and writes them to p->out.  With ARCH_ARM_HAVE_NEON, whole groups of four
+// pixels go to the external 4x4 kernel (integer coefficients); the remaining
+// pixels use One() with the float coefficients.
+void RsdCpuScriptIntrinsicColorMatrix::kernel4x4(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *self = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *dst = (uchar4 *)p->out;
+    uchar4 *src = (uchar4 *)p->in;
+
+    if (xend <= xstart) {
+        return;
+    }
+    uint32_t x = xstart;
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    int32_t quads = (xend - x) >> 2;
+    if (quads > 0) {
+        rsdIntrinsicColorMatrix4x4_K(dst, src, self->ip, quads);
+        x += quads << 2;
+        dst += quads << 2;
+        src += quads << 2;
+    }
+#endif
+
+    for (; x != xend; x++, dst++, src++) {
+        One(p, dst, src, self->fp);
+    }
+}
+
+// Kernel used when setGlobalVar() selected the 3x3 variant.  With
+// ARCH_ARM_HAVE_NEON, whole groups of four pixels go to the external 3x3
+// kernel (integer coefficients); the remaining pixels use One() with the
+// float coefficients.
+void RsdCpuScriptIntrinsicColorMatrix::kernel3x3(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *self = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *dst = (uchar4 *)p->out;
+    uchar4 *src = (uchar4 *)p->in;
+
+    if (xend <= xstart) {
+        return;
+    }
+    uint32_t x = xstart;
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    int32_t quads = (xend - x) >> 2;
+    if (quads > 0) {
+        rsdIntrinsicColorMatrix3x3_K(dst, src, self->ip, quads);
+        x += quads << 2;
+        dst += quads << 2;
+        src += quads << 2;
+    }
+#endif
+
+    for (; x != xend; x++, dst++, src++) {
+        One(p, dst, src, self->fp);
+    }
+}
+
+// Kernel used when setGlobalVar() selected the dot-product variant.  With
+// ARCH_ARM_HAVE_NEON, whole groups of four pixels go to the external dot
+// kernel (integer coefficients); the remaining pixels use One() with the
+// float coefficients.
+void RsdCpuScriptIntrinsicColorMatrix::kernelDot(const RsForEachStubParamStruct *p,
+                                                 uint32_t xstart, uint32_t xend,
+                                                 uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicColorMatrix *self = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+    uchar4 *dst = (uchar4 *)p->out;
+    uchar4 *src = (uchar4 *)p->in;
+
+    if (xend <= xstart) {
+        return;
+    }
+    uint32_t x = xstart;
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    int32_t quads = (xend - x) >> 2;
+    if (quads > 0) {
+        rsdIntrinsicColorMatrixDot_K(dst, src, self->ip, quads);
+        x += quads << 2;
+        dst += quads << 2;
+        src += quads << 2;
+    }
+#endif
+
+    for (; x != xend; x++, dst++, src++) {
+        One(p, dst, src, self->fp);
+    }
+}
+
+
+// Starts out with the identity matrix; going through setGlobalVar() also
+// selects the initial kernel variant.
+RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
+
+    static const float kIdentity[] = {
+        1.f, 0.f, 0.f, 0.f,
+        0.f, 1.f, 0.f, 0.f,
+        0.f, 0.f, 1.f, 0.f,
+        0.f, 0.f, 0.f, 1.f
+    };
+    setGlobalVar(0, kIdentity, sizeof(kIdentity));
+}
+
+// The members are plain arrays, so there is nothing to release here.
+RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {}
+
+// One exported variable: the matrix accepted by setGlobalVar() in slot 0.
+void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;
+}
+
+// Factory for the color matrix intrinsic.  Returns a heap-allocated object
+// created with new.
+RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicColorMatrix *impl = new RsdCpuScriptIntrinsicColorMatrix(ctx, s);
+    return impl;
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
new file mode 100644
index 0000000..18a5311
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the 3x3 convolution intrinsic.
+//
+// Global slot 0 carries the float coefficients (setGlobalVar) and global
+// slot 1 carries the input allocation (setGlobalObj). The per-row work is
+// done by the static kernel() which is installed as mRootPtr.
+class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicConvolve3x3();
+    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Float coefficients; only the first 9 entries are read by the kernel.
+    float fp[16];
+    // Fixed-point copy of fp (value * 255, rounded) used by the NEON path.
+    // The NEON routine loads 16 shorts from this array in one go.
+    short ip[16];
+    // Input allocation the convolution samples from.
+    ObjectBaseRef<Allocation> alloc;
+
+    // Per-row kernel; processes output pixels [xstart, xend).
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+// Binds the input allocation. Only slot 1 is expected (asserted); the object
+// is treated as an Allocation without a runtime type check.
+void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+    Allocation *input = static_cast<Allocation *>(data);
+    alloc.set(input);
+}
+
+// Stores the convolution coefficients (slot 0) and refreshes the fixed-point
+// copy used by the NEON path.
+//
+// data       - float coefficients supplied by the caller.
+// dataLength - size of data in bytes. Anything beyond sizeof(fp) is ignored
+//              so an oversized payload cannot write past the member array.
+void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    rsAssert(slot == 0);
+    if (dataLength > sizeof(fp)) {
+        ALOGE("Convolve3x3 coefficient data too large, truncating");
+        dataLength = sizeof(fp);
+    }
+    memcpy (&fp, data, dataLength);
+    // Scale by 255 and round (for non-negative values) to get the short
+    // coefficients.
+    for(int ct=0; ct < 9; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+
+// NEON 3x3 convolution routine (defined in rsCpuIntrinsics_neon.S).
+// dst receives the output pixels; y0, y1 and y2 point into the three source
+// rows; coef points at the short coefficients. Each unit of count produces
+// two output pixels (the caller advances by count << 1).
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
+                                          const void *y2, const short *coef, uint32_t count);
+
+
+// Scalar 3x3 convolution of a single output pixel.
+//
+// p     - launch parameters; dimX is used to clamp the right neighbour.
+// x     - column of the pixel being produced.
+// out   - destination pixel.
+// py0, py1, py2 - the three source rows (above, current, below), indexed by
+//         column.
+// coeff - nine float coefficients in row-major order.
+//
+// Neighbour columns are clamped to the row so edge pixels reuse the border
+// pixel. The result is clamped to [0, 255] and truncated to uchar.
+static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                        const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
+                        const float* coeff) {
+
+    uint32_t x1 = rsMax((int32_t)x-1, 0);
+    // The last valid column is dimX-1; clamping to dimX would read one pixel
+    // past the end of the row when x is the last column.
+    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
+
+    float4 px = convert_float4(py0[x1]) * coeff[0] +
+                convert_float4(py0[x]) * coeff[1] +
+                convert_float4(py0[x2]) * coeff[2] +
+                convert_float4(py1[x1]) * coeff[3] +
+                convert_float4(py1[x]) * coeff[4] +
+                convert_float4(py1[x2]) * coeff[5] +
+                convert_float4(py2[x1]) * coeff[6] +
+                convert_float4(py2[x]) * coeff[7] +
+                convert_float4(py2[x2]) * coeff[8];
+
+    px = clamp(px, 0.f, 255.f);
+    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
+    *out = o;
+}
+
+// Row kernel for the 3x3 convolution.
+//
+// Produces output pixels [xstart, xend) of row p->y into p->out, sampling
+// the bound input allocation (cp->alloc). If no allocation has been bound
+// the call logs an error and produces nothing. instep/outstep are unused.
+void RsdCpuScriptIntrinsicConvolve3x3::kernel(const RsForEachStubParamStruct *p,
+                                              uint32_t xstart, uint32_t xend,
+                                              uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
+
+    if (!cp->alloc.get()) {
+        ALOGE("Convolve3x3 executed without input, skipping");
+        return;
+    }
+    // Base address and row pitch (in bytes) of the input's level 0.
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+
+    // Note the naming: y1 is the row BELOW (clamped to dimY-1) and y2 is the
+    // row ABOVE (clamped to 0). py0/py1/py2 are then above/current/below.
+    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
+    const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y);
+    const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+    // Column 0 has no left neighbour, so it always goes through the scalar
+    // path (which clamps). This runs even if xend is 0.
+    if(x1 == 0) {
+        ConvolveOne(p, 0, out, py0, py1, py2, cp->fp);
+        x1 ++;
+        out++;
+    }
+
+    if(x2 > x1) {
+#if defined(ARCH_ARM_HAVE_NEON)
+        // Number of pixel pairs for the NEON routine. The "- 1" always
+        // leaves at least one pixel for the scalar loop below, which keeps
+        // the NEON loads (starting one pixel to the left) away from the
+        // final column.
+        int32_t len = (x2 - x1 - 1) >> 1;
+        if(len > 0) {
+            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->ip, len);
+            x1 += len << 1;
+            out += len << 1;
+        }
+#endif
+
+        // Scalar tail (or the whole range on non-NEON builds).
+        while(x1 != x2) {
+            ConvolveOne(p, x1, out, py0, py1, py2, cp->fp);
+            out++;
+            x1++;
+        }
+    }
+}
+
+// Installs the row kernel and seeds all nine coefficients with 1/9 (both
+// the float array and its fixed-point counterpart).
+RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
+
+    mRootPtr = &kernel;
+
+    int idx = 0;
+    while (idx < 9) {
+        fp[idx] = 1.f / 9.f;
+        ip[idx] = (short)(fp[idx] * 255.f + 0.5f);
+        ++idx;
+    }
+}
+
+// Destructor. No explicit cleanup is performed here.
+RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
+}
+
+// Reports two exported variables: slot 0 (coefficients) and slot 1 (input
+// allocation), matching the slots asserted in the setters.
+void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+
+// Drops the reference to the bound input allocation.
+void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+// Factory: heap-allocates a 3x3 convolution intrinsic and returns it through
+// the generic script implementation pointer.
+RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicConvolve3x3 *impl = new RsdCpuScriptIntrinsicConvolve3x3(ctx, s);
+    return impl;
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
new file mode 100644
index 0000000..2cae2c0
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the 5x5 convolution intrinsic.
+//
+// Global slot 0 carries the float coefficients (setGlobalVar) and global
+// slot 1 carries the input allocation (setGlobalObj). The per-row work is
+// done by the static kernel() which is installed as mRootPtr.
+class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
+    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Float coefficients; the scalar path reads entries 0..24.
+    // NOTE(review): the 3 extra entries are presumably padding for the NEON
+    // routine's vector loads - confirm against the assembly.
+    float fp[28];
+    // Fixed-point copy of fp (value * 255, rounded) handed to the NEON path.
+    short ip[28];
+    // Input allocation the convolution samples from.
+    ObjectBaseRef<Allocation> alloc;
+
+
+    // Per-row kernel; processes output pixels [xstart, xend).
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+
+
+};
+
+}
+}
+
+// Binds the input allocation. Only slot 1 is expected (asserted); the object
+// is treated as an Allocation without a runtime type check.
+void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 1);
+    Allocation *input = static_cast<Allocation *>(data);
+    alloc.set(input);
+}
+
+// Stores the convolution coefficients (slot 0) and refreshes the fixed-point
+// copy handed to the NEON path.
+//
+// data       - float coefficients supplied by the caller.
+// dataLength - size of data in bytes. Anything beyond sizeof(fp) is ignored
+//              so an oversized payload cannot write past the member array.
+void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
+                                                    const void *data, size_t dataLength) {
+    rsAssert(slot == 0);
+    if (dataLength > sizeof(fp)) {
+        ALOGE("Convolve5x5 coefficient data too large, truncating");
+        dataLength = sizeof(fp);
+    }
+    memcpy (&fp, data, dataLength);
+    // Scale by 255 and round (for non-negative values) to get the short
+    // coefficients.
+    for(int ct=0; ct < 25; ct++) {
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+
+
+// Scalar 5x5 convolution of a single output pixel.
+//
+// p     - launch parameters; dimX is used to clamp the right neighbours.
+// x     - column of the pixel being produced.
+// out   - destination pixel.
+// py0..py4 - the five source rows, top to bottom, indexed by column.
+// coeff - 25 float coefficients in row-major order.
+//
+// Neighbour columns are clamped to [0, dimX-1]. The weighted sum is
+// accumulated in coefficient order, clamped to [0, 255] and truncated.
+static void One(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+                const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
+                const float* coeff) {
+
+    // Column of each of the five taps, left to right.
+    const uint32_t cols[5] = {
+        (uint32_t)rsMax((int32_t)x-2, 0),
+        (uint32_t)rsMax((int32_t)x-1, 0),
+        x,
+        (uint32_t)rsMin((int32_t)x+1, (int32_t)(p->dimX-1)),
+        (uint32_t)rsMin((int32_t)x+2, (int32_t)(p->dimX-1))
+    };
+    const uchar4 *rows[5] = {py0, py1, py2, py3, py4};
+
+    // Start from the first tap and add the remaining 24 in row-major order.
+    float4 acc = convert_float4(rows[0][cols[0]]) * coeff[0];
+    for (int tap = 1; tap < 25; tap++) {
+        acc = acc + convert_float4(rows[tap / 5][cols[tap % 5]]) * coeff[tap];
+    }
+
+    acc = clamp(acc, 0.f, 255.f);
+    uchar4 result = {(uchar)acc.x, (uchar)acc.y, (uchar)acc.z, (uchar)acc.w};
+    *out = result;
+}
+
+// NEON 5x5 convolution routine (implemented outside this file).
+// dst receives the output pixels; y0..y4 point into the five source rows;
+// coef points at the short coefficients. The caller advances its output by
+// count << 1 pixels after the call, i.e. two pixels per unit of count.
+extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
+                                          const void *y2, const void *y3, const void *y4,
+                                          const short *coef, uint32_t count);
+
+// Row kernel for the 5x5 convolution.
+//
+// Produces output pixels [xstart, xend) of row p->y into p->out, sampling
+// the bound input allocation (cp->alloc). If no allocation has been bound
+// the call logs an error and produces nothing. instep/outstep are unused.
+void RsdCpuScriptIntrinsicConvolve5x5::kernel(const RsForEachStubParamStruct *p,
+                                              uint32_t xstart, uint32_t xend,
+                                              uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("Convolve5x5 executed without input, skipping");
+        return;
+    }
+    // Base address and row pitch (in bytes) of the input's level 0.
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
+
+    // Source rows y-2 .. y+2, clamped to the image.
+    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
+    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
+    uint32_t y2 = p->y;
+    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
+    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
+
+    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
+    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
+    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
+    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
+    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    // Columns 0 and 1 lack two left neighbours; use the clamping scalar path.
+    while((x1 < x2) && (x1 < 2)) {
+        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
+        out++;
+        x1++;
+    }
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    if((x1 + 3) < x2) {
+        // Pixel pairs for the NEON routine; the "- 3" keeps a scalar tail so
+        // the vector path stays away from the right edge.
+        uint32_t len = (x2 - x1 - 3) >> 1;
+        // The routine's first output pixel is centred two pixels into the
+        // rows it is given, so hand it the rows starting at x1 - 2. x1 >= 2
+        // is guaranteed here by the loop above. Passing the row bases
+        // unchanged is only correct when x1 == 2 (i.e. xstart <= 2).
+        const uint32_t first = x1 - 2;
+        rsdIntrinsicConvolve5x5_K(out, py0 + first, py1 + first, py2 + first,
+                                  py3 + first, py4 + first, cp->ip, len);
+        out += len << 1;
+        x1 += len << 1;
+    }
+#endif
+
+    // Scalar tail (or the whole remaining range on non-NEON builds).
+    while(x1 < x2) {
+        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
+        out++;
+        x1++;
+    }
+}
+
+
+// Installs the row kernel and seeds every one of the 25 coefficients with
+// 1/25 (both the float array and its fixed-point counterpart), so the
+// kernel never reads uninitialised coefficients before setGlobalVar() is
+// called.
+RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
+
+    mRootPtr = &kernel;
+    // A 5x5 kernel has 25 taps; stopping at 9 would leave 16 of them unset.
+    for(int ct=0; ct < 25; ct++) {
+        fp[ct] = 1.f / 25.f;
+        ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+    }
+}
+
+// Destructor. No explicit cleanup is performed here.
+RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
+}
+
+// Reports two exported variables: slot 0 (coefficients) and slot 1 (input
+// allocation), matching the slots asserted in the setters.
+void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 2;
+}
+
+// Drops the reference to the bound input allocation.
+void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+// Factory: heap-allocates a 5x5 convolution intrinsic and returns it through
+// the generic script implementation pointer.
+RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicConvolve5x5 *impl = new RsdCpuScriptIntrinsicConvolve5x5(ctx, s);
+    return impl;
+}
+
+
+
diff --git a/cpu_ref/rsCpuIntrinsicInlines.h b/cpu_ref/rsCpuIntrinsicInlines.h
new file mode 100644
index 0000000..ab11b4f
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicInlines.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+// Short scalar aliases used by the intrinsic implementations.
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+
+// 2-, 3- and 4-component vector types built with the ext_vector_type
+// attribute; components are accessed as .x/.y/.z/.w elsewhere in this header.
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+// These use the host compiler's "long", whose width depends on the target.
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+
+// Identifiers for intrinsics.
+// NOTE(review): nothing in the visible sources references these values -
+// confirm whether the enum is still needed.
+enum IntrinsicEnums {
+    INTRINSIC_UNDEFINED,
+    INTRINSIC_CONVOLVE_3x3,
+    // NOTE(review): "INTRINXIC" looks like a misspelling of "INTRINSIC";
+    // left as-is because renaming would change the public identifier.
+    INTRINXIC_COLORMATRIX
+
+};
+
+// Widens each component of a uchar4 to int.
+static inline int4 convert_int4(uchar4 i) {
+    int4 widened;
+    widened.x = i.x;
+    widened.y = i.y;
+    widened.z = i.z;
+    widened.w = i.w;
+    return widened;
+}
+
+// Widens each component of a uchar4 to short.
+static inline short4 convert_short4(uchar4 i) {
+    short4 widened;
+    widened.x = i.x;
+    widened.y = i.y;
+    widened.z = i.z;
+    widened.w = i.w;
+    return widened;
+}
+
+// Converts each component of a uchar4 to float.
+static inline float4 convert_float4(uchar4 i) {
+    float4 converted;
+    converted.x = i.x;
+    converted.y = i.y;
+    converted.z = i.z;
+    converted.w = i.w;
+    return converted;
+}
+
+// Narrows each component of a short4 to uchar with a plain cast (no
+// saturation).
+static inline uchar4 convert_uchar4(short4 i) {
+    uchar4 narrowed;
+    narrowed.x = (uchar)i.x;
+    narrowed.y = (uchar)i.y;
+    narrowed.z = (uchar)i.z;
+    narrowed.w = (uchar)i.w;
+    return narrowed;
+}
+
+// Narrows each component of an int4 to uchar with a plain cast (no
+// saturation).
+static inline uchar4 convert_uchar4(int4 i) {
+    uchar4 narrowed;
+    narrowed.x = (uchar)i.x;
+    narrowed.y = (uchar)i.y;
+    narrowed.z = (uchar)i.z;
+    narrowed.w = (uchar)i.w;
+    return narrowed;
+}
+
+// Converts each component of a float4 to uchar with a plain cast; callers
+// are expected to clamp beforehand.
+static inline uchar4 convert_uchar4(float4 i) {
+    uchar4 narrowed;
+    narrowed.x = (uchar)i.x;
+    narrowed.y = (uchar)i.y;
+    narrowed.z = (uchar)i.z;
+    narrowed.w = (uchar)i.w;
+    return narrowed;
+}
+
+
+// Component-wise clamp of an int4 to [low, high]. The lower bound is tested
+// first, so it wins if low > high.
+static inline int4 clamp(int4 amount, int low, int high) {
+    int4 r = amount;
+    if (amount.x < low) { r.x = low; } else if (amount.x > high) { r.x = high; }
+    if (amount.y < low) { r.y = low; } else if (amount.y > high) { r.y = high; }
+    if (amount.z < low) { r.z = low; } else if (amount.z > high) { r.z = high; }
+    if (amount.w < low) { r.w = low; } else if (amount.w > high) { r.w = high; }
+    return r;
+}
+
+// Component-wise clamp of a float4 to [low, high]. The lower bound is tested
+// first; a component that compares false against both bounds (e.g. NaN) is
+// passed through unchanged.
+static inline float4 clamp(float4 amount, float low, float high) {
+    float4 r = amount;
+    if (amount.x < low) { r.x = low; } else if (amount.x > high) { r.x = high; }
+    if (amount.y < low) { r.y = low; } else if (amount.y > high) { r.y = high; }
+    if (amount.z < low) { r.z = low; } else if (amount.z > high) { r.z = high; }
+    if (amount.w < low) { r.w = low; } else if (amount.w > high) { r.w = high; }
+    return r;
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
new file mode 100644
index 0000000..188ed2b
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the lookup-table intrinsic.
+//
+// Global slot 0 carries the table allocation (setGlobalObj). The per-row
+// work is done by the static kernel() which is installed as mRootPtr.
+class RsdCpuScriptIntrinsicLUT : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicLUT();
+    RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Allocation holding the lookup tables; the kernel reads it as four
+    // consecutive 256-byte tables.
+    ObjectBaseRef<Allocation> lut;
+
+    // Per-row kernel; processes pixels [xstart, xend).
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+// Binds the lookup-table allocation. Only slot 0 is expected (asserted); the
+// object is treated as an Allocation without a runtime type check.
+void RsdCpuScriptIntrinsicLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 0);
+    Allocation *table = static_cast<Allocation *>(data);
+    lut.set(table);
+}
+
+
+// Row kernel for the lookup-table intrinsic.
+//
+// For every pixel in [xstart, xend) each channel of the input pixel (p->in)
+// is replaced by the entry it indexes in that channel's table, and the
+// result is written to p->out. The bound allocation is read as four
+// consecutive 256-byte tables, one per channel in x, y, z, w order.
+// If no table has been bound the call logs an error and produces nothing.
+// instep/outstep are unused.
+void RsdCpuScriptIntrinsicLUT::kernel(const RsForEachStubParamStruct *p,
+                                      uint32_t xstart, uint32_t xend,
+                                      uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
+
+    // Same guard the other intrinsics use for their bound allocation;
+    // without it an unset table is a null dereference.
+    if (!cp->lut.get()) {
+        ALOGE("LUT executed without lookup table, skipping");
+        return;
+    }
+
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->in;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+    const uchar *tr = (const uchar *)cp->lut->mHal.drvState.lod[0].mallocPtr;
+    const uchar *tg = &tr[256];
+    const uchar *tb = &tg[256];
+    const uchar *ta = &tb[256];
+
+    while (x1 < x2) {
+        // Named "px" so it does not shadow the parameter p.
+        uchar4 px = *in;
+        uchar4 o = {tr[px.x], tg[px.y], tb[px.z], ta[px.w]};
+        *out = o;
+        in++;
+        out++;
+        x1++;
+    }
+}
+
+// Constructs the LUT intrinsic and installs kernel() as the root function.
+// No table is bound yet; one must be supplied through setGlobalObj().
+RsdCpuScriptIntrinsicLUT::RsdCpuScriptIntrinsicLUT(RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_LUT) {
+
+    mRootPtr = &kernel;
+}
+
+// Destructor. No explicit cleanup is performed here.
+RsdCpuScriptIntrinsicLUT::~RsdCpuScriptIntrinsicLUT() {
+}
+
+// Reports one exported variable: slot 0, the lookup-table allocation.
+void RsdCpuScriptIntrinsicLUT::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;
+}
+
+// Drops the reference to the bound lookup-table allocation.
+void RsdCpuScriptIntrinsicLUT::invokeFreeChildren() {
+    lut.clear();
+}
+
+
+// Factory: heap-allocates a LUT intrinsic and returns it through the generic
+// script implementation pointer.
+RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicLUT *impl = new RsdCpuScriptIntrinsicLUT(ctx, s);
+    return impl;
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
new file mode 100644
index 0000000..7b8f768
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of the YUV to RGBA conversion intrinsic.
+//
+// Global slot 0 carries the YUV input allocation (setGlobalObj). The per-row
+// work is done by the static kernel() which is installed as mRootPtr.
+class RsdCpuScriptIntrinsicYuvToRGB : public RsdCpuScriptIntrinsic {
+public:
+    virtual void populateScript(Script *);
+    virtual void invokeFreeChildren();
+
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+    virtual ~RsdCpuScriptIntrinsicYuvToRGB();
+    RsdCpuScriptIntrinsicYuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s);
+
+protected:
+    // Input allocation holding the YUV data read by the kernel.
+    ObjectBaseRef<Allocation> alloc;
+
+    // Per-row kernel; processes output pixels [xstart, xend).
+    static void kernel(const RsForEachStubParamStruct *p,
+                       uint32_t xstart, uint32_t xend,
+                       uint32_t instep, uint32_t outstep);
+};
+
+}
+}
+
+
+// Binds the YUV input allocation. Only slot 0 is expected (asserted); the
+// object is treated as an Allocation without a runtime type check.
+void RsdCpuScriptIntrinsicYuvToRGB::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    rsAssert(slot == 0);
+    Allocation *input = static_cast<Allocation *>(data);
+    alloc.set(input);
+}
+
+
+
+
+// Converts one YUV sample to an opaque RGBA pixel using 8.8 fixed-point
+// integer arithmetic.
+//
+// y is offset by 16 and u/v by 128 before the weighted sums; each sum gets
+// +128 for rounding, is shifted right by 8 and clamped to [0, 255]. Alpha is
+// always 255.
+static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
+    const short luma = ((short)y) - 16;
+    const short chromaU = ((short)u) - 128;
+    const short chromaV = ((short)v) - 128;
+
+    short red = (luma * 298 + chromaV * 409 + 128) >> 8;
+    short green = (luma * 298 - chromaU * 100 - chromaV * 208 + 128) >> 8;
+    short blue = (luma * 298 + chromaU * 516 + 128) >> 8;
+
+    red = red < 0 ? 0 : (red > 255 ? 255 : red);
+    green = green < 0 ? 0 : (green > 255 ? 255 : green);
+    blue = blue < 0 ? 0 : (blue > 255 ? 255 : blue);
+
+    uchar4 rgba = {(uchar)red, (uchar)green, (uchar)blue, (uchar)255};
+    return rgba;
+}
+
+
+// Parameter table handed to rsdIntrinsicYuv_K as its "param" argument.
+// The first row holds the same multipliers the scalar conversion uses
+// (298, 409, -100, 516, -208) followed by 255; the remaining rows repeat
+// 16, 128, 298 and 255 eight times each.
+// NOTE(review): the exact layout is dictated by the assembly routine -
+// confirm there before reordering anything.
+static short YuvCoeff[] = {
+    298, 409, -100, 516,   -208, 255, 0, 0,
+    16, 16, 16, 16,        16, 16, 16, 16,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    298, 298, 298, 298, 298, 298, 298, 298,
+    255, 255, 255, 255, 255, 255, 255, 255
+
+
+};
+
+// NEON YUV to RGBA routine (implemented outside this file). dst receives the
+// output pixels, Y and uv point at the luma and chroma data, param points at
+// the coefficient table. The caller advances by count << 3 pixels after the
+// call, i.e. eight pixels per unit of count.
+extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t count, const short *param);
+
+// Row kernel for the YUV to RGBA conversion.
+//
+// Produces output pixels [xstart, xend) of row p->y into p->out from the
+// bound input allocation. The input is read as a dimX * dimY luma plane
+// followed by chroma rows of dimX bytes, one chroma row per two luma rows,
+// with the two chroma bytes of a pixel pair stored as (v, u). Rows are
+// addressed with dimX, not the allocation's stride. If no allocation has
+// been bound the call logs an error and produces nothing. instep/outstep
+// are unused.
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
+                                           uint32_t xstart, uint32_t xend,
+                                           uint32_t instep, uint32_t outstep) {
+    RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
+    if (!cp->alloc.get()) {
+        ALOGE("YuvToRGB executed without input, skipping");
+        return;
+    }
+    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
+
+    const uchar *Y = pin + (p->y * p->dimX);
+    const uchar *uv = pin + (p->dimX * p->dimY);
+    uv += (p->y>>1) * p->dimX;
+
+    uchar4 *out = (uchar4 *)p->out;
+    uint32_t x1 = xstart;
+    uint32_t x2 = xend;
+
+#if defined(ARCH_ARM_HAVE_NEON)
+    // Bring x1 onto a chroma-pair boundary so the vector routine starts on
+    // an even pixel.
+    if((x1 & 1) && (x1 < x2)) {
+        const uint32_t pairBase = x1 & ~1u;
+        *out = rsYuvToRGBA_uchar4(Y[x1], uv[pairBase + 1], uv[pairBase + 0]);
+        out++;
+        x1++;
+    }
+    if(x2 > x1) {
+        // Eight pixels per unit; the "- 1" always leaves a scalar tail.
+        int32_t len = (x2 - x1 - 1) >> 3;
+        if(len > 0) {
+            // out already corresponds to pixel x1, so the source pointers
+            // must be advanced to x1 as well (a no-op when xstart is 0).
+            rsdIntrinsicYuv_K(out, Y + x1, uv + x1, len, YuvCoeff);
+            x1 += len << 3;
+            out += len << 3;
+        }
+    }
+#endif
+
+    // Scalar path, one pixel per iteration so an odd-length remainder never
+    // writes or reads past xend. Both pixels of a pair share the chroma
+    // bytes at the even index.
+    while(x1 < x2) {
+        const uint32_t pairBase = x1 & ~1u;
+        uchar u = uv[pairBase + 1];
+        uchar v = uv[pairBase + 0];
+        *out = rsYuvToRGBA_uchar4(Y[x1], u, v);
+        out++;
+        x1++;
+    }
+}
+
+// Constructs the YUV to RGB intrinsic and installs kernel() as the root
+// function. No input is bound yet; one must be supplied through
+// setGlobalObj().
+RsdCpuScriptIntrinsicYuvToRGB::RsdCpuScriptIntrinsicYuvToRGB(
+            RsdCpuReferenceImpl *ctx, const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB) {
+
+    mRootPtr = &kernel;
+}
+
+// Destructor. No explicit cleanup is performed here.
+RsdCpuScriptIntrinsicYuvToRGB::~RsdCpuScriptIntrinsicYuvToRGB() {
+}
+
+// Reports one exported variable: slot 0, the YUV input allocation.
+void RsdCpuScriptIntrinsicYuvToRGB::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 1;
+}
+
+// Drops the reference to the bound input allocation.
+void RsdCpuScriptIntrinsicYuvToRGB::invokeFreeChildren() {
+    alloc.clear();
+}
+
+
+// Factory: heap-allocates a YUV to RGB intrinsic and returns it through the
+// generic script implementation pointer.
+RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx, const Script *s) {
+    RsdCpuScriptIntrinsicYuvToRGB *impl = new RsdCpuScriptIntrinsicYuvToRGB(ctx, s);
+    return impl;
+}
+
+
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
new file mode 100644
index 0000000..04dd8b1
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -0,0 +1,1524 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+/*
+        3x3 convolution of RGBA8888 pixels, two output pixels per iteration.
+
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp]     = coeffs (pointer to 16-bit coefficients; 16 are loaded,
+                   the first 9 are used)
+        [sp + 4] = iteration count (length / 2); must be non-zero because
+                   the loop tests the counter only after the first pass
+*/
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+        push            {r4-r8, r10, r11, lr}
+        vpush           {q4-q7}
+
+        /* The pushes above moved sp by 32 (core regs) + 64 (q4-q7) bytes,
+           hence the #32+64 bias on the stack argument offsets below. */
+
+        /* Get the coeffs pointer from the stack and load the
+           coefficients in the q0, q1 NEON registers */
+        ldr r4, [sp, #32+64]
+        vld1.16 {q0, q1}, [r4]
+
+        /* Get count from the stack */
+        ldr r4, [sp, #36+64]
+
+        /* Load the frequently used immediate in a register */
+        mov r5, #8
+
+1:
+        /* Load 16 bytes (4 pixels) from each row and post-increase the
+           address by r5=#8 (2 pixels) */
+        vld1.8 {q13}, [r1], r5
+        vld1.8 {q14}, [r2], r5
+        vld1.8 {q15}, [r3], r5
+
+        /* Signal memory for data that will be used in the loop after the next */
+        PLD         (r1, r5)
+        PLD         (r2, r5)
+        PLD         (r3, r5)
+
+        /* Widen the 8-bit channels to 16 bits: one d register per pixel */
+        vmovl.u8 q2, d26
+        vmovl.u8 q3, d27
+        vmovl.u8 q4, d28
+        vmovl.u8 q5, d29
+        vmovl.u8 q6, d30
+        vmovl.u8 q7, d31
+
+/*
+        The two pixel source array is
+        d4,  d5,  d6,  d7
+        d8,  d9,  d10, d11
+        d12, d13, d14, d15
+*/
+
+        /* First output pixel: columns 0..2 of each row times coeffs 0..8 */
+        vmull.s16 q8, d4, d0[0]
+        vmlal.s16 q8, d5, d0[1]
+        vmlal.s16 q8, d6, d0[2]
+        vmlal.s16 q8, d8, d0[3]
+        vmlal.s16 q8, d9, d1[0]
+        vmlal.s16 q8, d10, d1[1]
+        vmlal.s16 q8, d12, d1[2]
+        vmlal.s16 q8, d13, d1[3]
+        vmlal.s16 q8, d14, d2[0]
+
+        /* Second output pixel: columns 1..3 of each row, same coeffs */
+        vmull.s16 q9, d5, d0[0]
+        vmlal.s16 q9, d6, d0[1]
+        vmlal.s16 q9, d7, d0[2]
+        vmlal.s16 q9, d9, d0[3]
+        vmlal.s16 q9, d10, d1[0]
+        vmlal.s16 q9, d11, d1[1]
+        vmlal.s16 q9, d13, d1[2]
+        vmlal.s16 q9, d14, d1[3]
+        vmlal.s16 q9, d15, d2[0]
+
+        /* Drop the 8 fractional bits and narrow to 16 bits */
+        vshrn.i32 d16, q8, #8
+        vshrn.i32 d17, q9, #8
+
+        /* Saturate to unsigned 8 bits and store the two pixels (8 bytes) */
+        vqmovun.s16 d16, q8
+        vst1.8 d16, [r0]!
+
+        /* Are we done yet? */
+        subs r4, r4, #1
+        bne 1b
+
+        /* We're done, bye! */
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicConvolve3x3_K)
+
+/*
+        4x4 color matrix applied to RGBA8888 pixels, four pixels per
+        iteration.
+
+        r0 = dst
+        r1 = src
+        r2 = matrix (16 x 16-bit coefficients)
+        r3 = iteration count (number of 4-pixel groups); must be non-zero
+             because the loop tests the counter only after the first pass
+*/
+ENTRY(rsdIntrinsicColorMatrix4x4_K)
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+
+        /* Load the 16 coefficients into q2 (d4, d5) and q3 (d6, d7) */
+        vld1.16 {q2}, [r2]!
+        vld1.16 {q3}, [r2]!
+
+1:
+        /* De-interleave four pixels: channel 0 into d0, 1 into d1, ... */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+
+        /* Widen each channel to 16 bits */
+        vmovl.u8 q12, d0  /* R */
+        vmovl.u8 q13, d1  /* G */
+        vmovl.u8 q14, d2  /* B */
+        vmovl.u8 q15, d3  /* A */
+
+        /* q8..q11 accumulate the four output channels; each input channel
+           contributes through its own group of four coefficients */
+        vmull.s16 q8,  d24, d4[0]
+        vmull.s16 q9,  d24, d4[1]
+        vmull.s16 q10, d24, d4[2]
+        vmull.s16 q11, d24, d4[3]
+
+        vmlal.s16 q8,  d26, d5[0]
+        vmlal.s16 q9,  d26, d5[1]
+        vmlal.s16 q10, d26, d5[2]
+        vmlal.s16 q11, d26, d5[3]
+
+        vmlal.s16 q8,  d28, d6[0]
+        vmlal.s16 q9,  d28, d6[1]
+        vmlal.s16 q10, d28, d6[2]
+        vmlal.s16 q11, d28, d6[3]
+
+        vmlal.s16 q8,  d30, d7[0]
+        vmlal.s16 q9,  d30, d7[1]
+        vmlal.s16 q10, d30, d7[2]
+        vmlal.s16 q11, d30, d7[3]
+
+        /* Drop the 8 fractional bits and narrow to 16 bits */
+        vshrn.i32 d24, q8, #8
+        vshrn.i32 d26, q9, #8
+        vshrn.i32 d28, q10, #8
+        vshrn.i32 d30, q11, #8
+
+        /* Saturate to unsigned 8 bits */
+        vqmovun.s16 d0, q12
+        vqmovun.s16 d1, q13
+        vqmovun.s16 d2, q14
+        vqmovun.s16 d3, q15
+
+        /* Re-interleave and store the four pixels */
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrix4x4_K)
+
+/*
+        3x3 color matrix applied to the first three channels of RGBA8888
+        pixels, four pixels per iteration. The fourth channel (d3) is stored
+        back exactly as it was loaded.
+
+        r0 = dst
+        r1 = src
+        r2 = matrix (16 x 16-bit coefficients are loaded; lanes 0..2 of d4,
+             d5 and d6 are used)
+        r3 = iteration count (number of 4-pixel groups); must be non-zero
+             because the loop tests the counter only after the first pass
+*/
+ENTRY(rsdIntrinsicColorMatrix3x3_K)
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+
+        /* Load the coefficients into q2 (d4, d5) and q3 (d6, d7) */
+        vld1.16 {q2}, [r2]!
+        vld1.16 {q3}, [r2]!
+
+1:
+        /* De-interleave four pixels: channel 0 into d0, 1 into d1, ... */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+
+        /* Widen the first three channels to 16 bits */
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+
+        /* q8..q10 accumulate the three output channels */
+        vmull.s16 q8,  d24, d4[0]
+        vmull.s16 q9,  d24, d4[1]
+        vmull.s16 q10, d24, d4[2]
+
+        vmlal.s16 q8,  d26, d5[0]
+        vmlal.s16 q9,  d26, d5[1]
+        vmlal.s16 q10, d26, d5[2]
+
+        vmlal.s16 q8,  d28, d6[0]
+        vmlal.s16 q9,  d28, d6[1]
+        vmlal.s16 q10, d28, d6[2]
+
+        /* Drop the 8 fractional bits and narrow to 16 bits */
+        vshrn.i32 d24, q8, #8
+        vshrn.i32 d26, q9, #8
+        vshrn.i32 d28, q10, #8
+
+        /* Saturate to unsigned 8 bits */
+        vqmovun.s16 d0, q12
+        vqmovun.s16 d1, q13
+        vqmovun.s16 d2, q14
+
+        /* Re-interleave and store the four pixels (d3 is untouched) */
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrix3x3_K)
+
+/*
+        "Dot" color matrix: one weighted sum of the first three channels is
+        computed per pixel and written to all three of them. Four pixels per
+        iteration; the fourth channel (d3) is stored back as loaded.
+
+        r0 = dst
+        r1 = src
+        r2 = matrix (16 x 16-bit coefficients are loaded; lane 0 of d4, d5
+             and d6 is used)
+        r3 = iteration count (number of 4-pixel groups); must be non-zero
+             because the loop tests the counter only after the first pass
+*/
+ENTRY(rsdIntrinsicColorMatrixDot_K)
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+
+        /* Load the coefficients into q2 (d4, d5) and q3 (d6, d7) */
+        vld1.16 {q2}, [r2]!
+        vld1.16 {q3}, [r2]!
+
+1:
+        /* De-interleave four pixels: channel 0 into d0, 1 into d1, ... */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+
+        /* Widen the first three channels to 16 bits */
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+
+        /* Single weighted sum, scaled down by 8 bits and saturated, then
+           copied to the second and third channel */
+        vmull.s16 q8,  d24, d4[0]
+        vmlal.s16 q8,  d26, d5[0]
+        vmlal.s16 q8,  d28, d6[0]
+        vshrn.i32 d24, q8, #8
+        vqmovun.s16 d0, q12
+        vmov.u8 d1, d0
+        vmov.u8 d2, d0
+
+        /* Re-interleave and store the four pixels (d3 is untouched) */
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicColorMatrixDot_K)
+
+
+/*
+Vertical blur pass. C counterpart as noted by the original author:
+static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
+                  const float* gPtr, int iradius, int x1, int x2)
+    r0 = out     (two float4 results are stored per outer iteration)
+    r1 = pin     (4 bytes per pixel)
+    r2 = stride  (bytes added to the read pointer after each tap)
+    r3 = gptr    (float weights, one per tap)
+    r4 = [sp]    ct, number of taps; zero is not handled (subs/bne would wrap)
+    r5 = [sp+4]  x1
+    r6 = [sp+8]  x2; x1 steps by 2 and the loop exits only when x1 == x2
+*/
+ENTRY(rsdIntrinsicBlurVF_K)
+        push            {r4-r8, r10, r11, lr}
+        vpush           {q4-q7}
+        /* stack arguments sit above the 32 + 64 bytes pushed above */
+        ldr r4, [sp, #32+64]
+        ldr r5, [sp, #32+64 + 4]
+        ldr r6, [sp, #32+64 + 8]
+
+1:
+        veor q10, q10, q10         /* float4 accumulator for pixel x1 = 0 */
+        veor q11, q11, q11         /* float4 accumulator for pixel x1 + 1 = 0 */
+        add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
+        mov r10, r3
+        /* r10 walks the weights, r11 counts the remaining taps */
+        mov r11, r4
+
+2:
+        vld1.32 {d2}, [r7]
+        vmovl.u8 q1, d2
+        vmovl.u16 q3, d2
+        vmovl.u16 q4, d3
+        vcvt.f32.s32 q3, q3
+        vcvt.f32.s32 q4, q4
+        vld1.32 {d0[0]}, [r10]!
+        add r7, r7, r2
+        vmla.f32 q10, q3, d0[0]
+        vmla.f32 q11, q4, d0[0]
+        subs r11, r11, #1
+        bne 2b
+        /* write both accumulated float4 pixels, then move on by two pixels */
+        vst1.32 {q10}, [r0]!
+        vst1.32 {q11}, [r0]!
+        add r5, r5, #2
+        cmp r5, r6
+        bne 1b
+
+
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicBlurVF_K)
+
+/*
+Horizontal blur pass: weighted sum of ct consecutive float4 inputs, packed to
+one 4-byte pixel. The register list differs from OneVF (no stride argument).
+
+    r0 = out     (one packed 4-byte pixel is stored per outer iteration)
+    r1 = pin     (float4 input, 16 bytes per element)
+    r2 = gptr    (float weights, one per tap)
+    r3 = ct      number of taps; must be odd and >= 3 (one tap, then pairs)
+    r4 = [sp]    x1
+    r5 = [sp+4]  x2; x1 steps by 1 and the loop exits only when x1 == x2
+*/
+ENTRY(rsdIntrinsicBlurHF_K)
+        push            {r4-r8, r10, r11, lr}
+        vpush           {q4-q7}
+        /* stack arguments sit above the 32 + 64 bytes pushed above */
+        ldr r4, [sp, #32+64]
+        ldr r5, [sp, #32+64 + 4]
+
+1:
+        add r7, r1, r4, lsl #4  /* pi = pin + x1 * 16 bytes (one float4 per element) */
+        mov r10, r2
+        mov r11, r3
+        /* first tap initialises the accumulator q0; r11 = taps still to do */
+        vld1.32 {q1}, [r7]!
+        vld1.32 {d6[0]}, [r10]!
+        vmul.f32 q0, q1, d6[0]
+        sub r11, r11, #1
+        /* remaining taps are consumed two at a time */
+2:
+        vld1.32 {q1}, [r7]!
+        vld1.32 {q2}, [r7]!
+        vld1.32 {d6[0]}, [r10]!
+        vld1.32 {d6[1]}, [r10]!
+        vmla.f32 q0, q1, d6[0]
+        vmla.f32 q0, q2, d6[1]
+        subs r11, r11, #2
+        bne 2b
+        /* float -> s32, then two plain narrows (no saturation) down to 4 bytes in d0[0] */
+        vcvt.s32.f32 q0, q0
+        vmovn.u32 d0, q0
+        vmovn.u16 d0, q0
+
+        vst1.32 {d0[0]}, [r0]!
+        add r4, r4, #1
+        cmp r4, r5
+        bne 1b
+
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicBlurHF_K)
+
+/*
+        r0 = dst    (4 bytes per pixel; byte 3 is filled from d3, see vdup below)
+        r1 = Y      (8 bytes consumed per iteration)
+        r2 = VU     (8 interleaved chroma bytes consumed per iteration)
+        r3 = length (pixels / 8); zero is not handled (subs/bne would wrap)
+        [sp] = params: 24 halfwords = multipliers, Y offset, chroma offset (r4)
+
+        This function converts 8 pixels per iteration
+*/
+ENTRY(rsdIntrinsicYuv_K)
+        push            {r4-r8, r10, r11, lr}
+        vpush           {q4-q7}
+        // the stack argument sits above the 32 + 64 bytes pushed above
+        ldr r4, [sp, #32+64]
+        vld1.16 {q2}, [r4]!  // mults
+        vld1.16 {q3}, [r4]!  // y offset
+        vld1.16 {q4}, [r4]!  // 128
+        vdup.8 d3, d5[1]     // constant fourth output byte
+        // NOTE(review): as a .8 lane, d5[1] is byte 1 of d5 and overlaps the halfword read as d5[0] below - confirm
+1:
+        vld1.8 {d10}, [r1]!
+        vld1.8 {d12}, [r2]!
+        vmovl.u8 q5, d10 // Y at .16
+        vmovl.u8 q6, d12 // vu at .16
+        // remove the luma and chroma offsets
+        vsub.i16 q5, q5, q3
+        vsub.i16 q6, q6, q4
+        vuzp.16 d12, d13  // de-interleave: d12 = first component 0..3, d13 = second component 0..3
+        vmov q7, q6
+        vtrn.16 d12, d14
+        vtrn.32 d12, d14  // d12 = samples 0,0,1,1   d14 = samples 2,2,3,3
+        vtrn.16 d13, d15
+        vtrn.32 d13, d15  // d13 = samples 0,0,1,1   d15 = samples 2,2,3,3
+        // vuzp is needed above: vtrn.16 d12, d13 leaves the samples as 0,2,1,3, pairing pixels 2-3 with sample 2
+        vmull.s16 q8, d10, d4[0]
+        vmull.s16 q11, d11, d4[0]
+        vmov q9, q8
+        vmov q10, q8
+        vmov q12, q11
+        vmov q13, q11
+        // pixels 0..3: q8, q9, q10 = scaled luma plus the chroma terms of output bytes 0, 1, 2
+        vmlal.s16 q8,  d12, d4[1]
+        vmlal.s16 q9,  d12, d5[0]
+        vmlal.s16 q10, d13, d4[3]
+        vmlal.s16 q9,  d13, d4[2]
+        // pixels 4..7: same terms in q11, q12, q13
+        vmlal.s16 q11, d14, d4[1]
+        vmlal.s16 q12, d14, d5[0]
+        vmlal.s16 q13, d15, d4[3]
+        vmlal.s16 q12, d15, d4[2]
+
+        // narrow by >> 8, saturate to unsigned bytes and store pixels 0..3
+        vshrn.i32 d16, q8, #8
+        vshrn.i32 d18, q9, #8
+        vshrn.i32 d20, q10, #8
+        vqmovun.s16 d0, q8
+        vqmovun.s16 d1, q9
+        vqmovun.s16 d2, q10
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        // same again for pixels 4..7
+        vshrn.i32 d16, q11, #8
+        vshrn.i32 d18, q12, #8
+        vshrn.i32 d20, q13, #8
+        vqmovun.s16 d0, q8
+        vqmovun.s16 d1, q9
+        vqmovun.s16 d2, q10
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+
+        subs r3, r3, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicYuv_K)
+
+/* Convolve 5x5 */
+
+/*
+        r0 = dst (two 4-byte pixels are written per iteration)
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp]    = y3 base pointer (loaded into r4)
+        [sp+4]  = y4 base pointer (loaded into r5)
+        [sp+8]  = coeffs, signed 16-bit; 28 halfwords are read, 25 are used
+        [sp+12] = length in 2-pixel groups (kept in r6); r7 holds the constant 8
+*/
+ENTRY(rsdIntrinsicConvolve5x5_K)
+        push        {r4-r7, lr}
+        vpush       {q4-q7}
+
+        /* load y3 in r4 (stack arguments sit above the 20 + 64 bytes pushed above) */
+        ldr     r4, [sp, #20 + 64]
+
+        /* load y4 in r5 */
+        ldr     r5, [sp, #24 + 64]
+
+        /* Load the coefficients pointer */
+        ldr     r6, [sp, #28 + 64]
+
+        /* Create the coefficients vector: halfwords 0..15 in d0-d3, 16..27 in d4-d6 */
+        vld1.16     {d0, d1, d2, d3}, [r6]!
+        vld1.16     {d4, d5, d6}, [r6]
+
+        /* load the count (r6 is reused; the coefficient pointer is no longer needed) */
+        ldr     r6, [sp, #32 + 64]
+
+        /* Load the frequently used immediate in a register */
+        mov     r7, #8
+
+1:
+        /* Load 24 bytes (6 pixels) of rows y0 (y - 2) and y1 (y - 1); each pointer advances by r7=#8 */
+        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
+        vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        PLD         (r1, r7)
+        PLD         (r2, r7)
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*
+        Row y0 pixels 0..5 are now in d18..d23, row y1 pixels 0..5 in d24..d29.
+        q4 sums output pixel 0 (input pixels 0..4), q5 output pixel 1 (pixels 1..5).
+*/
+        vmull.s16 q4, d18, d0[0]
+        vmlal.s16 q4, d19, d0[1]
+        vmlal.s16 q4, d20, d0[2]
+        vmlal.s16 q4, d21, d0[3]
+        vmlal.s16 q4, d22, d1[0]
+
+        vmlal.s16 q4, d24, d1[1]
+        vmlal.s16 q4, d25, d1[2]
+        vmlal.s16 q4, d26, d1[3]
+        vmlal.s16 q4, d27, d2[0]
+        vmlal.s16 q4, d28, d2[1]
+
+        vmull.s16 q5, d19, d0[0]
+        vmlal.s16 q5, d20, d0[1]
+        vmlal.s16 q5, d21, d0[2]
+        vmlal.s16 q5, d22, d0[3]
+        vmlal.s16 q5, d23, d1[0]
+
+        vmlal.s16 q5, d25, d1[1]
+        vmlal.s16 q5, d26, d1[2]
+        vmlal.s16 q5, d27, d1[3]
+        vmlal.s16 q5, d28, d2[0]
+        vmlal.s16 q5, d29, d2[1]
+
+
+        /* Next 2 rows */
+        /* Load 24 bytes (6 pixels) of rows y2 (y) and y3 (y + 1); each pointer advances by r7=#8 */
+        vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
+        vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        PLD         (r3, r7)
+        PLD         (r4, r7)
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*
+        Row y2 pixels 0..5 are now in d18..d23, row y3 pixels 0..5 in d24..d29.
+        Coefficients 10..14 (d2[2]..d3[2]) apply to y2, 15..19 (d3[3]..d4[3]) to y3.
+*/
+        vmlal.s16 q4, d18, d2[2]
+        vmlal.s16 q4, d19, d2[3]
+        vmlal.s16 q4, d20, d3[0]
+        vmlal.s16 q4, d21, d3[1]
+        vmlal.s16 q4, d22, d3[2]
+
+        vmlal.s16 q4, d24, d3[3]
+        vmlal.s16 q4, d25, d4[0]
+        vmlal.s16 q4, d26, d4[1]
+        vmlal.s16 q4, d27, d4[2]
+        vmlal.s16 q4, d28, d4[3]
+
+        vmlal.s16 q5, d19, d2[2]
+        vmlal.s16 q5, d20, d2[3]
+        vmlal.s16 q5, d21, d3[0]
+        vmlal.s16 q5, d22, d3[1]
+        vmlal.s16 q5, d23, d3[2]
+
+        vmlal.s16 q5, d25, d3[3]
+        vmlal.s16 q5, d26, d4[0]
+        vmlal.s16 q5, d27, d4[1]
+        vmlal.s16 q5, d28, d4[2]
+        vmlal.s16 q5, d29, d4[3]
+
+        /* Last row */
+        /* Load 24 bytes (6 pixels) of row y4 (y + 2); the pointer advances by r7=#8 */
+        vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )
+
+        /* Signal memory for data that will be used in the loop after the next */
+        PLD         (r5, r7)
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+
+/*
+        Row y4 pixels 0..5 are now in d18..d23.
+        Coefficients 20..24 (d5[0]..d6[0]) apply to this row.
+*/
+
+        vmlal.s16 q4, d18, d5[0]
+        vmlal.s16 q4, d19, d5[1]
+        vmlal.s16 q4, d20, d5[2]
+        vmlal.s16 q4, d21, d5[3]
+        vmlal.s16 q4, d22, d6[0]
+
+        vmlal.s16 q5, d19, d5[0]
+        vmlal.s16 q5, d20, d5[1]
+        vmlal.s16 q5, d21, d5[2]
+        vmlal.s16 q5, d22, d5[3]
+        vmlal.s16 q5, d23, d6[0]
+
+
+
+
+/*      Narrow it to a d-reg 32 -> 16 bit (sums are scaled down by >> 8) */
+        vshrn.i32 d8, q4, #8
+        vshrn.i32 d9, q5, #8
+
+/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+        vqmovun.s16 d8, q4
+
+        vst1.8 d8, [r0]!           @ return the output and increase the address of r0
+
+        /* Are we done? (a count of zero is not handled: subs/bne would wrap) */
+        subs r6, r6, #1
+        bne 1b
+
+        /* Yup, bye */
+        vpop        {q4-q7}
+        pop         {r4-r7, lr}
+        bx          lr
+
+END(rsdIntrinsicConvolve5x5_K)
+
+
+
+
+/*
+        dst = src + dst * (1.0 - src.a)
+        Fixed point: (src * 256 + dst * (255 - src.a)) >> 8, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendSrcOver_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vshll.u8 q12, d0, #8
+        vshll.u8 q13, d1, #8
+        vshll.u8 q14, d2, #8
+        vmovl.u8 q6, d3
+        vsub.i16 q6, q7, q6        // q6 = 1 - src.a
+        vshll.u8 q15, d3, #8
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+        /* q12..q15 (src << 8) += dst * (255 - src.a); 16-bit lanes, no saturation */
+        vmla.i16 q12, q8, q6
+        vmla.i16 q13, q9, q6
+        vmla.i16 q14, q10, q6
+        vmla.i16 q15, q11, q6
+        /* keep the high byte of every 16-bit lane */
+        vshrn.i16 d0, q12, #8
+        vshrn.i16 d1, q13, #8
+        vshrn.i16 d2, q14, #8
+        vshrn.i16 d3, q15, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendSrcOver_K)
+
+/*
+        dst = dst + src * (1.0 - dst.a)
+        Fixed point: (dst * 256 + src * (255 - dst.a)) >> 8, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendDstOver_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vshll.u8 q8, d0, #8
+        vshll.u8 q9, d1, #8
+        vshll.u8 q10, d2, #8
+        vmovl.u8 q6, d3
+        vsub.i16 q6, q7, q6        // q6 = 1 - dst.a
+        vshll.u8 q11, d3, #8
+
+        /* q8..q11 (dst << 8) += src * (255 - dst.a); 16-bit lanes, no saturation */
+        vmla.i16 q8, q12, q6
+        vmla.i16 q9, q13, q6
+        vmla.i16 q10, q14, q6
+        vmla.i16 q11, q15, q6
+        /* keep the high byte of every 16-bit lane */
+        vshrn.i16 d0, q8, #8
+        vshrn.i16 d1, q9, #8
+        vshrn.i16 d2, q10, #8
+        vshrn.i16 d3, q11, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendDstOver_K)
+
+/*
+        dst = src * dst.a
+        Fixed point: (src * dst.a) >> 8 on all four bytes, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendSrcIn_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q4-q7 are saved and restored although this routine never references them */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        //vmovl.u8 q8, d0
+        //vmovl.u8 q9, d1
+        //vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+        /* only dst.a (q11) is needed from dst; every src byte is scaled by it */
+        vmul.i16 q12, q12, q11
+        vmul.i16 q13, q13, q11
+        vmul.i16 q14, q14, q11
+        vmul.i16 q15, q15, q11
+        /* keep the high byte of every 16-bit product */
+        vshrn.i16 d0, q12, #8
+        vshrn.i16 d1, q13, #8
+        vshrn.i16 d2, q14, #8
+        vshrn.i16 d3, q15, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendSrcIn_K)
+
+/*
+        dst = dst * src.a
+        Fixed point: (dst * src.a) >> 8 on all four bytes, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendDstIn_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q4-q7 are saved and restored although this routine never references them */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        //vmovl.u8 q12, d0
+        //vmovl.u8 q13, d1
+        //vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+        /* only src.a (q15) is needed from src; every dst byte is scaled by it */
+        vmul.i16 q8, q8, q15
+        vmul.i16 q9, q9, q15
+        vmul.i16 q10, q10, q15
+        vmul.i16 q11, q11, q15
+        /* keep the high byte of every 16-bit product */
+        vshrn.i16 d0, q8, #8
+        vshrn.i16 d1, q9, #8
+        vshrn.i16 d2, q10, #8
+        vshrn.i16 d3, q11, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendDstIn_K)
+
+
+
+/*
+        dst = src * (1.0 - dst.a)
+        Fixed point: (src * (255 - dst.a)) >> 8 on all four bytes, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendSrcOut_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        //vmovl.u8 q8, d0
+        //vmovl.u8 q9, d1
+        //vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* only dst.a (q11) is needed from dst */
+        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
+        vmul.i16 q12, q12, q6
+        vmul.i16 q13, q13, q6
+        vmul.i16 q14, q14, q6
+        vmul.i16 q15, q15, q6
+        /* keep the high byte of every 16-bit product */
+        vshrn.i16 d0, q12, #8
+        vshrn.i16 d1, q13, #8
+        vshrn.i16 d2, q14, #8
+        vshrn.i16 d3, q15, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendSrcOut_K)
+
+
+/*
+        dst = dst * (1.0 - src.a)
+        Fixed point: (dst * (255 - src.a)) >> 8 on all four bytes, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendDstOut_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        //vmovl.u8 q12, d0
+        //vmovl.u8 q13, d1
+        //vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* only src.a (q15) is needed from src; q12..q15 are then reused for the products */
+        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
+        vmul.i16 q12, q8, q6
+        vmul.i16 q13, q9, q6
+        vmul.i16 q14, q10, q6
+        vmul.i16 q15, q11, q6
+        /* keep the high byte of every 16-bit product */
+        vshrn.i16 d0, q12, #8
+        vshrn.i16 d1, q13, #8
+        vshrn.i16 d2, q14, #8
+        vshrn.i16 d3, q15, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendDstOut_K)
+
+
+/*
+        dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
+        dst.a = dst.a
+        Fixed point: (dst * (255 - src.a) + src * dst.a) >> 8, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendSrcAtop_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* q8..q10 = dst.rgb * (255 - src.a) */
+        vsub.i16 q6, q7, q15        // q6 = 1 - src.a
+        vmul.i16 q8, q8, q6
+        vmul.i16 q9, q9, q6
+        vmul.i16 q10, q10, q6
+        /* ... += src.rgb * dst.a; 16-bit lanes, no saturation */
+        vmla.i16 q8, q12, q11
+        vmla.i16 q9, q13, q11
+        vmla.i16 q10, q14, q11
+
+        /* d3 is left untouched, so the alpha byte loaded from dst is stored back as-is */
+        vshrn.i16 d0, q8, #8
+        vshrn.i16 d1, q9, #8
+        vshrn.i16 d2, q10, #8
+        //vshrn.i16 d3, q15, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendSrcAtop_K)
+
+/*
+        dst.rgb = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
+        dst.a = src.a
+        Fixed point: (src * (255 - dst.a) + dst * src.a) >> 8, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendDstAtop_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 = 255 in every 16-bit lane */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* q12..q14 = src.rgb * (255 - dst.a) */
+        vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
+        vmul.i16 q12, q12, q6
+        vmul.i16 q13, q13, q6
+        vmul.i16 q14, q14, q6
+        /* ... += dst.rgb * src.a; 16-bit lanes, no saturation */
+        vmla.i16 q12, q8, q15
+        vmla.i16 q13, q9, q15
+        vmla.i16 q14, q10, q15
+
+        /* d3 held the alpha loaded from dst; replace it with src.a as the header specifies */
+        vshrn.i16 d0, q12, #8
+        vshrn.i16 d1, q13, #8
+        vshrn.i16 d2, q14, #8
+        vmovn.i16 d3, q15           // dst.a = src.a (q15 is the unmodified widened src alpha)
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendDstAtop_K)
+
+/*
+        dst = dst ^ src
+        Bytewise exclusive-or of all four bytes of each pixel, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendXor_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 is set to 255 per lane here but is never read in this routine */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmov.u8 d4, d0
+        vmov.u8 d5, d1
+        vmov.u8 d6, d2
+        vmov.u8 d7, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        /* d0..d3 = dst bytes, d4..d7 = the src copy made above */
+        veor d0, d0, d4
+        veor d1, d1, d5
+        veor d2, d2, d6
+        veor d3, d3, d7
+
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendXor_K)
+
+/*
+        dst = dst * src
+        Fixed point: (dst * src) >> 8 on all four bytes, 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendMultiply_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 is set to 255 per lane here but is never read in this routine */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* per-byte product of dst (q8..q11) and src (q12..q15) in 16-bit lanes */
+        vmul.i16 q8, q8, q12
+        vmul.i16 q9, q9, q13
+        vmul.i16 q10, q10, q14
+        vmul.i16 q11, q11, q15
+        /* keep the high byte of every 16-bit product */
+        vshrn.i16 d0, q8, #8
+        vshrn.i16 d1, q9, #8
+        vshrn.i16 d2, q10, #8
+        vshrn.i16 d3, q11, #8
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendMultiply_K)
+
+/*
+        dst = min(src + dst, 1.0)
+        Bytes are summed in 16-bit lanes, then saturated back to 0..255; 8 pixels per iteration
+        r0 = dst (read through r0, written back through the copy in r4)
+        r1 = src
+        r2 = length in 8-pixel groups; zero is not handled (subs/bne would wrap)
+*/
+ENTRY(rsdIntrinsicBlendAdd_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+        vpush           {q4-q7}
+        /* q7 is set to 255 per lane here but is never read in this routine */
+        mov r4, #255
+        vdup.16 q7, r4
+        /* r4 = store pointer; r0 advances while dst is being read */
+        mov r4, r0
+1:
+
+        /* src */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+        vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
+        vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
+        vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
+        vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* sums are at most 510, so they fit the 16-bit lanes */
+        vadd.i16 q8, q8, q12
+        vadd.i16 q9, q9, q13
+        vadd.i16 q10, q10, q14
+        vadd.i16 q11, q11, q15
+        /* saturating narrow clamps each sum to 255 */
+        vqmovun.s16 d0, q8
+        vqmovun.s16 d1, q9
+        vqmovun.s16 d2, q10
+        vqmovun.s16 d3, q11
+        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
+        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
+        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
+        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
+        vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
+        vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
+        vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
+        vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        vpop            {q4-q7}
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendAdd_K)
+
+
+/*
+        dst = max(dst - src, 0.0)
+
+        r0 = dst (read, then overwritten in place)
+        r1 = src
+        r2 = length, counted in groups of eight 4-byte elements (32 bytes per
+             iteration); must be non-zero because the counter is only tested
+             after the first iteration has run
+
+        Each iteration widens the four byte channels of eight elements to
+        16 bits, subtracts src from dst, and narrows the differences back to
+        bytes with unsigned saturation, which clamps negative results to 0.
+*/
+ENTRY(rsdIntrinsicBlendSub_K)
+        .save           {r4, lr}
+        stmfd           sp!, {r4, lr}
+
+        /* r0 is advanced by the dst loads, so r4 tracks the store position */
+        mov r4, r0
+1:
+
+        /* src: eight elements, de-interleaved into one d register per channel */
+        vld4.8 {d0, d1, d2, d3}, [r1]!
+        vmovl.u8 q12, d0
+        vmovl.u8 q13, d1
+        vmovl.u8 q14, d2
+        vmovl.u8 q15, d3
+
+        /* dst */
+        vld4.8 {d0, d1, d2, d3}, [r0]!
+        vmovl.u8 q8, d0
+        vmovl.u8 q9, d1
+        vmovl.u8 q10, d2
+        vmovl.u8 q11, d3
+
+        /* dst - src; lanes go negative (as signed 16-bit) when src > dst */
+        vsub.i16 q8, q8, q12
+        vsub.i16 q9, q9, q13
+        vsub.i16 q10, q10, q14
+        vsub.i16 q11, q11, q15
+
+        /* signed-to-unsigned saturating narrow: negative lanes become 0 */
+        vqmovun.s16 d0, q8
+        vqmovun.s16 d1, q9
+        vqmovun.s16 d2, q10
+        vqmovun.s16 d3, q11
+        vst4.8 {d0, d1, d2, d3}, [r4]!
+
+        subs r2, r2, #1
+        bne 1b
+
+        ldmfd           sp!, {r4, lr}
+        bx              lr
+END(rsdIntrinsicBlendSub_K)
+
diff --git a/cpu_ref/rsCpuRuntimeMath.cpp b/cpu_ref/rsCpuRuntimeMath.cpp
new file mode 100644
index 0000000..cf2c8a4
--- /dev/null
+++ b/cpu_ref/rsCpuRuntimeMath.cpp
@@ -0,0 +1,546 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cutils/compiler.h>
+
+#include "rsContext.h"
+#include "rsScriptC.h"
+#include "rsMatrix4x4.h"
+#include "rsMatrix3x3.h"
+#include "rsMatrix2x2.h"
+
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+
+
+using namespace android;
+using namespace android::renderscript;
+
+
+// Returns 10 raised to the power v.
+static float SC_exp10(float v) {
+    const float base = 10.f;
+    return pow(base, v);
+}
+
+// Splits v into floor(v), written to iptr[0], and the remainder v - floor(v),
+// which is returned. The remainder is capped at the largest float below 1.0 so
+// that rounding (e.g. for tiny negative v) never produces exactly 1.0.
+static float SC_fract(float v, float *iptr) {
+    // Stay in float: converting floor(v) to int is undefined once the value no
+    // longer fits in an int (large magnitudes, infinities, NaN).
+    const float whole = floorf(v);
+    iptr[0] = whole;
+    return fminf(v - whole, 0x1.fffffep-1f);
+}
+
+// Base-2 logarithm computed by change of base: log10(v) / log10(2).
+static float SC_log2(float v) {
+    const float two = 2.f;
+    return log10(v) / log10(two);
+}
+
+// Disabled: pown/powr are compiled out and have no entries in the gSyms table.
+#if 0
+static float SC_pown(float v, int p) {
+    return powf(v, (float)p);
+}
+
+static float SC_powr(float v, float p) {
+    return powf(v, p);
+}
+#endif
+
+// r-th root of v, evaluated as v raised to the power 1/r.
+float SC_rootn(float v, int r) {
+    const float exponent = 1.f / r;
+    return pow(v, exponent);
+}
+
+// Reciprocal square root: 1 / sqrt(v).
+float SC_rsqrt(float v) {
+    const float root = sqrtf(v);
+    return 1.f / root;
+}
+
+// Returns sin(v) and stores cos(v) through cosptr.
+float SC_sincos(float v, float *cosptr) {
+    const float sine = sinf(v);
+    *cosptr = cosf(v);
+    return sine;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Integer
+//////////////////////////////////////////////////////////////////////////////
+
+
+// Absolute value helpers. Results are unsigned, so the magnitude of the most
+// negative input of each width is representable.
+static uint32_t SC_abs_i32(int32_t v) {
+    // Negate in unsigned arithmetic: abs(INT32_MIN) overflows int, which is
+    // undefined behaviour, whereas 0u - bits wraps to the correct magnitude.
+    const uint32_t bits = (uint32_t)v;
+    return v < 0 ? 0u - bits : bits;
+}
+// The narrower inputs are promoted to int before abs(), so they cannot overflow.
+static uint16_t SC_abs_i16(int16_t v) {return (uint16_t)abs(v);}
+static uint8_t SC_abs_i8(int8_t v) {return (uint8_t)abs(v);}
+
+// Count of leading zero bits, measured within the width of the argument type
+// (so an 8-bit 0x03 yields 6). __builtin_clz works on a 32-bit unsigned int:
+// narrower operands are promoted, so the extra 16 or 24 leading zeros are
+// subtracted again. Zero is handled explicitly because __builtin_clz(0) is
+// undefined; it returns the full bit width of the type.
+static uint32_t SC_clz_u32(uint32_t v) {return v ? (uint32_t)__builtin_clz(v) : 32u;}
+static uint16_t SC_clz_u16(uint16_t v) {return (uint16_t)(v ? __builtin_clz(v) - 16 : 16);}
+static uint8_t SC_clz_u8(uint8_t v) {return (uint8_t)(v ? __builtin_clz(v) - 24 : 8);}
+// The signed variants count on the same bit pattern reinterpreted as unsigned,
+// so any negative input (top bit set) yields 0.
+static int32_t SC_clz_i32(int32_t v) {return (int32_t)SC_clz_u32((uint32_t)v);}
+static int16_t SC_clz_i16(int16_t v) {return (int16_t)SC_clz_u16((uint16_t)v);}
+static int8_t SC_clz_i8(int8_t v) {return (int8_t)SC_clz_u8((uint8_t)v);}
+
+// Integer max(): one wrapper per element type, each forwarding to rsMax.
+// gSyms binds these to the mangled max() overloads.
+static uint32_t SC_max_u32(uint32_t v, uint32_t v2) {return rsMax(v, v2);}
+static uint16_t SC_max_u16(uint16_t v, uint16_t v2) {return rsMax(v, v2);}
+static uint8_t SC_max_u8(uint8_t v, uint8_t v2) {return rsMax(v, v2);}
+static int32_t SC_max_i32(int32_t v, int32_t v2) {return rsMax(v, v2);}
+static int16_t SC_max_i16(int16_t v, int16_t v2) {return rsMax(v, v2);}
+static int8_t SC_max_i8(int8_t v, int8_t v2) {return rsMax(v, v2);}
+
+// Integer min(): same layout as above, forwarding to rsMin.
+static uint32_t SC_min_u32(uint32_t v, uint32_t v2) {return rsMin(v, v2);}
+static uint16_t SC_min_u16(uint16_t v, uint16_t v2) {return rsMin(v, v2);}
+static uint8_t SC_min_u8(uint8_t v, uint8_t v2) {return rsMin(v, v2);}
+static int32_t SC_min_i32(int32_t v, int32_t v2) {return rsMin(v, v2);}
+static int16_t SC_min_i16(int16_t v, int16_t v2) {return rsMin(v, v2);}
+static int8_t SC_min_i8(int8_t v, int8_t v2) {return rsMin(v, v2);}
+
+//////////////////////////////////////////////////////////////////////////////
+// Float util
+//////////////////////////////////////////////////////////////////////////////
+
+// Limits amount to the range [low, high]. The lower bound is checked first, so
+// it wins if low > high; a NaN amount fails both comparisons and is returned
+// unchanged.
+static float SC_clamp_f32(float amount, float low, float high) {
+    if (amount < low) {
+        return low;
+    }
+    if (amount > high) {
+        return high;
+    }
+    return amount;
+}
+
+// float max(): forwards to rsMax.
+static float SC_max_f32(float v, float v2) {
+    return rsMax(v, v2);
+}
+
+// float min(): forwards to rsMin.
+static float SC_min_f32(float v, float v2) {
+    return rsMin(v, v2);
+}
+
+// Step function: 0 while v is below edge, 1 otherwise (including when the
+// comparison is false because either operand is NaN).
+static float SC_step_f32(float edge, float v) {
+    return (v < edge) ? 0.f : 1.f;
+}
+
+// Sign of value: 1 for positive, -1 for negative. Zero (of either sign) and NaN
+// fail both comparisons and are returned as-is.
+static float SC_sign_f32(float value) {
+    return (value > 0) ? 1.f
+         : (value < 0) ? -1.f
+         : value;
+}
+
+// Matrix load entry points. Each wrapper forwards to the member function of the
+// same name on the destination matrix; gSyms binds them to the mangled
+// rsMatrixLoadIdentity / rsMatrixLoad overloads.
+
+// loadIdentity() for each matrix size.
+static void SC_MatrixLoadIdentity_4x4(Matrix4x4 *m) {
+    m->loadIdentity();
+}
+static void SC_MatrixLoadIdentity_3x3(Matrix3x3 *m) {
+    m->loadIdentity();
+}
+static void SC_MatrixLoadIdentity_2x2(Matrix2x2 *m) {
+    m->loadIdentity();
+}
+
+// load() from a raw float array.
+static void SC_MatrixLoad_4x4_f(Matrix4x4 *m, const float *f) {
+    m->load(f);
+}
+static void SC_MatrixLoad_3x3_f(Matrix3x3 *m, const float *f) {
+    m->load(f);
+}
+static void SC_MatrixLoad_2x2_f(Matrix2x2 *m, const float *f) {
+    m->load(f);
+}
+
+// load() from another matrix; a 4x4 destination also accepts 3x3 and 2x2 sources.
+static void SC_MatrixLoad_4x4_4x4(Matrix4x4 *m, const Matrix4x4 *s) {
+    m->load(s);
+}
+static void SC_MatrixLoad_4x4_3x3(Matrix4x4 *m, const Matrix3x3 *s) {
+    m->load(s);
+}
+static void SC_MatrixLoad_4x4_2x2(Matrix4x4 *m, const Matrix2x2 *s) {
+    m->load(s);
+}
+static void SC_MatrixLoad_3x3_3x3(Matrix3x3 *m, const Matrix3x3 *s) {
+    m->load(s);
+}
+static void SC_MatrixLoad_2x2_2x2(Matrix2x2 *m, const Matrix2x2 *s) {
+    m->load(s);
+}
+
+// 4x4 transform entry points. The SC_MatrixLoad* variants call the load*()
+// members; the others call rotate()/scale()/translate() on the existing matrix.
+// NOTE(review): the unit of `rot` is defined by Matrix4x4 — confirm there.
+static void SC_MatrixLoadRotate(Matrix4x4 *m, float rot, float x, float y, float z) {
+    m->loadRotate(rot, x, y, z);
+}
+static void SC_MatrixLoadScale(Matrix4x4 *m, float x, float y, float z) {
+    m->loadScale(x, y, z);
+}
+static void SC_MatrixLoadTranslate(Matrix4x4 *m, float x, float y, float z) {
+    m->loadTranslate(x, y, z);
+}
+static void SC_MatrixRotate(Matrix4x4 *m, float rot, float x, float y, float z) {
+    m->rotate(rot, x, y, z);
+}
+static void SC_MatrixScale(Matrix4x4 *m, float x, float y, float z) {
+    m->scale(x, y, z);
+}
+static void SC_MatrixTranslate(Matrix4x4 *m, float x, float y, float z) {
+    m->translate(x, y, z);
+}
+
+// Matrix multiplication entry points.
+
+// Three-operand form: forwards to m->loadMultiply(lhs, rhs).
+static void SC_MatrixLoadMultiply_4x4_4x4_4x4(Matrix4x4 *m, const Matrix4x4 *lhs, const Matrix4x4 *rhs) {
+    m->loadMultiply(lhs, rhs);
+}
+static void SC_MatrixLoadMultiply_3x3_3x3_3x3(Matrix3x3 *m, const Matrix3x3 *lhs, const Matrix3x3 *rhs) {
+    m->loadMultiply(lhs, rhs);
+}
+static void SC_MatrixLoadMultiply_2x2_2x2_2x2(Matrix2x2 *m, const Matrix2x2 *lhs, const Matrix2x2 *rhs) {
+    m->loadMultiply(lhs, rhs);
+}
+
+// Two-operand form: forwards to m->multiply(rhs), which updates m itself.
+static void SC_MatrixMultiply_4x4_4x4(Matrix4x4 *m, const Matrix4x4 *rhs) {
+    m->multiply(rhs);
+}
+static void SC_MatrixMultiply_3x3_3x3(Matrix3x3 *m, const Matrix3x3 *rhs) {
+    m->multiply(rhs);
+}
+static void SC_MatrixMultiply_2x2_2x2(Matrix2x2 *m, const Matrix2x2 *rhs) {
+    m->multiply(rhs);
+}
+
+// Projection entry points for 4x4 matrices; arguments are passed through
+// unchanged, in the same order, to loadOrtho / loadFrustum / loadPerspective.
+// NOTE(review): l/r/b/t/n/f presumably mean left/right/bottom/top/near/far —
+// confirm against Matrix4x4.
+static void SC_MatrixLoadOrtho(Matrix4x4 *m, float l, float r, float b, float t, float n, float f) {
+    m->loadOrtho(l, r, b, t, n, f);
+}
+static void SC_MatrixLoadFrustum(Matrix4x4 *m, float l, float r, float b, float t, float n, float f) {
+    m->loadFrustum(l, r, b, t, n, f);
+}
+static void SC_MatrixLoadPerspective(Matrix4x4 *m, float fovy, float aspect, float near, float far) {
+    m->loadPerspective(fovy, aspect, near, far);
+}
+
+// In-place inverse / transpose entry points. The two inverse variants return
+// the bool produced by the Matrix4x4 call.
+// NOTE(review): presumably false means the matrix was not invertible — confirm
+// against Matrix4x4::inverse().
+static bool SC_MatrixInverse_4x4(Matrix4x4 *m) {
+    return m->inverse();
+}
+static bool SC_MatrixInverseTranspose_4x4(Matrix4x4 *m) {
+    return m->inverseTranspose();
+}
+static void SC_MatrixTranspose_4x4(Matrix4x4 *m) {
+    m->transpose();
+}
+static void SC_MatrixTranspose_3x3(Matrix3x3 *m) {
+    m->transpose();
+}
+static void SC_MatrixTranspose_2x2(Matrix2x2 *m) {
+    m->transpose();
+}
+
+// Random float built from rand(): the raw value is multiplied by max first and
+// then divided by RAND_MAX.
+static float SC_randf(float max) {
+    const float scaled = (float)rand() * max;
+    return scaled / RAND_MAX;
+}
+
+// Random float between min and max: rand() is normalised by RAND_MAX and then
+// mapped onto the requested span.
+static float SC_randf2(float min, float max) {
+    const float unit = (float)rand() / RAND_MAX;
+    return unit * (max - min) + min;
+}
+
+// Integer variant: the bound is converted to float, and the float result is
+// truncated towards zero.
+static int SC_randi(int max) {
+    const float sample = SC_randf(max);
+    return (int)sample;
+}
+
+// Integer variant of the two-bound form, truncated the same way.
+static int SC_randi2(int min, int max) {
+    const float sample = SC_randf2(min, max);
+    return (int)sample;
+}
+
+// Returns the fractional part v - floor(v), capped at the largest float below
+// 1.0 so that rounding (e.g. for tiny negative v) never produces exactly 1.0.
+static float SC_frac(float v) {
+    // Stay in float: converting floor(v) to int is undefined once the value no
+    // longer fits in an int (large magnitudes, infinities, NaN).
+    const float whole = floorf(v);
+    return fminf(v - whole, 0x1.fffffep-1f);
+}
+
+
+// Compare-and-swap for scripts. Tries to replace *ptr with newValue when it
+// currently holds expectedValue. Returns expectedValue if the swap happened,
+// otherwise a value read from *ptr that differs from expectedValue.
+static int32_t SC_AtomicCas(volatile int32_t *ptr, int32_t expectedValue, int32_t newValue) {
+    int32_t prev;
+
+    do {
+        int32_t ret = android_atomic_release_cas(expectedValue, newValue, ptr);
+        if (!ret) {
+            // The android cas return 0 if it wrote the value.  This means the
+            // previous value was the expected value and we can return.
+            return expectedValue;
+        }
+        // We didn't write the value and need to load the "previous" value.
+        prev = *ptr;
+
+        // A race condition exists where the expected value could appear after our cas failed
+        // above.  In this case loop until we have a legit previous value or the
+        // write passes.
+        } while (prev == expectedValue);
+    return prev;
+}
+
+
+// Thin wrappers over the cutils atomics; each returns whatever the underlying
+// android_atomic_* call returns.
+// NOTE(review): presumably that is the value held before the update — confirm
+// against cutils/atomic.h.
+static int32_t SC_AtomicInc(volatile int32_t *ptr) {
+    return android_atomic_inc(ptr);
+}
+
+static int32_t SC_AtomicDec(volatile int32_t *ptr) {
+    return android_atomic_dec(ptr);
+}
+
+// Note the argument order: the cutils call takes the operand first, then the address.
+static int32_t SC_AtomicAdd(volatile int32_t *ptr, int32_t value) {
+    return android_atomic_add(value, ptr);
+}
+
+// Atomic subtract built as a compare-and-swap retry loop: read *ptr, try to
+// swap in prev - value, and repeat while the cas reports a non-zero status.
+// Returns the value read by the iteration whose swap succeeded.
+static int32_t SC_AtomicSub(volatile int32_t *ptr, int32_t value) {
+    int32_t prev, status;
+    do {
+        prev = *ptr;
+        status = android_atomic_release_cas(prev, prev - value, ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+// Forwards to android_atomic_and (operand first, then address).
+static int32_t SC_AtomicAnd(volatile int32_t *ptr, int32_t value) {
+    return android_atomic_and(value, ptr);
+}
+
+// Forwards to android_atomic_or (operand first, then address).
+static int32_t SC_AtomicOr(volatile int32_t *ptr, int32_t value) {
+    return android_atomic_or(value, ptr);
+}
+
+// Atomic xor, using the same compare-and-swap retry loop as SC_AtomicSub.
+static int32_t SC_AtomicXor(volatile int32_t *ptr, int32_t value) {
+    int32_t prev, status;
+    do {
+        prev = *ptr;
+        status = android_atomic_release_cas(prev, prev ^ value, ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+// Atomic min/max. All four share one shape: read *ptr, compute the min or max
+// of that value and the operand, try to swap it in with
+// android_atomic_release_cas, and retry while the cas reports a non-zero
+// status. Each returns the value read by the iteration whose swap succeeded.
+
+// Unsigned min: the comparison is done on uint32_t; the values are cast to
+// int32_t only to match the cas signature.
+static uint32_t SC_AtomicUMin(volatile uint32_t *ptr, uint32_t value) {
+    uint32_t prev, status;
+    do {
+        prev = *ptr;
+        uint32_t n = rsMin(value, prev);
+        status = android_atomic_release_cas((int32_t) prev, (int32_t)n, (volatile int32_t*) ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+// Signed min.
+static int32_t SC_AtomicMin(volatile int32_t *ptr, int32_t value) {
+    int32_t prev, status;
+    do {
+        prev = *ptr;
+        int32_t n = rsMin(value, prev);
+        status = android_atomic_release_cas(prev, n, ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+// Unsigned max: compared as uint32_t, cast only for the cas call.
+static uint32_t SC_AtomicUMax(volatile uint32_t *ptr, uint32_t value) {
+    uint32_t prev, status;
+    do {
+        prev = *ptr;
+        uint32_t n = rsMax(value, prev);
+        status = android_atomic_release_cas((int32_t) prev, (int32_t) n, (volatile int32_t*) ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+// Signed max.
+static int32_t SC_AtomicMax(volatile int32_t *ptr, int32_t value) {
+    int32_t prev, status;
+    do {
+        prev = *ptr;
+        int32_t n = rsMax(value, prev);
+        status = android_atomic_release_cas(prev, n, ptr);
+    } while (CC_UNLIKELY(status != 0));
+    return prev;
+}
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Class implementation
+//////////////////////////////////////////////////////////////////////////////
+
+// llvm name mangling ref
+//  <builtin-type> ::= v  # void
+//                 ::= b  # bool
+//                 ::= c  # char
+//                 ::= a  # signed char
+//                 ::= h  # unsigned char
+//                 ::= s  # short
+//                 ::= t  # unsigned short
+//                 ::= i  # int
+//                 ::= j  # unsigned int
+//                 ::= l  # long
+//                 ::= m  # unsigned long
+//                 ::= x  # long long, __int64
+//                 ::= y  # unsigned long long, unsigned __int64
+//                 ::= f  # float
+//                 ::= d  # double
+
+// Symbol table for the math runtime: each entry pairs a mangled script-side
+// name with the host function that implements it. The list ends with an
+// all-NULL entry; lookupSymbolMath stops at the first NULL fnPtr.
+// NOTE(review): the third field is `true` for every real entry; its meaning is
+// defined by RsdCpuReference::CpuSymbol — presumably a thread-safety flag,
+// confirm against that declaration.
+static RsdCpuReference::CpuSymbol gSyms[] = {
+    // libm functions, bound directly except where a local SC_* helper is used
+    { "_Z4acosf", (void *)&acosf, true },
+    { "_Z5acoshf", (void *)&acoshf, true },
+    { "_Z4asinf", (void *)&asinf, true },
+    { "_Z5asinhf", (void *)&asinhf, true },
+    { "_Z4atanf", (void *)&atanf, true },
+    { "_Z5atan2ff", (void *)&atan2f, true },
+    { "_Z5atanhf", (void *)&atanhf, true },
+    { "_Z4cbrtf", (void *)&cbrtf, true },
+    { "_Z4ceilf", (void *)&ceilf, true },
+    { "_Z8copysignff", (void *)&copysignf, true },
+    { "_Z3cosf", (void *)&cosf, true },
+    { "_Z4coshf", (void *)&coshf, true },
+    { "_Z4erfcf", (void *)&erfcf, true },
+    { "_Z3erff", (void *)&erff, true },
+    { "_Z3expf", (void *)&expf, true },
+    { "_Z4exp2f", (void *)&exp2f, true },
+    { "_Z5exp10f", (void *)&SC_exp10, true },
+    { "_Z5expm1f", (void *)&expm1f, true },
+    { "_Z4fabsf", (void *)&fabsf, true },
+    { "_Z4fdimff", (void *)&fdimf, true },
+    { "_Z5floorf", (void *)&floorf, true },
+    { "_Z3fmafff", (void *)&fmaf, true },
+    { "_Z4fmaxff", (void *)&fmaxf, true },
+    { "_Z4fminff", (void *)&fminf, true },  // float fmin(float, float)
+    { "_Z4fmodff", (void *)&fmodf, true },
+    { "_Z5fractfPf", (void *)&SC_fract, true },
+    { "_Z5frexpfPi", (void *)&frexpf, true },
+    { "_Z5hypotff", (void *)&hypotf, true },
+    { "_Z5ilogbf", (void *)&ilogbf, true },
+    { "_Z5ldexpfi", (void *)&ldexpf, true },
+    { "_Z6lgammaf", (void *)&lgammaf, true },
+    { "_Z6lgammafPi", (void *)&lgammaf_r, true },
+    { "_Z3logf", (void *)&logf, true },
+    { "_Z4log2f", (void *)&SC_log2, true },
+    { "_Z5log10f", (void *)&log10f, true },
+    { "_Z5log1pf", (void *)&log1pf, true },
+    { "_Z4logbf", (void *)&logbf, true },
+    { "_Z4modffPf", (void *)&modff, true },
+    //{ "_Z3nanj", (void *)&SC_nan, true },
+    { "_Z9nextafterff", (void *)&nextafterf, true },
+    { "_Z3powff", (void *)&powf, true },
+    { "_Z9remainderff", (void *)&remainderf, true },
+    { "_Z6remquoffPi", (void *)&remquof, true },
+    { "_Z4rintf", (void *)&rintf, true },
+    { "_Z5rootnfi", (void *)&SC_rootn, true },
+    { "_Z5roundf", (void *)&roundf, true },
+    { "_Z5rsqrtf", (void *)&SC_rsqrt, true },
+    { "_Z3sinf", (void *)&sinf, true },
+    { "_Z6sincosfPf", (void *)&SC_sincos, true },
+    { "_Z4sinhf", (void *)&sinhf, true },
+    { "_Z4sqrtf", (void *)&sqrtf, true },
+    { "_Z3tanf", (void *)&tanf, true },
+    { "_Z4tanhf", (void *)&tanhf, true },
+    { "_Z6tgammaf", (void *)&tgammaf, true },
+    { "_Z5truncf", (void *)&truncf, true },
+
+    // integer abs / clz / max / min, one entry per element type
+    { "_Z3absi", (void *)&SC_abs_i32, true },
+    { "_Z3abss", (void *)&SC_abs_i16, true },
+    { "_Z3absc", (void *)&SC_abs_i8, true },
+    { "_Z3clzj", (void *)&SC_clz_u32, true },
+    { "_Z3clzt", (void *)&SC_clz_u16, true },
+    { "_Z3clzh", (void *)&SC_clz_u8, true },
+    { "_Z3clzi", (void *)&SC_clz_i32, true },
+    { "_Z3clzs", (void *)&SC_clz_i16, true },
+    { "_Z3clzc", (void *)&SC_clz_i8, true },
+    { "_Z3maxjj", (void *)&SC_max_u32, true },
+    { "_Z3maxtt", (void *)&SC_max_u16, true },
+    { "_Z3maxhh", (void *)&SC_max_u8, true },
+    { "_Z3maxii", (void *)&SC_max_i32, true },
+    { "_Z3maxss", (void *)&SC_max_i16, true },
+    { "_Z3maxcc", (void *)&SC_max_i8, true },
+    { "_Z3minjj", (void *)&SC_min_u32, true },
+    { "_Z3mintt", (void *)&SC_min_u16, true },
+    { "_Z3minhh", (void *)&SC_min_u8, true },
+    { "_Z3minii", (void *)&SC_min_i32, true },
+    { "_Z3minss", (void *)&SC_min_i16, true },
+    { "_Z3mincc", (void *)&SC_min_i8, true },
+
+    // float utilities
+    { "_Z5clampfff", (void *)&SC_clamp_f32, true },
+    { "_Z3maxff", (void *)&SC_max_f32, true },
+    { "_Z3minff", (void *)&SC_min_f32, true },
+    { "_Z4stepff", (void *)&SC_step_f32, true },
+    //{ "smoothstep", (void *)&, true },
+    { "_Z4signf", (void *)&SC_sign_f32, true },
+
+    // matrix
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix4x4", (void *)&SC_MatrixLoadIdentity_4x4, true },
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix3x3", (void *)&SC_MatrixLoadIdentity_3x3, true },
+    { "_Z20rsMatrixLoadIdentityP12rs_matrix2x2", (void *)&SC_MatrixLoadIdentity_2x2, true },
+
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PKf", (void *)&SC_MatrixLoad_4x4_f, true },
+    { "_Z12rsMatrixLoadP12rs_matrix3x3PKf", (void *)&SC_MatrixLoad_3x3_f, true },
+    { "_Z12rsMatrixLoadP12rs_matrix2x2PKf", (void *)&SC_MatrixLoad_2x2_f, true },
+
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PKS_", (void *)&SC_MatrixLoad_4x4_4x4, true },
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PK12rs_matrix3x3", (void *)&SC_MatrixLoad_4x4_3x3, true },
+    { "_Z12rsMatrixLoadP12rs_matrix4x4PK12rs_matrix2x2", (void *)&SC_MatrixLoad_4x4_2x2, true },
+    { "_Z12rsMatrixLoadP12rs_matrix3x3PKS_", (void *)&SC_MatrixLoad_3x3_3x3, true },
+    { "_Z12rsMatrixLoadP12rs_matrix2x2PKS_", (void *)&SC_MatrixLoad_2x2_2x2, true },
+
+    { "_Z18rsMatrixLoadRotateP12rs_matrix4x4ffff", (void *)&SC_MatrixLoadRotate, true },
+    { "_Z17rsMatrixLoadScaleP12rs_matrix4x4fff", (void *)&SC_MatrixLoadScale, true },
+    { "_Z21rsMatrixLoadTranslateP12rs_matrix4x4fff", (void *)&SC_MatrixLoadTranslate, true },
+    { "_Z14rsMatrixRotateP12rs_matrix4x4ffff", (void *)&SC_MatrixRotate, true },
+    { "_Z13rsMatrixScaleP12rs_matrix4x4fff", (void *)&SC_MatrixScale, true },
+    { "_Z17rsMatrixTranslateP12rs_matrix4x4fff", (void *)&SC_MatrixTranslate, true },
+
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix4x4PKS_S2_", (void *)&SC_MatrixLoadMultiply_4x4_4x4_4x4, true },
+    { "_Z16rsMatrixMultiplyP12rs_matrix4x4PKS_", (void *)&SC_MatrixMultiply_4x4_4x4, true },
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix3x3PKS_S2_", (void *)&SC_MatrixLoadMultiply_3x3_3x3_3x3, true },
+    { "_Z16rsMatrixMultiplyP12rs_matrix3x3PKS_", (void *)&SC_MatrixMultiply_3x3_3x3, true },
+    { "_Z20rsMatrixLoadMultiplyP12rs_matrix2x2PKS_S2_", (void *)&SC_MatrixLoadMultiply_2x2_2x2_2x2, true },
+    { "_Z16rsMatrixMultiplyP12rs_matrix2x2PKS_", (void *)&SC_MatrixMultiply_2x2_2x2, true },
+
+    { "_Z17rsMatrixLoadOrthoP12rs_matrix4x4ffffff", (void *)&SC_MatrixLoadOrtho, true },
+    { "_Z19rsMatrixLoadFrustumP12rs_matrix4x4ffffff", (void *)&SC_MatrixLoadFrustum, true },
+    { "_Z23rsMatrixLoadPerspectiveP12rs_matrix4x4ffff", (void *)&SC_MatrixLoadPerspective, true },
+
+    { "_Z15rsMatrixInverseP12rs_matrix4x4", (void *)&SC_MatrixInverse_4x4, true },
+    { "_Z24rsMatrixInverseTransposeP12rs_matrix4x4", (void *)&SC_MatrixInverseTranspose_4x4, true },
+    { "_Z17rsMatrixTransposeP12rs_matrix4x4", (void *)&SC_MatrixTranspose_4x4, true },
+    { "_Z17rsMatrixTransposeP12rs_matrix3x3", (void *)&SC_MatrixTranspose_3x3, true },
+    { "_Z17rsMatrixTransposeP12rs_matrix2x2", (void *)&SC_MatrixTranspose_2x2, true },
+
+    // RS Math
+    { "_Z6rsRandi", (void *)&SC_randi, true },
+    { "_Z6rsRandii", (void *)&SC_randi2, true },
+    { "_Z6rsRandf", (void *)&SC_randf, true },
+    { "_Z6rsRandff", (void *)&SC_randf2, true },
+    { "_Z6rsFracf", (void *)&SC_frac, true },
+
+    // Atomics
+    // The unsigned (PVj) overloads reuse the int32_t implementations, except
+    // min and max, which have dedicated unsigned versions.
+    { "_Z11rsAtomicIncPVi", (void *)&SC_AtomicInc, true },
+    { "_Z11rsAtomicIncPVj", (void *)&SC_AtomicInc, true },
+    { "_Z11rsAtomicDecPVi", (void *)&SC_AtomicDec, true },
+    { "_Z11rsAtomicDecPVj", (void *)&SC_AtomicDec, true },
+    { "_Z11rsAtomicAddPVii", (void *)&SC_AtomicAdd, true },
+    { "_Z11rsAtomicAddPVjj", (void *)&SC_AtomicAdd, true },
+    { "_Z11rsAtomicSubPVii", (void *)&SC_AtomicSub, true },
+    { "_Z11rsAtomicSubPVjj", (void *)&SC_AtomicSub, true },
+    { "_Z11rsAtomicAndPVii", (void *)&SC_AtomicAnd, true },
+    { "_Z11rsAtomicAndPVjj", (void *)&SC_AtomicAnd, true },
+    { "_Z10rsAtomicOrPVii", (void *)&SC_AtomicOr, true },
+    { "_Z10rsAtomicOrPVjj", (void *)&SC_AtomicOr, true },
+    { "_Z11rsAtomicXorPVii", (void *)&SC_AtomicXor, true },
+    { "_Z11rsAtomicXorPVjj", (void *)&SC_AtomicXor, true },
+    { "_Z11rsAtomicMinPVii", (void *)&SC_AtomicMin, true },
+    { "_Z11rsAtomicMinPVjj", (void *)&SC_AtomicUMin, true },
+    { "_Z11rsAtomicMaxPVii", (void *)&SC_AtomicMax, true },
+    { "_Z11rsAtomicMaxPVjj", (void *)&SC_AtomicUMax, true },
+    { "_Z11rsAtomicCasPViii", (void *)&SC_AtomicCas, true },
+    { "_Z11rsAtomicCasPVjjj", (void *)&SC_AtomicCas, true },
+
+    // terminator: lookupSymbolMath stops here
+    { NULL, NULL, false }
+};
+
+// Looks up a mangled symbol name in gSyms with a linear scan.
+// Returns the matching table entry, or NULL when the name is not listed.
+const RsdCpuReference::CpuSymbol * RsdCpuScriptImpl::lookupSymbolMath(const char *sym) {
+    // The table ends with an entry whose fnPtr is NULL.
+    for (const RsdCpuReference::CpuSymbol *entry = gSyms; entry->fnPtr; entry++) {
+        if (strcmp(entry->name, sym) == 0) {
+            return entry;
+        }
+    }
+    return NULL;
+}
+
diff --git a/cpu_ref/rsCpuRuntimeStubs.cpp b/cpu_ref/rsCpuRuntimeStubs.cpp
new file mode 100644
index 0000000..b87a639
--- /dev/null
+++ b/cpu_ref/rsCpuRuntimeStubs.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsContext.h"
+#include "rsScriptC.h"
+#include "rsMatrix4x4.h"
+#include "rsMatrix3x3.h"
+#include "rsMatrix2x2.h"
+#include "rsRuntime.h"
+
+#include "utils/Timers.h"
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+
+#include <time.h>
+
+using namespace android;
+using namespace android::renderscript;
+
+// Host-side vector types with 2, 3 and 4 elements for each scalar type, declared
+// with the ext_vector_type attribute. The debug helpers in this file take them
+// by value and read their elements as .x/.y/.z/.w.
+// Note that the "long"/"ulong" vectors are built from (unsigned) long long.
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+typedef unsigned char uchar3 __attribute__((ext_vector_type(3)));
+typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+typedef unsigned short ushort3 __attribute__((ext_vector_type(3)));
+typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+typedef int32_t int2 __attribute__((ext_vector_type(2)));
+typedef int32_t int3 __attribute__((ext_vector_type(3)));
+typedef int32_t int4 __attribute__((ext_vector_type(4)));
+typedef uint32_t uint2 __attribute__((ext_vector_type(2)));
+typedef uint32_t uint3 __attribute__((ext_vector_type(3)));
+typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
+typedef long long long2 __attribute__((ext_vector_type(2)));
+typedef long long long3 __attribute__((ext_vector_type(3)));
+typedef long long long4 __attribute__((ext_vector_type(4)));
+typedef unsigned long long ulong2 __attribute__((ext_vector_type(2)));
+typedef unsigned long long ulong3 __attribute__((ext_vector_type(3)));
+typedef unsigned long long ulong4 __attribute__((ext_vector_type(4)));
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Message routines
+//////////////////////////////////////////////////////////////////////////////
+
+
+// Integer division and remainder helpers. In all four, b must be non-zero.
+// NOTE(review): the names mirror the compiler runtime helpers __divsi3 and
+// friends — presumably these are bound to those symbols elsewhere; confirm.
+
+// Signed quotient (truncates towards zero).
+int SC_divsi3(int a, int b) {
+    const int quotient = a / b;
+    return quotient;
+}
+
+// Signed remainder.
+int SC_modsi3(int a, int b) {
+    const int remainder = a % b;
+    return remainder;
+}
+
+// Unsigned quotient.
+unsigned int SC_udivsi3(unsigned int a, unsigned int b) {
+    const unsigned int quotient = a / b;
+    return quotient;
+}
+
+// Unsigned remainder.
+unsigned int SC_umodsi3(unsigned int a, unsigned int b) {
+    const unsigned int remainder = a % b;
+    return remainder;
+}
+
+// Logs a label followed by a float, both as a decimal value and as its raw
+// 32-bit pattern in hex.
+static void SC_debugF(const char *s, float f) {
+    // Copy the bytes instead of dereferencing an int* aimed at a float: the
+    // pointer cast breaks the strict-aliasing rule and may be miscompiled.
+    int bits;
+    __builtin_memcpy(&bits, &f, sizeof(bits));
+    ALOGD("%s %f, 0x%08x", s, f, bits);
+}
+// Float vector loggers. The Fv* variants receive the elements as separate
+// scalar arguments; the F2/F3/F4 variants receive a vector by value. Both
+// print the label followed by the elements in braces.
+static void SC_debugFv2(const char *s, float f1, float f2) {
+    ALOGD("%s {%f, %f}", s, f1, f2);
+}
+static void SC_debugFv3(const char *s, float f1, float f2, float f3) {
+    ALOGD("%s {%f, %f, %f}", s, f1, f2, f3);
+}
+static void SC_debugFv4(const char *s, float f1, float f2, float f3, float f4) {
+    ALOGD("%s {%f, %f, %f, %f}", s, f1, f2, f3, f4);
+}
+static void SC_debugF2(const char *s, float2 f) {
+    ALOGD("%s {%f, %f}", s, f.x, f.y);
+}
+static void SC_debugF3(const char *s, float3 f) {
+    ALOGD("%s {%f, %f, %f}", s, f.x, f.y, f.z);
+}
+static void SC_debugF4(const char *s, float4 f) {
+    ALOGD("%s {%f, %f, %f, %f}", s, f.x, f.y, f.z, f.w);
+}
+// Logs a label followed by a double, both as a decimal value and as its raw
+// 64-bit pattern in hex.
+static void SC_debugD(const char *s, double d) {
+    // Copy the bytes instead of dereferencing a long long* aimed at a double:
+    // the pointer cast breaks the strict-aliasing rule and may be miscompiled.
+    long long bits;
+    __builtin_memcpy(&bits, &d, sizeof(bits));
+    ALOGD("%s %f, 0x%08llx", s, d, bits);
+}
+// Matrix loggers: f points at 16, 9 or 4 floats. Each log line prints every
+// N-th element (f[0], f[4], f[8], f[12] on the first line of the 4x4 case), so
+// consecutive elements of f end up stacked vertically in the output.
+// NOTE(review): this looks like a column-major matrix printed row by row —
+// confirm against the rs_matrix layout.
+static void SC_debugFM4v4(const char *s, const float *f) {
+    ALOGD("%s {%f, %f, %f, %f", s, f[0], f[4], f[8], f[12]);
+    ALOGD("%s  %f, %f, %f, %f", s, f[1], f[5], f[9], f[13]);
+    ALOGD("%s  %f, %f, %f, %f", s, f[2], f[6], f[10], f[14]);
+    ALOGD("%s  %f, %f, %f, %f}", s, f[3], f[7], f[11], f[15]);
+}
+static void SC_debugFM3v3(const char *s, const float *f) {
+    ALOGD("%s {%f, %f, %f", s, f[0], f[3], f[6]);
+    ALOGD("%s  %f, %f, %f", s, f[1], f[4], f[7]);
+    ALOGD("%s  %f, %f, %f}",s, f[2], f[5], f[8]);
+}
+static void SC_debugFM2v2(const char *s, const float *f) {
+    ALOGD("%s {%f, %f", s, f[0], f[2]);
+    ALOGD("%s  %f, %f}",s, f[1], f[3]);
+}
+// 8-bit loggers: every value is printed twice, first in decimal and then in
+// hex. The signed variants cast to unsigned char for the hex column.
+static void SC_debugI8(const char *s, char c) {
+    ALOGD("%s %hhd  0x%hhx", s, c, (unsigned char)c);
+}
+static void SC_debugC2(const char *s, char2 c) {
+    ALOGD("%s {%hhd, %hhd}  0x%hhx 0x%hhx", s, c.x, c.y, (unsigned char)c.x, (unsigned char)c.y);
+}
+static void SC_debugC3(const char *s, char3 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z);
+}
+static void SC_debugC4(const char *s, char4 c) {
+    ALOGD("%s {%hhd, %hhd, %hhd, %hhd}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, (unsigned char)c.x, (unsigned char)c.y, (unsigned char)c.z, (unsigned char)c.w);
+}
+// Unsigned 8-bit variants.
+static void SC_debugU8(const char *s, unsigned char c) {
+    ALOGD("%s %hhu  0x%hhx", s, c, c);
+}
+static void SC_debugUC2(const char *s, uchar2 c) {
+    ALOGD("%s {%hhu, %hhu}  0x%hhx 0x%hhx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUC3(const char *s, uchar3 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUC4(const char *s, uchar4 c) {
+    ALOGD("%s {%hhu, %hhu, %hhu, %hhu}  0x%hhx 0x%hhx 0x%hhx 0x%hhx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+// 16-bit loggers: every value is printed twice, first in decimal and then in hex.
+static void SC_debugI16(const char *s, short c) {
+    ALOGD("%s %hd  0x%hx", s, c, c);
+}
+static void SC_debugS2(const char *s, short2 c) {
+    ALOGD("%s {%hd, %hd}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugS3(const char *s, short3 c) {
+    ALOGD("%s {%hd, %hd, %hd}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugS4(const char *s, short4 c) {
+    ALOGD("%s {%hd, %hd, %hd, %hd}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+// Unsigned 16-bit variants.
+static void SC_debugU16(const char *s, unsigned short c) {
+    ALOGD("%s %hu  0x%hx", s, c, c);
+}
+static void SC_debugUS2(const char *s, ushort2 c) {
+    ALOGD("%s {%hu, %hu}  0x%hx 0x%hx", s, c.x, c.y, c.x, c.y);
+}
+static void SC_debugUS3(const char *s, ushort3 c) {
+    ALOGD("%s {%hu, %hu, %hu}  0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.x, c.y, c.z);
+}
+static void SC_debugUS4(const char *s, ushort4 c) {
+    ALOGD("%s {%hu, %hu, %hu, %hu}  0x%hx 0x%hx 0x%hx 0x%hx", s, c.x, c.y, c.z, c.w, c.x, c.y, c.z, c.w);
+}
+static void SC_debugI32(const char *s, int32_t i) {  // rsDebug(label, int): value in decimal, then in hex
+    ALOGD("%s %d  0x%x", s, i, i);
+}
+static void SC_debugI2(const char *s, int2 i) {  // rsDebug(label, int2): components in decimal, then in hex
+    ALOGD("%s {%d, %d}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugI3(const char *s, int3 i) {  // rsDebug(label, int3): components in decimal, then in hex
+    ALOGD("%s {%d, %d, %d}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugI4(const char *s, int4 i) {  // rsDebug(label, int4): components in decimal, then in hex
+    ALOGD("%s {%d, %d, %d, %d}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
+}
+static void SC_debugU32(const char *s, uint32_t i) {  // rsDebug(label, unsigned int): value in decimal, then in hex
+    ALOGD("%s %u  0x%x", s, i, i);
+}
+static void SC_debugUI2(const char *s, uint2 i) {  // rsDebug(label, uint2): components in decimal, then in hex
+    ALOGD("%s {%u, %u}  0x%x 0x%x", s, i.x, i.y, i.x, i.y);
+}
+static void SC_debugUI3(const char *s, uint3 i) {  // rsDebug(label, uint3): components in decimal, then in hex
+    ALOGD("%s {%u, %u, %u}  0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.x, i.y, i.z);
+}
+static void SC_debugUI4(const char *s, uint4 i) {  // rsDebug(label, uint4): components in decimal, then in hex
+    ALOGD("%s {%u, %u, %u, %u}  0x%x 0x%x 0x%x 0x%x", s, i.x, i.y, i.z, i.w, i.x, i.y, i.z, i.w);
+}
+static void SC_debugLL64(const char *s, long long ll) {  // gSyms binds both the "l" and "x" rsDebug manglings here
+    ALOGD("%s %lld  0x%llx", s, ll, ll);
+}
+static void SC_debugL2(const char *s, long2 ll) {  // 2-component variant; also bound for both "l" and "x"
+    ALOGD("%s {%lld, %lld}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugL3(const char *s, long3 ll) {  // 3-component variant; also bound for both "l" and "x"
+    ALOGD("%s {%lld, %lld, %lld}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugL4(const char *s, long4 ll) {  // 4-component variant; also bound for both "l" and "x"
+    ALOGD("%s {%lld, %lld, %lld, %lld}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
+static void SC_debugULL64(const char *s, unsigned long long ll) {  // gSyms binds both the "m" and "y" rsDebug manglings here
+    ALOGD("%s %llu  0x%llx", s, ll, ll);
+}
+static void SC_debugUL2(const char *s, ulong2 ll) {  // 2-component variant; also bound for both "m" and "y"
+    ALOGD("%s {%llu, %llu}  0x%llx 0x%llx", s, ll.x, ll.y, ll.x, ll.y);
+}
+static void SC_debugUL3(const char *s, ulong3 ll) {  // 3-component variant; also bound for both "m" and "y"
+    ALOGD("%s {%llu, %llu, %llu}  0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.x, ll.y, ll.z);
+}
+static void SC_debugUL4(const char *s, ulong4 ll) {  // 4-component variant; also bound for both "m" and "y"
+    ALOGD("%s {%llu, %llu, %llu, %llu}  0x%llx 0x%llx 0x%llx 0x%llx", s, ll.x, ll.y, ll.z, ll.w, ll.x, ll.y, ll.z, ll.w);
+}
+static void SC_debugP(const char *s, const void *p) {  // rsDebug(label, const void *): logs the pointer with %p
+    ALOGD("%s %p", s, p);
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+// Stub implementation
+//////////////////////////////////////////////////////////////////////////////
+
+// llvm name mangling ref
+//  <builtin-type> ::= v  # void
+//                 ::= b  # bool
+//                 ::= c  # char
+//                 ::= a  # signed char
+//                 ::= h  # unsigned char
+//                 ::= s  # short
+//                 ::= t  # unsigned short
+//                 ::= i  # int
+//                 ::= j  # unsigned int
+//                 ::= l  # long
+//                 ::= m  # unsigned long
+//                 ::= x  # long long, __int64
+//                 ::= y  # unsigned long long, unsigned __int64
+//                 ::= f  # float
+//                 ::= d  # double
+
+static RsdCpuReference::CpuSymbol gSyms[] = {  // each entry: { symbol name, host function, threadable }
+    { "memset", (void *)&memset, true },
+    { "memcpy", (void *)&memcpy, true },
+
+    // Debug: every mangled rsDebug(const char *, T) overload maps to its SC_debug* logger.
+    { "_Z7rsDebugPKcf", (void *)&SC_debugF, true },
+    { "_Z7rsDebugPKcff", (void *)&SC_debugFv2, true },
+    { "_Z7rsDebugPKcfff", (void *)&SC_debugFv3, true },
+    { "_Z7rsDebugPKcffff", (void *)&SC_debugFv4, true },
+    { "_Z7rsDebugPKcDv2_f", (void *)&SC_debugF2, true },
+    { "_Z7rsDebugPKcDv3_f", (void *)&SC_debugF3, true },
+    { "_Z7rsDebugPKcDv4_f", (void *)&SC_debugF4, true },
+    { "_Z7rsDebugPKcd", (void *)&SC_debugD, true },
+    { "_Z7rsDebugPKcPK12rs_matrix4x4", (void *)&SC_debugFM4v4, true },
+    { "_Z7rsDebugPKcPK12rs_matrix3x3", (void *)&SC_debugFM3v3, true },
+    { "_Z7rsDebugPKcPK12rs_matrix2x2", (void *)&SC_debugFM2v2, true },
+    { "_Z7rsDebugPKcc", (void *)&SC_debugI8, true },
+    { "_Z7rsDebugPKcDv2_c", (void *)&SC_debugC2, true },
+    { "_Z7rsDebugPKcDv3_c", (void *)&SC_debugC3, true },
+    { "_Z7rsDebugPKcDv4_c", (void *)&SC_debugC4, true },
+    { "_Z7rsDebugPKch", (void *)&SC_debugU8, true },
+    { "_Z7rsDebugPKcDv2_h", (void *)&SC_debugUC2, true },
+    { "_Z7rsDebugPKcDv3_h", (void *)&SC_debugUC3, true },
+    { "_Z7rsDebugPKcDv4_h", (void *)&SC_debugUC4, true },
+    { "_Z7rsDebugPKcs", (void *)&SC_debugI16, true },
+    { "_Z7rsDebugPKcDv2_s", (void *)&SC_debugS2, true },
+    { "_Z7rsDebugPKcDv3_s", (void *)&SC_debugS3, true },
+    { "_Z7rsDebugPKcDv4_s", (void *)&SC_debugS4, true },
+    { "_Z7rsDebugPKct", (void *)&SC_debugU16, true },
+    { "_Z7rsDebugPKcDv2_t", (void *)&SC_debugUS2, true },
+    { "_Z7rsDebugPKcDv3_t", (void *)&SC_debugUS3, true },
+    { "_Z7rsDebugPKcDv4_t", (void *)&SC_debugUS4, true },
+    { "_Z7rsDebugPKci", (void *)&SC_debugI32, true },
+    { "_Z7rsDebugPKcDv2_i", (void *)&SC_debugI2, true },
+    { "_Z7rsDebugPKcDv3_i", (void *)&SC_debugI3, true },
+    { "_Z7rsDebugPKcDv4_i", (void *)&SC_debugI4, true },
+    { "_Z7rsDebugPKcj", (void *)&SC_debugU32, true },
+    { "_Z7rsDebugPKcDv2_j", (void *)&SC_debugUI2, true },
+    { "_Z7rsDebugPKcDv3_j", (void *)&SC_debugUI3, true },
+    { "_Z7rsDebugPKcDv4_j", (void *)&SC_debugUI4, true },
+    // Both "long" and "unsigned long" need to be redirected to their
+    // 64-bit counterparts, since we have hacked Slang to use 64-bit
+    // for "long" on Arm (to be similar to Java).
+    { "_Z7rsDebugPKcl", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_l", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_l", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_l", (void *)&SC_debugL4, true },
+    { "_Z7rsDebugPKcm", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_m", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_m", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_m", (void *)&SC_debugUL4, true },
+    { "_Z7rsDebugPKcx", (void *)&SC_debugLL64, true },
+    { "_Z7rsDebugPKcDv2_x", (void *)&SC_debugL2, true },
+    { "_Z7rsDebugPKcDv3_x", (void *)&SC_debugL3, true },
+    { "_Z7rsDebugPKcDv4_x", (void *)&SC_debugL4, true },
+    { "_Z7rsDebugPKcy", (void *)&SC_debugULL64, true },
+    { "_Z7rsDebugPKcDv2_y", (void *)&SC_debugUL2, true },
+    { "_Z7rsDebugPKcDv3_y", (void *)&SC_debugUL3, true },
+    { "_Z7rsDebugPKcDv4_y", (void *)&SC_debugUL4, true },
+    { "_Z7rsDebugPKcPKv", (void *)&SC_debugP, true },
+    // Terminator: lookupRuntimeStub() scans until it reaches an entry whose fnPtr is NULL.
+    { NULL, NULL, false }
+};
+
+
+// Returns the host address for runtime symbol `name`, or NULL (after logging)
+// if it is unknown. pContext is the RsdCpuScriptImpl doing the lookup. Sources
+// are tried in order: the context's symLookup(), lookupSymbolMath(), gSyms.
+void * RsdCpuScriptImpl::lookupRuntimeStub(void* pContext, char const* name) {
+    RsdCpuScriptImpl *s = (RsdCpuScriptImpl *)pContext;
+    const RsdCpuReference::CpuSymbol *sym = s->mCtx->symLookup(name);
+    if (!sym) {
+        sym = s->lookupSymbolMath(name);
+    }
+    if (!sym) {
+        // Stop at the first match instead of walking the rest of the table.
+        for (const RsdCpuReference::CpuSymbol *it = gSyms; it->fnPtr; it++) {
+            if (!strcmp(it->name, name)) {
+                sym = it;
+                break;
+            }
+        }
+    }
+    if (sym) {
+        // A single non-threadable symbol makes the whole script non-threadable.
+        s->mIsThreadable &= sym->threadable;
+        return sym->fnPtr;
+    }
+    ALOGE("ScriptC sym lookup failed for %s", name);
+    return NULL;
+}
+
+
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
new file mode 100644
index 0000000..06ce4bb
--- /dev/null
+++ b/cpu_ref/rsCpuScript.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#include "rsCpuCore.h"
+
+#include "rsCpuScript.h"
+//#include "rsdRuntime.h"
+//#include "rsdAllocation.h"
+//#include "rsCpuIntrinsics.h"
+
+
+#include "utils/Vector.h"
+#include "utils/Timers.h"
+#include "utils/StopWatch.h"
+
+
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
+
+namespace android {
+namespace renderscript {
+
+
+
+// Records the owning CPU context and the runtime Script. Entry points,
+// compiler objects and the binding table start out NULL, and the script is
+// treated as threadable until a symbol lookup clears the flag.
+RsdCpuScriptImpl::RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s)
+    : mCtx(ctx),
+      mScript(s),
+      mRoot(NULL),
+      mRootExpand(NULL),
+      mInit(NULL),
+      mFreeChildren(NULL),
+      mCompilerContext(NULL),
+      mCompilerDriver(NULL),
+      mExecutable(NULL),
+      mBoundAllocs(NULL),
+      mIntrinsicData(NULL),
+      mIsThreadable(true) {
+}
+
+
+// Builds the script's executable from bitcode with bcc and caches its entry
+// points. The context mutex is held for the whole build. Returns false (after
+// logging) if the compiler context, driver or executable cannot be created.
+bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir,
+                            uint8_t const *bitcode, size_t bitcodeSize,
+                            uint32_t flags) {
+    mCtx->lockMutex();
+
+    // Start from a clean slate before allocating the compiler objects.
+    mCompilerContext = NULL;
+    mCompilerDriver = NULL;
+    mExecutable = NULL;
+
+    mCompilerContext = new bcc::BCCContext();
+    if (!mCompilerContext) {
+        ALOGE("bcc: FAILS to create compiler context (out of memory)");
+        mCtx->unlockMutex();
+        return false;
+    }
+
+    mCompilerDriver = new bcc::RSCompilerDriver();
+    if (!mCompilerDriver) {
+        ALOGE("bcc: FAILS to create compiler driver (out of memory)");
+        mCtx->unlockMutex();
+        return false;
+    }
+
+    // The driver is given lookupRuntimeStub as its runtime lookup function,
+    // with this object as the lookup context.
+    mCompilerDriver->setRSRuntimeLookupFunction(lookupRuntimeStub);
+    mCompilerDriver->setRSRuntimeLookupContext(this);
+
+    bcc::RSExecutable *built =
+        mCompilerDriver->build(*mCompilerContext, cacheDir, resName,
+                               reinterpret_cast<const char *>(bitcode),
+                               bitcodeSize, NULL);
+    if (!built) {
+        ALOGE("bcc: FAILS to prepare executable for '%s'", resName);
+        mCtx->unlockMutex();
+        return false;
+    }
+    mExecutable = built;
+
+    built->setThreadable(mIsThreadable);
+    if (!built->syncInfo()) {
+        ALOGW("bcc: FAILS to synchronize the RS info file to the disk");
+    }
+
+    typedef int (*IntFn)();
+    typedef void (*VoidFn)();
+    mRoot = reinterpret_cast<IntFn>(built->getSymbolAddress("root"));
+    mRootExpand = reinterpret_cast<IntFn>(built->getSymbolAddress("root.expand"));
+    mInit = reinterpret_cast<VoidFn>(built->getSymbolAddress("init"));
+    mFreeChildren = reinterpret_cast<VoidFn>(built->getSymbolAddress(".rs.dtor"));
+
+    // One bound-allocation slot per exported variable, all initially NULL.
+    const size_t varCount = mExecutable->getInfo().getExportVarNames().size();
+    if (varCount) {
+        mBoundAllocs = new Allocation *[varCount];
+        memset(mBoundAllocs, 0, sizeof(void *) * varCount);
+    }
+
+    mCtx->unlockMutex();
+    return true;
+}
+
+// Publishes the executable's export counts, pragma tables and root entry
+// point into the HAL info of the runtime-side Script.
+void RsdCpuScriptImpl::populateScript(Script *script) {
+    const bcc::RSInfo &rsInfo = mExecutable->getInfo();
+
+    // Copy info over to runtime
+    script->mHal.info.exportedFunctionCount = rsInfo.getExportFuncNames().size();
+    script->mHal.info.exportedVariableCount = rsInfo.getExportVarNames().size();
+    script->mHal.info.exportedPragmaCount = rsInfo.getPragmas().size();
+
+    script->mHal.info.exportedPragmaKeyList =
+        const_cast<const char**>(mExecutable->getPragmaKeys().array());
+    script->mHal.info.exportedPragmaValueList =
+        const_cast<const char**>(mExecutable->getPragmaValues().array());
+
+    // The expanded root takes precedence over the plain root when present.
+    script->mHal.info.root = mRootExpand ? mRootExpand : mRoot;
+}
+
+/*
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+    pthread_mutex_lock(&rsdgInitMutex);
+
+    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
+    if (drv == NULL) {
+        goto error;
+    }
+    s->mHal.drv = drv;
+    drv->mIntrinsicID = iid;
+    drv->mIntrinsicData = rsdIntrinsic_Init(rsc, s, iid, &drv->mIntrinsicFuncs);
+    s->mHal.info.isThreadable = true;
+
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return true;
+
+error:
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return false;
+}
+*/
+
+typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);  // NOTE(review): no use of rs_t is visible in the surrounding code -- confirm before relying on it
+
+// Fills in *mtls for a forEach launch over ain/aout: iteration bounds
+// (optionally clipped by sc), user data, and each allocation's base pointer
+// and strides. *mtls is zeroed first; early returns leave it partly filled.
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+
+    // Dimensions come from the input when there is one, else from the output.
+    const Allocation *dimSrc = ain ? ain : aout;
+    if (!dimSrc) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+    mtls->fep.dimX = dimSrc->getType()->getDimX();
+    mtls->fep.dimY = dimSrc->getType()->getDimY();
+    mtls->fep.dimZ = dimSrc->getType()->getDimZ();
+
+    // X range: the full width unless sc supplies a non-zero xEnd.
+    if (sc && (sc->xEnd != 0)) {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;
+    } else {
+        mtls->xEnd = mtls->fep.dimX;
+    }
+
+    // Y range: same rule, keyed on sc->yEnd.
+    if (sc && (sc->yEnd != 0)) {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    } else {
+        mtls->yEnd = mtls->fep.dimY;
+    }
+
+    // Every end bound is raised to at least 1.
+    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
+    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+
+    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
+
+    mtls->rsc = mCtx;
+    mtls->ain = ain;
+    mtls->aout = aout;
+    mtls->fep.usr = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum = 0;
+    mtls->isThreadable = mIsThreadable;
+
+    // Input, then output: NULL pointer and zero element stride when absent.
+    mtls->fep.ptrIn = NULL;
+    mtls->fep.eStrideIn = 0;
+    if (ain) {
+        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
+
+// Runs forEach kernel `slot` over ain/aout: builds the launch description and
+// launches it with this script installed as the TLS script for the duration.
+void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
+                                     const Allocation * ain,
+                                     Allocation * aout,
+                                     const void * usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall *sc) {
+    MTLaunchStruct launch;
+    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &launch);
+    forEachKernelSetup(slot, &launch);
+    RsdCpuScriptImpl * const previous = mCtx->setTLS(this);
+    mCtx->launchThreads(ain, aout, sc, &launch);
+    mCtx->setTLS(previous);
+}
+
+void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
+    // Points *mtls at this script and at exported forEach kernel `slot`.
+    mtls->script = this;
+    mtls->fep.slot = slot;
+    // The kernel address and its signature both come from the executable.
+    rsAssert(slot < mExecutable->getExportForeachFuncAddrs().size());
+    mtls->kernel = reinterpret_cast<ForEachFunc_t>(
+                      mExecutable->getExportForeachFuncAddrs()[slot]);
+    rsAssert(mtls->kernel != NULL);
+    mtls->sig = mExecutable->getInfo().getExportForeachFuncs()[slot].second;
+}
+// Runs root() with this script as the TLS script; a script without a root yields 0.
+int RsdCpuScriptImpl::invokeRoot() {
+    if (!mRoot) return 0;  // same NULL guard as invokeInit()/invokeFreeChildren()
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    int ret = mRoot();
+    mCtx->setTLS(oldTLS);
+    return ret;
+}
+void RsdCpuScriptImpl::invokeInit() {
+    // No "init" entry point was stored: nothing to run.
+    if (!mInit) return;
+    mInit();
+}
+
+void RsdCpuScriptImpl::invokeFreeChildren() {
+    // No ".rs.dtor" entry point was stored: nothing to run.
+    if (!mFreeChildren) return;
+    mFreeChildren();
+}
+
+// Calls exported function `slot` with (params, paramLength), TLS set to this script.
+void RsdCpuScriptImpl::invokeFunction(uint32_t slot, const void *params,
+                                      size_t paramLength) {
+    typedef void (*InvokeFn)(const void *, uint32_t);
+    RsdCpuScriptImpl * const previous = mCtx->setTLS(this);
+    InvokeFn target = reinterpret_cast<InvokeFn>(mExecutable->getExportFuncAddrs()[slot]);
+    target(params, paramLength);
+    mCtx->setTLS(previous);
+}
+
+// Overwrites exported global variable `slot` with a raw byte copy.
+//
+//   slot       - index into the executable's exported-variable addresses
+//   data       - source bytes
+//   dataLength - number of bytes copied from data
+//
+// A slot whose address is NULL is skipped without copying anything.
+void RsdCpuScriptImpl::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
+    void *dest = mExecutable->getExportVarAddrs()[slot];
+
+    if (dest == NULL) {
+        // No storage behind this slot.
+        return;
+    }
+
+    // Plain memcpy: no reference counting is performed on this path.
+    memcpy(dest, data, dataLength);
+}
+
+// Replaces exported array variable `slot` with dataLength bytes from data.
+// For the single supported dimension, elem->incRefs() is applied to each of
+// the dims[0] incoming elements and elem->decRefs() to each element being
+// overwritten. NOTE(review): dimLength is divided by sizeof(int) although
+// dims is a size_t array -- confirm the caller's units where these differ.
+void RsdCpuScriptImpl::setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                                const Element *elem,
+                                                const size_t *dims, size_t dimLength) {
+    char *dest = reinterpret_cast<char *>(mExecutable->getExportVarAddrs()[slot]);
+    if (!dest) {
+        return;
+    }
+
+    // dimLength arrives in bytes; convert it to a count of integer dimensions.
+    dimLength /= sizeof(int);
+
+    // Only a single dimension is currently supported.
+    rsAssert(dimLength == 1);
+    if (dimLength == 1) {
+        const size_t stride = elem->getSizeBytes();
+        const size_t count = dims[0];
+        // Take references on the incoming values first ...
+        const char *incoming = reinterpret_cast<const char *>(data);
+        for (size_t i = 0; i < count; i++, incoming += stride) {
+            elem->incRefs(incoming);
+        }
+
+        // ... and only then drop the ones held by the values being replaced.
+        char *outgoing = dest;
+        for (size_t i = 0; i < count; i++, outgoing += stride) {
+            elem->decRefs(outgoing);
+        }
+    }
+
+    memcpy(dest, data, dataLength);
+}
+
+// Binds allocation `data` (or nothing, when data is NULL) to exported pointer
+// variable `slot`: the allocation is remembered in mBoundAllocs and its lod[0]
+// base pointer is written into the script variable.
+void RsdCpuScriptImpl::setGlobalBind(uint32_t slot, Allocation *data) {
+    void *dest = mExecutable->getExportVarAddrs()[slot];
+    if (dest == NULL) {
+        // No storage behind this slot; the binding is not recorded either.
+        return;
+    }
+    // Remembered so getAllocationForPointer() can map the pointer back.
+    mBoundAllocs[slot] = data;
+
+    // With no allocation the script variable is set to a NULL pointer.
+    void *base = NULL;
+    if (data != NULL) {
+        base = data->mHal.drvState.lod[0].mallocPtr;
+    }
+    memcpy(dest, &base, sizeof(void *));
+}
+
+// Assigns an object to an exported script variable.
+//
+//   slot - index into the executable's exported-variable addresses
+//   data - object to store; handed to rsrSetObject() unchanged
+//
+// A slot whose address is NULL is skipped without any effect.
+// The context passed to rsrSetObject() is the one owned by mCtx.
+void RsdCpuScriptImpl::setGlobalObj(uint32_t slot, ObjectBase *data) {
+    ObjectBase **dest = reinterpret_cast<ObjectBase **>(
+        mExecutable->getExportVarAddrs()[slot]);
+
+    if (dest == NULL) {
+        // No storage behind this slot.
+        return;
+    }
+
+    // The store goes through rsrSetObject() rather than a raw pointer write.
+    rsrSetObject(mCtx->getContext(), dest, data);
+}
+
+// Releases every object still stored in the executable's exported object
+// slots, then destroys the compiler context, compiler driver, executable and
+// bound-allocation table. The object pass is skipped when there is no
+// executable.
+RsdCpuScriptImpl::~RsdCpuScriptImpl() {
+    if (mExecutable) {
+        // Exported-variable addresses and object-slot flags are parallel lists.
+        typedef Vector<void *>::const_iterator AddrIter;
+        typedef bcc::RSInfo::ObjectSlotListTy::const_iterator SlotIter;
+
+        AddrIter addr = mExecutable->getExportVarAddrs().begin();
+        const AddrIter addrEnd = mExecutable->getExportVarAddrs().end();
+
+        SlotIter isObject = mExecutable->getInfo().getObjectSlots().begin();
+        const SlotIter slotEnd = mExecutable->getInfo().getObjectSlots().end();
+
+        // Walk both in lock step, stopping at the end of the shorter one.
+        for (; (addr != addrEnd) && (isObject != slotEnd); ++addr, ++isObject) {
+            if (!*isObject) {
+                // Not an object slot: nothing to release.
+                continue;
+            }
+
+            if (*addr == NULL) {
+                // The field address can be NULL if the script-side has
+                // optimized the corresponding global variable away.
+                continue;
+            }
+
+            ObjectBase **objAddr = reinterpret_cast<ObjectBase **>(*addr);
+            rsrClearObject(mCtx->getContext(), objAddr);
+        }
+    }
+
+    // delete and delete[] are no-ops on NULL, so no guards are needed.
+    // The order (context, driver, executable, bindings) is the same as
+    // before.
+    delete mCompilerContext;
+    delete mCompilerDriver;
+    delete mExecutable;
+    delete[] mBoundAllocs;
+}
+
+// Finds the bound allocation whose lod[0] base pointer equals ptr. Returns
+// NULL for a NULL ptr, and logs an error and returns NULL when none matches.
+Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
+    if (ptr == NULL) {
+        return NULL;
+    }
+    for (uint32_t slot = 0; slot < mScript->mHal.info.exportedVariableCount; slot++) {
+        Allocation *bound = mBoundAllocs[slot];
+        if (bound && (bound->mHal.drvState.lod[0].mallocPtr == ptr)) {
+            return bound;
+        }
+    }
+    ALOGE("rsGetAllocation, failed to find %p", ptr);
+    return NULL;
+}
+
+
+}
+}
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
new file mode 100644
index 0000000..2197a20
--- /dev/null
+++ b/cpu_ref/rsCpuScript.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_BCC_H
+#define RSD_BCC_H
+
+#include <rs_hal.h>
+#include <rsRuntime.h>
+
+#include "rsCpuCore.h"
+
+namespace bcc {
+    class BCCContext;
+    class RSCompilerDriver;
+    class RSExecutable;
+}
+
+namespace android {
+namespace renderscript {
+
+
+// CPU implementation of a compiled script: owns the bcc executable and exposes its entry points and globals.
+class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
+public:
+    typedef void (*outer_foreach_t)(
+        const RsForEachStubParamStruct *,
+        uint32_t x1, uint32_t x2,
+        uint32_t instep, uint32_t outstep);
+    // Compiles the bitcode and caches the script's entry points; returns false on failure.
+    bool init(char const *resName, char const *cacheDir,
+              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags);
+    virtual void populateScript(Script *);
+    // Invocation of the script's entry points and exported functions.
+    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual int invokeRoot();
+    virtual void invokeForEach(uint32_t slot,
+                       const Allocation * ain,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+    virtual void invokeInit();
+    virtual void invokeFreeChildren();
+    // Writers for the script's exported global variables.
+    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                  const Element *e, const size_t *dims, size_t dimLength);
+    virtual void setGlobalBind(uint32_t slot, Allocation *data);
+    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
+
+
+    virtual ~RsdCpuScriptImpl();
+    RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s);
+
+    const Script * getScript() {return mScript;}
+    // Launch setup helpers; also called from CpuScriptGroupImpl::execute().
+    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
+                          const void * usr, uint32_t usrLen,
+                          const RsScriptCall *sc, MTLaunchStruct *mtls);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+
+    // Runtime symbol resolution; init() registers lookupRuntimeStub with the bcc compiler driver.
+    const RsdCpuReference::CpuSymbol * lookupSymbolMath(const char *sym);
+    static void * lookupRuntimeStub(void* pContext, char const* name);
+    // Finds the allocation bound via setGlobalBind() whose base pointer equals ptr.
+    virtual Allocation * getAllocationForPointer(const void *ptr) const;
+
+
+protected:
+    RsdCpuReferenceImpl *mCtx;
+    const Script *mScript;
+    // Entry points looked up by init(); invokeInit()/invokeFreeChildren() skip them when NULL.
+    int (*mRoot)();
+    int (*mRootExpand)();
+    void (*mInit)();
+    void (*mFreeChildren)();
+    // bcc objects created by init() and deleted by the destructor.
+    bcc::BCCContext *mCompilerContext;
+    bcc::RSCompilerDriver *mCompilerDriver;
+    bcc::RSExecutable *mExecutable;
+    // mBoundAllocs is indexed by exported-variable slot (see setGlobalBind()).
+    Allocation **mBoundAllocs;
+    void * mIntrinsicData;
+    bool mIsThreadable;
+
+};
+
+// NOTE(review): declaration only; no definition is visible alongside it here -- confirm one exists.
+Allocation * rsdScriptGetAllocationForPointer(
+                        const Context *dc,
+                        const Script *script,
+                        const void *);
+
+
+
+}
+}
+
+#endif
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
new file mode 100644
index 0000000..765057d
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsCpuScriptGroup.h"
+
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
+
+#include "rsScript.h"
+#include "rsScriptGroup.h"
+#include "rsCpuScriptGroup.h"
+//#include "rsdBcc.h"
+//#include "rsdAllocation.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+// Stores the owning CPU context and the script group; the destructor deletes neither.
+CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg)
+    : mSG(sg), mCtx(ctx) {
+}
+
+CpuScriptGroupImpl::~CpuScriptGroupImpl() {
+    // Nothing to release: mCtx and mSG are not deleted by this class.
+}
+
+bool CpuScriptGroupImpl::init() {  // performs no setup and always reports success
+    return true;
+}
+
+void CpuScriptGroupImpl::setInput(const ScriptKernelID *kid, Allocation *a) {  // no-op: execute() reads inputs from mSG
+}
+
+void CpuScriptGroupImpl::setOutput(const ScriptKernelID *kid, Allocation *a) {  // no-op: execute() reads outputs from mSG
+}
+
+
+// Signature of the kernels that scriptGroupRoot() calls in sequence.
+typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
+                                      uint32_t xstart, uint32_t xend,
+                                      uint32_t instep, uint32_t outstep);
+
+// Kernel installed by execute() for a single-launch group. p->usr carries the
+// ScriptList; for each entry, p's usr/in/out fields are repointed at that
+// kernel's data and the kernel is called with the unchanged x range and
+// steps. p->usr is restored before returning.
+void CpuScriptGroupImpl::scriptGroupRoot(const RsForEachStubParamStruct *p,
+                                         uint32_t xstart, uint32_t xend,
+                                         uint32_t instep, uint32_t outstep) {
+    const ScriptList *list = (const ScriptList *)p->usr;
+    const void *savedUsr = p->usr;
+
+    // The parameter block is rewritten in place for every kernel.
+    RsForEachStubParamStruct *params = (RsForEachStubParamStruct *)p;
+
+    for (size_t idx = 0; idx < list->count; idx++) {
+        params->usr = list->usrPtrs[idx];
+        params->ptrIn = NULL;
+        params->in = NULL;
+        params->ptrOut = NULL;
+        params->out = NULL;
+
+        Allocation *src = list->ins[idx];
+        if (src) {
+            params->ptrIn = (const uint8_t *)src->mHal.drvState.lod[0].mallocPtr;
+            params->in = params->ptrIn;
+            if (list->inExts[idx]) {
+                // Flagged external: use row p->y.
+                params->in = params->ptrIn + src->mHal.drvState.lod[0].stride * p->y;
+            } else if (src->mHal.drvState.lod[0].dimY > p->lid) {
+                // Otherwise use row p->lid when dimY is larger than p->lid.
+                params->in = params->ptrIn + src->mHal.drvState.lod[0].stride * p->lid;
+            }
+        }
+
+        Allocation *dst = list->outs[idx];
+        if (dst) {
+            params->ptrOut = (uint8_t *)dst->mHal.drvState.lod[0].mallocPtr;
+            params->out = params->ptrOut;
+            // Same row selection for the output.
+            if (list->outExts[idx]) {
+                params->out = params->ptrOut + dst->mHal.drvState.lod[0].stride * p->y;
+            } else if (dst->mHal.drvState.lod[0].dimY > p->lid) {
+                params->out = params->ptrOut + dst->mHal.drvState.lod[0].stride * p->lid;
+            }
+        }
+
+        ScriptGroupRootFunc_t kernel = (ScriptGroupRootFunc_t)list->fnPtrs[idx];
+        kernel(p, xstart, xend, instep, outstep);
+    }
+
+    params->usr = savedUsr;
+}
+
+
+
+// Runs the group. Collects, in node order, every kernel whose resolved
+// input/output allocations match what the kernel declares, then launches
+// them one by one (if an output feeds a script field) or as one fused launch.
+void CpuScriptGroupImpl::execute() {
+    Vector<Allocation *> ins;
+    Vector<bool> inExts;
+    Vector<Allocation *> outs;
+    Vector<bool> outExts;
+    Vector<const ScriptKernelID *> kernels;
+    bool fieldDep = false;
+
+    for (size_t ct=0; ct < mSG->mNodes.size(); ct++) {
+        ScriptGroup::Node *n = mSG->mNodes[ct];
+        Script *s = n->mKernels[0]->mScript;
+
+        for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
+            if (n->mInputs[ct2]->mDstField.get() && n->mInputs[ct2]->mDstField->mScript) {
+                s->setVarObj(n->mInputs[ct2]->mDstField->mSlot, n->mInputs[ct2]->mAlloc.get());
+            }
+        }
+
+        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
+            const ScriptKernelID *k = n->mKernels[ct2];
+            Allocation *ain = NULL;
+            Allocation *aout = NULL;
+            bool inExt = false;
+            bool outExt = false;
+
+            // Input: a link inside the group, overridden by a group-level input.
+            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
+                if (n->mInputs[ct3]->mDstKernel.get() == k) {
+                    ain = n->mInputs[ct3]->mAlloc.get();
+                }
+            }
+            for (size_t ct3=0; ct3 < mSG->mInputs.size(); ct3++) {
+                if (mSG->mInputs[ct3]->mKernel == k) {
+                    ain = mSG->mInputs[ct3]->mAlloc.get();
+                    inExt = true;
+                }
+            }
+            // Output: same rule; a link into a script field rules out fusing.
+            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
+                if (n->mOutputs[ct3]->mSource.get() == k) {
+                    aout = n->mOutputs[ct3]->mAlloc.get();
+                    if(n->mOutputs[ct3]->mDstField.get() != NULL) {
+                        fieldDep = true;
+                    }
+                }
+            }
+            for (size_t ct3=0; ct3 < mSG->mOutputs.size(); ct3++) {
+                if (mSG->mOutputs[ct3]->mKernel == k) {
+                    aout = mSG->mOutputs[ct3]->mAlloc.get();
+                    outExt = true;
+                }
+            }
+
+            if ((k->mHasKernelOutput == (aout != NULL)) &&
+                (k->mHasKernelInput == (ain != NULL))) {
+                ins.add(ain);
+                inExts.add(inExt);
+                outs.add(aout);
+                outExts.add(outExt);
+                kernels.add(k);
+            }
+        }
+    }
+
+    // Nothing qualified: kernels[0] / ins[0] below would be out of range.
+    if (kernels.size() == 0) return;
+
+    // The fused path reads fep.usr / fep.usrLen right after forEachKernelSetup(),
+    // which does not write them; give them defined values up front.
+    MTLaunchStruct mtls;
+    mtls.fep.usr = NULL;
+    mtls.fep.usrLen = 0;
+
+    if(fieldDep) {
+        for (size_t ct=0; ct < ins.size(); ct++) {
+            Script *s = kernels[ct]->mScript;
+            RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+            uint32_t slot = kernels[ct]->mSlot;
+            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+            si->forEachKernelSetup(slot, &mtls);
+            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
+        }
+    } else {
+        ScriptList sl;
+        sl.ins = ins.array();
+        sl.outs = outs.array();
+        sl.kernels = kernels.array();
+        sl.count = kernels.size();
+        Vector<const void *> usrPtrs;
+        Vector<const void *> fnPtrs;
+        Vector<uint32_t> sigs;
+        for (size_t ct=0; ct < kernels.size(); ct++) {
+            Script *s = kernels[ct]->mScript;
+            RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+            si->forEachKernelSetup(kernels[ct]->mSlot, &mtls);
+            fnPtrs.add((void *)mtls.kernel);
+            usrPtrs.add(mtls.fep.usr);
+            sigs.add(mtls.fep.usrLen);
+        }
+        sl.sigs = sigs.array();
+        sl.usrPtrs = usrPtrs.array();
+        sl.fnPtrs = fnPtrs.array();
+        sl.inExts = inExts.array();
+        sl.outExts = outExts.array();
+
+        // One launch over the first kernel's allocations; scriptGroupRoot()
+        // then walks the whole list for each call.
+        Script *s = kernels[0]->mScript;
+        RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
+        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
+        mtls.script = NULL;
+        mtls.kernel = (void (*)())&scriptGroupRoot;
+        mtls.fep.usr = &sl;
+        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+    }
+}
+
+void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg) {
+}  // empty body: this function performs no teardown
+
+
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
new file mode 100644
index 0000000..f6fa2ac
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2011-2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_SCRIPT_GROUP_H
+#define RSD_SCRIPT_GROUP_H
+
+#include <rsd_cpu.h>
+
+namespace android {
+namespace renderscript {
+
+
+class CpuScriptGroupImpl : public RsdCpuReference::CpuScriptGroup {  // concrete CpuScriptGroup
+public:
+    virtual void setInput(const ScriptKernelID *kid, Allocation *);   // implements CpuScriptGroup
+    virtual void setOutput(const ScriptKernelID *kid, Allocation *);  // implements CpuScriptGroup
+    virtual void execute();                                           // implements CpuScriptGroup
+    virtual ~CpuScriptGroupImpl();
+    // NOTE(review): looks like two-phase setup (constructor, then init()) -- confirm with callers.
+    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
+    bool init();  // NOTE(review): presumably false means setup failed -- confirm in the .cpp.
+    // Installed as MTLaunchStruct::kernel (fep.usr = a ScriptList) by the group launch code.
+    static void scriptGroupRoot(const RsForEachStubParamStruct *p,
+                                uint32_t xstart, uint32_t xend,
+                                uint32_t instep, uint32_t outstep);
+
+protected:
+    struct ScriptList {  // Bundle the launch code hands to scriptGroupRoot through fep.usr.
+        size_t count;                 // Launch code sets this to kernels.size().
+        Allocation *const* ins;       // Launch code points this at ins.array().
+        bool const* inExts;           // From inExts.array(); NOTE(review): meaning not visible here.
+        Allocation *const* outs;      // Launch code points this at outs.array().
+        bool const* outExts;          // From outExts.array(); NOTE(review): meaning not visible here.
+        const void *const* usrPtrs;   // One per kernel: mtls.fep.usr after forEachKernelSetup().
+        size_t const *usrSizes;       // NOTE(review): never assigned by the launch code shown -- confirm.
+        uint32_t const *sigs;         // NOTE(review): filled from mtls.fep.usrLen, not a signature -- confirm.
+        const void *const* fnPtrs;    // One per kernel: mtls.kernel after forEachKernelSetup().
+
+        const ScriptKernelID *const* kernels;  // Launch code points this at kernels.array().
+    };
+    ScriptList mSl;             // NOTE(review): the launch code shown fills a local ScriptList -- confirm use.
+    const ScriptGroup *mSG;     // NOTE(review): presumably the sg passed to the constructor -- confirm.
+    RsdCpuReferenceImpl *mCtx;  // The launch code calls lookupScript() and launchThreads() on this.
+};
+
+}
+}
+
+#endif // RSD_SCRIPT_GROUP_H
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
new file mode 100644
index 0000000..d96d2d1
--- /dev/null
+++ b/cpu_ref/rsd_cpu.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_CPU_H
+#define RSD_CPU_H
+
+#include "rsAllocation.h"
+
+
+namespace android {
+namespace renderscript {
+
+class ScriptC;
+class Script;
+class ScriptGroup;
+class ScriptKernelID;
+
+
+class RsdCpuReference {  // Abstract CPU script driver interface; instances come from create().
+public:
+    struct CpuSymbol {       // Record type returned (by pointer) from a sym_lookup_t callback.
+        const char * name;   // Symbol name.
+        void * fnPtr;        // Address associated with the name.
+        bool threadable;     // NOTE(review): presumably "callable from worker threads" -- confirm.
+    };
+    // Callback type: takes a Context and a symbol name, returns a CpuSymbol (create()'s lfn).
+    typedef const CpuSymbol * (* sym_lookup_t)(Context *, const char *name);
+    // Context/script pair. NOTE(review): presumably what getTlsContext()/getTlsScript() read.
+    struct CpuTls {
+        Context *rsc;
+        const ScriptC * sc;
+    };
+    // Abstract per-script interface; createScript() and createIntrinsic() return one.
+    class CpuScript {
+    public:
+        virtual void populateScript(Script *) = 0;
+        virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
+        virtual int invokeRoot() = 0;
+        virtual void invokeForEach(uint32_t slot,
+                           const Allocation * ain,
+                           Allocation * aout,
+                           const void * usr,
+                           uint32_t usrLen,
+                           const RsScriptCall *sc) = 0;
+        virtual void invokeInit() = 0;
+        virtual void invokeFreeChildren() = 0;
+        // Setters for script state, each addressed by a slot index.
+        virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) = 0;
+        virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
+                                      const Element *e, const size_t *dims, size_t dimLength) = 0;
+        virtual void setGlobalBind(uint32_t slot, Allocation *data) = 0;
+        virtual void setGlobalObj(uint32_t slot, ObjectBase *obj) = 0;
+
+        virtual Allocation * getAllocationForPointer(const void *ptr) const = 0;
+        virtual ~CpuScript() {}  // Virtual so implementations can be deleted via CpuScript *.
+    };
+    typedef CpuScript * (* script_lookup_t)(Context *, const Script *s);  // create()'s slfn
+    // Abstract script-group interface; createScriptGroup() returns one.
+    class CpuScriptGroup {
+    public:
+        virtual void setInput(const ScriptKernelID *kid, Allocation *) = 0;
+        virtual void setOutput(const ScriptKernelID *kid, Allocation *) = 0;
+        virtual void execute() = 0;
+        virtual ~CpuScriptGroup() {}  // Virtual so implementations can be deleted via the base.
+    };
+    // NOTE(review): "Tls" presumably means thread-local; the definitions are not in this header.
+    static Context * getTlsContext();
+    static const Script * getTlsScript();
+    // Factory: lfn looks symbols up by name, slfn maps a Script to its CpuScript (see typedefs).
+    static RsdCpuReference * create(Context *c, uint32_t version_major,
+                                    uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn);
+    virtual ~RsdCpuReference();
+    virtual void setPriority(int32_t priority) = 0;
+    // Factories for the CpuScript / CpuScriptGroup objects declared above.
+    virtual CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
+                                     uint8_t const *bitcode, size_t bitcodeSize,
+                                     uint32_t flags) = 0;
+    virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
+    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg) = 0;
+};
+
+
+}
+}
+
+#endif