am c214fe59: Fix incorrect error check for mmap

* commit 'c214fe59fc48740ed003a3cde4e5a60517c5d5ce':
  Fix incorrect error check for mmap
diff --git a/Android.mk b/Android.mk
index b14d2d8..4d76839 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,7 +1,7 @@
 
 LOCAL_PATH:=$(call my-dir)
 
-rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions -std=c++11
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
@@ -48,7 +48,7 @@
 	driver/rsdVertexArray.cpp
 
 
-LOCAL_SHARED_LIBRARIES += libRS libRSCpuRef libc++
+LOCAL_SHARED_LIBRARIES += libRS libRSCpuRef
 LOCAL_SHARED_LIBRARIES += liblog libcutils libutils libEGL libGLESv1_CM libGLESv2
 LOCAL_SHARED_LIBRARIES += libui libgui libsync
 
@@ -56,7 +56,8 @@
 
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
 LOCAL_C_INCLUDES += frameworks/rs/cpu_ref/linkloader/include
-LOCAL_C_INCLUDES += external/libcxx/include
+
+LOCAL_CXX_STL := libc++
 
 LOCAL_CFLAGS += $(rs_base_CFLAGS)
 LOCAL_CPPFLAGS += -fno-exceptions
@@ -169,7 +170,7 @@
 	rsThreadIO.cpp \
 	rsType.cpp
 
-LOCAL_SHARED_LIBRARIES += liblog libcutils libutils libEGL libGLESv1_CM libGLESv2 libc++
+LOCAL_SHARED_LIBRARIES += liblog libcutils libutils libEGL libGLESv1_CM libGLESv2
 LOCAL_SHARED_LIBRARIES += libgui libsync libdl libui
 LOCAL_SHARED_LIBRARIES += libft2 libpng libz
 
@@ -177,9 +178,13 @@
 
 LOCAL_C_INCLUDES += external/freetype/include
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
-LOCAL_C_INCLUDES += external/libcxx/include
+
+LOCAL_CXX_STL := libc++
 
 LOCAL_CFLAGS += $(rs_base_CFLAGS)
+# TODO: external/freetype still uses the register keyword
+# Bug: 17163086
+LOCAL_CFLAGS += -Wno-deprecated-register
 
 LOCAL_CPPFLAGS += -fno-exceptions
 
@@ -294,7 +299,6 @@
 
 include $(CLEAR_VARS)
 
-
 LOCAL_MODULE := librsloader
 LOCAL_MODULE_TAGS := optional
 
@@ -302,13 +306,14 @@
 
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
+LOCAL_CXX_STL := libc++
+
 LOCAL_CFLAGS += $(rs_base_CFLAGS)
 LOCAL_CPPFLAGS += -fno-exceptions
 
 LOCAL_C_INCLUDES := \
   $(LOCAL_PATH)/cpu_ref/linkloader \
   $(LOCAL_PATH)/cpu_ref/linkloader/include \
-  external/libcxx/include \
   $(LOCAL_C_INCLUDES)
 
 include $(LLVM_ROOT_PATH)/llvm-device-build.mk
@@ -335,6 +340,8 @@
 
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
+LOCAL_CXX_STL := libc++
+
 LOCAL_CFLAGS += $(rs_base_CFLAGS)
 LOCAL_CFLAGS += -D__HOST__
 LOCAL_CPPFLAGS += -fno-exceptions
@@ -348,7 +355,6 @@
 LOCAL_C_INCLUDES := \
   $(LOCAL_PATH)/cpu_ref/linkloader \
   $(LOCAL_PATH)/cpu_ref/linkloader/include \
-  external/libcxx/include \
   $(LOCAL_C_INCLUDES)
 endif
 
diff --git a/api/gen_runtime.cpp b/api/gen_runtime.cpp
index 106d3bb..4570d68 100644
--- a/api/gen_runtime.cpp
+++ b/api/gen_runtime.cpp
@@ -704,7 +704,7 @@
     mLargestVersionNumber = 0;
     while (1) {
         Specification* spec = Specification::scanSpecification(specFile);
-        if (spec == NULL) {
+        if (spec == nullptr) {
             break;
         }
         getFunction(spec->getCleanName())->addSpecification(spec);
@@ -985,7 +985,7 @@
                 return spec;
             } else {
                 delete spec;
-                return NULL;
+                return nullptr;
             }
         }
 
@@ -1048,7 +1048,7 @@
     }
 
     delete spec;
-    return NULL;
+    return nullptr;
 }
 
 void Specification::writeFiles(ofstream& headerFile, ofstream& rsFile, ofstream& javaFile,
diff --git a/cpp/Allocation.cpp b/cpp/Allocation.cpp
index 91ccbd1..50ae239 100644
--- a/cpp/Allocation.cpp
+++ b/cpp/Allocation.cpp
@@ -64,7 +64,7 @@
     mType = t;
     mUsage = usage;
 
-    if (t != NULL) {
+    if (t != nullptr) {
         updateCacheInfo(t);
     }
 
@@ -125,7 +125,7 @@
     BaseObj::updateFromNative();
 
     const void *typeID = RS::dispatch->AllocationGetType(mRS->getContext(), getID());
-    if(typeID != NULL) {
+    if(typeID != nullptr) {
         sp<const Type> old = mType;
         sp<Type> t = new Type((void *)typeID, mRS);
         t->updateFromNative();
@@ -170,23 +170,23 @@
 }
 
 void * Allocation::getPointer(size_t *stride) {
-    void *p = NULL;
+    void *p = nullptr;
     if (!(mUsage & RS_ALLOCATION_USAGE_SHARED)) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Allocation does not support USAGE_SHARED.");
-        return NULL;
+        return nullptr;
     }
 
     // FIXME: decide if lack of getPointer should cause compat mode
-    if (RS::dispatch->AllocationGetPointer == NULL) {
+    if (RS::dispatch->AllocationGetPointer == nullptr) {
         mRS->throwError(RS_ERROR_RUNTIME_ERROR, "Can't use getPointer on older APIs");
-        return NULL;
+        return nullptr;
     }
 
     p = RS::dispatch->AllocationGetPointer(mRS->getContext(), getIDSafe(), 0,
                                            RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X, 0, 0, stride);
     if (mRS->getError() != RS_SUCCESS) {
         mRS->throwError(RS_ERROR_RUNTIME_ERROR, "Allocation lock failed");
-        p = NULL;
+        p = nullptr;
     }
     return p;
 }
@@ -241,7 +241,7 @@
 
 
 void Allocation::validate2DRange(uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h) {
-    if (mAdaptedAllocation != NULL) {
+    if (mAdaptedAllocation != nullptr) {
 
     } else {
         if (((xoff + w) > mCurrentDimX) || ((yoff + h) > mCurrentDimY)) {
@@ -303,7 +303,7 @@
 
 void Allocation::validate3DRange(uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t w,
                                  uint32_t h, uint32_t d) {
-    if (mAdaptedAllocation != NULL) {
+    if (mAdaptedAllocation != nullptr) {
 
     } else {
         if (((xoff + w) > mCurrentDimX) || ((yoff + h) > mCurrentDimY) || ((zoff + d) > mCurrentDimZ)) {
@@ -338,7 +338,7 @@
     }
     if (id == 0) {
         rs->throwError(RS_ERROR_RUNTIME_ERROR, "Allocation creation failed");
-        return NULL;
+        return nullptr;
     }
     return new Allocation(id, rs, type, usage);
 }
@@ -353,7 +353,7 @@
     }
     if (id == 0) {
         rs->throwError(RS_ERROR_RUNTIME_ERROR, "Allocation creation failed");
-        return NULL;
+        return nullptr;
     }
     return new Allocation(id, rs, type, usage);
 }
diff --git a/cpp/Android.mk b/cpp/Android.mk
index a4e4c90..f2457af 100644
--- a/cpp/Android.mk
+++ b/cpp/Android.mk
@@ -11,10 +11,11 @@
 
 LOCAL_PATH:= $(call my-dir)
 include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
 include frameworks/compile/slang/rs_version.mk
 local_cflags_for_rs_cpp += $(RS_VERSION_DEFINE)
-local_cflags_for_rs_cpp += -Wno-unused-parameter
+local_cflags_for_rs_cpp += -Wno-unused-parameter -std=c++11
 
 LOCAL_SRC_FILES := $(rs_cpp_SRC_FILES)
 
@@ -29,20 +30,20 @@
 	libutils \
 	liblog \
 	libdl \
-	libstlport
 
 LOCAL_MODULE:= libRScpp
 
 LOCAL_MODULE_TAGS := optional
 
 LOCAL_C_INCLUDES += frameworks/rs
-LOCAL_C_INCLUDES += external/stlport/stlport bionic/ bionic/libstdc++/include
 LOCAL_C_INCLUDES += $(intermediates)
 
+include external/stlport/libstlport.mk
 include $(BUILD_SHARED_LIBRARY)
 
 
 include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
 ifneq ($(HOST_OS),windows)
 LOCAL_CLANG := true
diff --git a/cpp/BaseObj.cpp b/cpp/BaseObj.cpp
index 2e0a637..e32d0a6 100644
--- a/cpp/BaseObj.cpp
+++ b/cpp/BaseObj.cpp
@@ -21,14 +21,14 @@
 using namespace RSC;
 
 void * BaseObj::getID() const {
-    if (mID == NULL) {
+    if (mID == nullptr) {
         ALOGE("Internal error: Object id 0.");
     }
     return mID;
 }
 
 void * BaseObj::getObjID(sp<const BaseObj> o) {
-    return o == NULL ? NULL : o->getID();
+    return o == nullptr ? nullptr : o->getID();
 }
 
 
@@ -47,12 +47,12 @@
     if (mRS && mRS->getContext()) {
         RS::dispatch->ObjDestroy(mRS->getContext(), mID);
     }
-    mRS = NULL;
-    mID = NULL;
+    mRS = nullptr;
+    mID = nullptr;
 }
 
 void BaseObj::updateFromNative() {
-    const char *name = NULL;
+    const char *name = nullptr;
     RS::dispatch->GetName(mRS->getContext(), mID, &name);
     mName = name;
 }
diff --git a/cpp/Element.cpp b/cpp/Element.cpp
index b122926..b019b0e 100644
--- a/cpp/Element.cpp
+++ b/cpp/Element.cpp
@@ -26,11 +26,11 @@
 android::RSC::sp<const Element> Element::getSubElement(uint32_t index) {
     if (!mVisibleElementMap.size()) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Element contains no sub-elements");
-        return NULL;
+        return nullptr;
     }
     if (index >= mVisibleElementMap.size()) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Illegal sub-element index");
-        return NULL;
+        return nullptr;
     }
     return mElements[mVisibleElementMap[index]];
 }
@@ -38,11 +38,11 @@
 const char * Element::getSubElementName(uint32_t index) {
     if (!mVisibleElementMap.size()) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Element contains no sub-elements");
-        return NULL;
+        return nullptr;
     }
     if (index >= mVisibleElementMap.size()) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "Illegal sub-element index");
-        return NULL;
+        return nullptr;
     }
     return mElementNames[mVisibleElementMap[index]].c_str();
 }
@@ -73,7 +73,7 @@
 
 
 #define CREATE_USER(N, T) android::RSC::sp<const Element> Element::N(android::RSC::sp<RS> rs) { \
-    if (rs->mElements.N == NULL) {                                  \
+    if (rs->mElements.N == nullptr) {                               \
         rs->mElements.N = (createUser(rs, RS_TYPE_##T));            \
     }                                                               \
     return rs->mElements.N;                                         \
@@ -100,10 +100,10 @@
 CREATE_USER(MATRIX_2X2, MATRIX_2X2);
 
 #define CREATE_PIXEL(N, T, K) android::RSC::sp<const Element> Element::N(android::RSC::sp<RS> rs) { \
-    if (rs->mElements.N == NULL) {                                  \
-        rs->mElements.N = createPixel(rs, RS_TYPE_##T, RS_KIND_##K);    \
-    }                                                                   \
-    return rs->mElements.N;                                             \
+    if (rs->mElements.N == nullptr) {                                \
+        rs->mElements.N = createPixel(rs, RS_TYPE_##T, RS_KIND_##K); \
+    }                                                                \
+    return rs->mElements.N;                                          \
 }
 
 CREATE_PIXEL(A_8, UNSIGNED_8, PIXEL_A);
@@ -115,22 +115,22 @@
 CREATE_PIXEL(RGBA_5551, UNSIGNED_5_5_5_1, PIXEL_RGBA);
 
 #define CREATE_VECTOR(N, T) android::RSC::sp<const Element> Element::N##_2(android::RSC::sp<RS> rs) { \
-    if (rs->mElements.N##_2 == NULL) {                                  \
-        rs->mElements.N##_2 = createVector(rs, RS_TYPE_##T, 2);         \
-    }                                                                   \
-    return rs->mElements.N##_2;                                         \
-}                                                                       \
+    if (rs->mElements.N##_2 == nullptr) {                                 \
+        rs->mElements.N##_2 = createVector(rs, RS_TYPE_##T, 2);           \
+    }                                                                     \
+    return rs->mElements.N##_2;                                           \
+}                                                                         \
 android::RSC::sp<const Element> Element::N##_3(android::RSC::sp<RS> rs) { \
-    if (rs->mElements.N##_3 == NULL) {                                  \
-        rs->mElements.N##_3 = createVector(rs, RS_TYPE_##T, 3);         \
-    }                                                                   \
-    return rs->mElements.N##_3;                                         \
+    if (rs->mElements.N##_3 == nullptr) {                                 \
+        rs->mElements.N##_3 = createVector(rs, RS_TYPE_##T, 3);           \
+    }                                                                     \
+    return rs->mElements.N##_3;                                           \
 } \
 android::RSC::sp<const Element> Element::N##_4(android::RSC::sp<RS> rs) { \
-    if (rs->mElements.N##_4 == NULL) {                                  \
-        rs->mElements.N##_4 = createVector(rs, RS_TYPE_##T, 4);         \
-    }                                                                   \
-    return rs->mElements.N##_4;                                         \
+    if (rs->mElements.N##_4 == nullptr) {                                 \
+        rs->mElements.N##_4 = createVector(rs, RS_TYPE_##T, 4);           \
+    }                                                                     \
+    return rs->mElements.N##_4;                                           \
 }
 CREATE_VECTOR(U8, UNSIGNED_8);
 CREATE_VECTOR(I8, SIGNED_8);
@@ -279,7 +279,7 @@
 android::RSC::sp<const Element> Element::createVector(android::RSC::sp<RS> rs, RsDataType dt, uint32_t size) {
     if (size < 2 || size > 4) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Vector size out of range 2-4.");
-        return NULL;
+        return nullptr;
     }
     void *id = RS::dispatch->ElementCreate(rs->getContext(), dt, RS_KIND_USER, false, size);
     return new Element(id, rs, dt, RS_KIND_USER, false, size);
@@ -293,7 +293,7 @@
           dk == RS_KIND_PIXEL_RGBA ||
           dk == RS_KIND_PIXEL_DEPTH)) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Unsupported DataKind");
-        return NULL;
+        return nullptr;
     }
     if (!(dt == RS_TYPE_UNSIGNED_8 ||
           dt == RS_TYPE_UNSIGNED_16 ||
@@ -301,23 +301,23 @@
           dt == RS_TYPE_UNSIGNED_4_4_4_4 ||
           dt == RS_TYPE_UNSIGNED_5_5_5_1)) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Unsupported DataType");
-        return NULL;
+        return nullptr;
     }
     if (dt == RS_TYPE_UNSIGNED_5_6_5 && dk != RS_KIND_PIXEL_RGB) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Bad kind and type combo");
-        return NULL;
+        return nullptr;
     }
     if (dt == RS_TYPE_UNSIGNED_5_5_5_1 && dk != RS_KIND_PIXEL_RGBA) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Bad kind and type combo");
-        return NULL;
+        return nullptr;
     }
     if (dt == RS_TYPE_UNSIGNED_4_4_4_4 && dk != RS_KIND_PIXEL_RGBA) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Bad kind and type combo");
-        return NULL;
+        return nullptr;
     }
     if (dt == RS_TYPE_UNSIGNED_16 && dk != RS_KIND_PIXEL_DEPTH) {
         rs->throwError(RS_ERROR_INVALID_PARAMETER, "Bad kind and type combo");
-        return NULL;
+        return nullptr;
     }
 
     int size = 1;
@@ -411,4 +411,3 @@
     free(elementArray);
     return new Element(id, mRS, mElements, mElementNames, mArraySizes);
 }
-
diff --git a/cpp/RenderScript.cpp b/cpp/RenderScript.cpp
index 49f28a6..e2fd651 100644
--- a/cpp/RenderScript.cpp
+++ b/cpp/RenderScript.cpp
@@ -38,14 +38,14 @@
 bool RS::gInitialized = false;
 bool RS::usingNative = false;
 pthread_mutex_t RS::gInitMutex = PTHREAD_MUTEX_INITIALIZER;
-dispatchTable* RS::dispatch = NULL;
+dispatchTable* RS::dispatch = nullptr;
 static int gInitError = 0;
 
 RS::RS() {
-    mDev = NULL;
-    mContext = NULL;
-    mErrorFunc = NULL;
-    mMessageFunc = NULL;
+    mDev = nullptr;
+    mContext = nullptr;
+    mErrorFunc = nullptr;
+    mMessageFunc = nullptr;
     mMessageRun = false;
     mInit = false;
     mCurrentError = RS_SUCCESS;
@@ -61,15 +61,15 @@
         if (mContext) {
             RS::dispatch->ContextDeinitToClient(mContext);
 
-            void *res = NULL;
+            void *res = nullptr;
             int status = pthread_join(mMessageThreadId, &res);
 
             RS::dispatch->ContextDestroy(mContext);
-            mContext = NULL;
+            mContext = nullptr;
         }
         if (mDev) {
             RS::dispatch->DeviceDestroy(mDev);
-            mDev = NULL;
+            mDev = nullptr;
         }
     }
 }
@@ -81,332 +81,332 @@
 static bool loadSymbols(void* handle) {
 
     RS::dispatch->AllocationGetType = (AllocationGetTypeFnPtr)dlsym(handle, "rsaAllocationGetType");
-    if (RS::dispatch->AllocationGetType == NULL) {
+    if (RS::dispatch->AllocationGetType == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationGetType");
         return false;
     }
     RS::dispatch->TypeGetNativeData = (TypeGetNativeDataFnPtr)dlsym(handle, "rsaTypeGetNativeData");
-    if (RS::dispatch->TypeGetNativeData == NULL) {
+    if (RS::dispatch->TypeGetNativeData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->TypeGetNativeData");
         return false;
     }
     RS::dispatch->ElementGetNativeData = (ElementGetNativeDataFnPtr)dlsym(handle, "rsaElementGetNativeData");
-    if (RS::dispatch->ElementGetNativeData == NULL) {
+    if (RS::dispatch->ElementGetNativeData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ElementGetNativeData");
         return false;
     }
     RS::dispatch->ElementGetSubElements = (ElementGetSubElementsFnPtr)dlsym(handle, "rsaElementGetSubElements");
-    if (RS::dispatch->ElementGetSubElements == NULL) {
+    if (RS::dispatch->ElementGetSubElements == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ElementGetSubElements");
         return false;
     }
     RS::dispatch->DeviceCreate = (DeviceCreateFnPtr)dlsym(handle, "rsDeviceCreate");
-    if (RS::dispatch->DeviceCreate == NULL) {
+    if (RS::dispatch->DeviceCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->DeviceCreate");
         return false;
     }
     RS::dispatch->DeviceDestroy = (DeviceDestroyFnPtr)dlsym(handle, "rsDeviceDestroy");
-    if (RS::dispatch->DeviceDestroy == NULL) {
+    if (RS::dispatch->DeviceDestroy == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->DeviceDestroy");
         return false;
     }
     RS::dispatch->DeviceSetConfig = (DeviceSetConfigFnPtr)dlsym(handle, "rsDeviceSetConfig");
-    if (RS::dispatch->DeviceSetConfig == NULL) {
+    if (RS::dispatch->DeviceSetConfig == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->DeviceSetConfig");
         return false;
     }
     RS::dispatch->ContextCreate = (ContextCreateFnPtr)dlsym(handle, "rsContextCreate");;
-    if (RS::dispatch->ContextCreate == NULL) {
+    if (RS::dispatch->ContextCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextCreate");
         return false;
     }
     RS::dispatch->GetName = (GetNameFnPtr)dlsym(handle, "rsaGetName");;
-    if (RS::dispatch->GetName == NULL) {
+    if (RS::dispatch->GetName == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->GetName");
         return false;
     }
     RS::dispatch->ContextDestroy = (ContextDestroyFnPtr)dlsym(handle, "rsContextDestroy");
-    if (RS::dispatch->ContextDestroy == NULL) {
+    if (RS::dispatch->ContextDestroy == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextDestroy");
         return false;
     }
     RS::dispatch->ContextGetMessage = (ContextGetMessageFnPtr)dlsym(handle, "rsContextGetMessage");
-    if (RS::dispatch->ContextGetMessage == NULL) {
+    if (RS::dispatch->ContextGetMessage == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextGetMessage");
         return false;
     }
     RS::dispatch->ContextPeekMessage = (ContextPeekMessageFnPtr)dlsym(handle, "rsContextPeekMessage");
-    if (RS::dispatch->ContextPeekMessage == NULL) {
+    if (RS::dispatch->ContextPeekMessage == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextPeekMessage");
         return false;
     }
     RS::dispatch->ContextSendMessage = (ContextSendMessageFnPtr)dlsym(handle, "rsContextSendMessage");
-    if (RS::dispatch->ContextSendMessage == NULL) {
+    if (RS::dispatch->ContextSendMessage == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextSendMessage");
         return false;
     }
     RS::dispatch->ContextInitToClient = (ContextInitToClientFnPtr)dlsym(handle, "rsContextInitToClient");
-    if (RS::dispatch->ContextInitToClient == NULL) {
+    if (RS::dispatch->ContextInitToClient == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextInitToClient");
         return false;
     }
     RS::dispatch->ContextDeinitToClient = (ContextDeinitToClientFnPtr)dlsym(handle, "rsContextDeinitToClient");
-    if (RS::dispatch->ContextDeinitToClient == NULL) {
+    if (RS::dispatch->ContextDeinitToClient == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextDeinitToClient");
         return false;
     }
     RS::dispatch->TypeCreate = (TypeCreateFnPtr)dlsym(handle, "rsTypeCreate");
-    if (RS::dispatch->TypeCreate == NULL) {
+    if (RS::dispatch->TypeCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->TypeCreate");
         return false;
     }
     RS::dispatch->AllocationCreateTyped = (AllocationCreateTypedFnPtr)dlsym(handle, "rsAllocationCreateTyped");
-    if (RS::dispatch->AllocationCreateTyped == NULL) {
+    if (RS::dispatch->AllocationCreateTyped == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCreateTyped");
         return false;
     }
     RS::dispatch->AllocationCreateFromBitmap = (AllocationCreateFromBitmapFnPtr)dlsym(handle, "rsAllocationCreateFromBitmap");
-    if (RS::dispatch->AllocationCreateFromBitmap == NULL) {
+    if (RS::dispatch->AllocationCreateFromBitmap == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCreateFromBitmap");
         return false;
     }
     RS::dispatch->AllocationCubeCreateFromBitmap = (AllocationCubeCreateFromBitmapFnPtr)dlsym(handle, "rsAllocationCubeCreateFromBitmap");
-    if (RS::dispatch->AllocationCubeCreateFromBitmap == NULL) {
+    if (RS::dispatch->AllocationCubeCreateFromBitmap == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCubeCreateFromBitmap");
         return false;
     }
     RS::dispatch->AllocationGetSurface = (AllocationGetSurfaceFnPtr)dlsym(handle, "rsAllocationGetSurface");
-    if (RS::dispatch->AllocationGetSurface == NULL) {
+    if (RS::dispatch->AllocationGetSurface == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationGetSurface");
         return false;
     }
     RS::dispatch->AllocationSetSurface = (AllocationSetSurfaceFnPtr)dlsym(handle, "rsAllocationSetSurface");
-    if (RS::dispatch->AllocationSetSurface == NULL) {
+    if (RS::dispatch->AllocationSetSurface == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationSetSurface");
         return false;
     }
     RS::dispatch->ContextFinish = (ContextFinishFnPtr)dlsym(handle, "rsContextFinish");
-    if (RS::dispatch->ContextFinish == NULL) {
+    if (RS::dispatch->ContextFinish == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextFinish");
         return false;
     }
     RS::dispatch->ContextDump = (ContextDumpFnPtr)dlsym(handle, "rsContextDump");
-    if (RS::dispatch->ContextDump == NULL) {
+    if (RS::dispatch->ContextDump == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextDump");
         return false;
     }
     RS::dispatch->ContextSetPriority = (ContextSetPriorityFnPtr)dlsym(handle, "rsContextSetPriority");
-    if (RS::dispatch->ContextSetPriority == NULL) {
+    if (RS::dispatch->ContextSetPriority == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ContextSetPriority");
         return false;
     }
     RS::dispatch->AssignName = (AssignNameFnPtr)dlsym(handle, "rsAssignName");
-    if (RS::dispatch->AssignName == NULL) {
+    if (RS::dispatch->AssignName == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AssignName");
         return false;
     }
     RS::dispatch->ObjDestroy = (ObjDestroyFnPtr)dlsym(handle, "rsObjDestroy");
-    if (RS::dispatch->ObjDestroy == NULL) {
+    if (RS::dispatch->ObjDestroy == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ObjDestroy");
         return false;
     }
     RS::dispatch->ElementCreate = (ElementCreateFnPtr)dlsym(handle, "rsElementCreate");
-    if (RS::dispatch->ElementCreate == NULL) {
+    if (RS::dispatch->ElementCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ElementCreate");
         return false;
     }
     RS::dispatch->ElementCreate2 = (ElementCreate2FnPtr)dlsym(handle, "rsElementCreate2");
-    if (RS::dispatch->ElementCreate2 == NULL) {
+    if (RS::dispatch->ElementCreate2 == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ElementCreate2");
         return false;
     }
     RS::dispatch->AllocationCopyToBitmap = (AllocationCopyToBitmapFnPtr)dlsym(handle, "rsAllocationCopyToBitmap");
-    if (RS::dispatch->AllocationCopyToBitmap == NULL) {
+    if (RS::dispatch->AllocationCopyToBitmap == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCopyToBitmap");
         return false;
     }
     RS::dispatch->Allocation1DData = (Allocation1DDataFnPtr)dlsym(handle, "rsAllocation1DData");
-    if (RS::dispatch->Allocation1DData == NULL) {
+    if (RS::dispatch->Allocation1DData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation1DData");
         return false;
     }
     RS::dispatch->Allocation1DElementData = (Allocation1DElementDataFnPtr)dlsym(handle, "rsAllocation1DElementData");
-    if (RS::dispatch->Allocation1DElementData == NULL) {
+    if (RS::dispatch->Allocation1DElementData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation1DElementData");
         return false;
     }
     RS::dispatch->Allocation2DData = (Allocation2DDataFnPtr)dlsym(handle, "rsAllocation2DData");
-    if (RS::dispatch->Allocation2DData == NULL) {
+    if (RS::dispatch->Allocation2DData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation2DData");
         return false;
     }
     RS::dispatch->Allocation3DData = (Allocation3DDataFnPtr)dlsym(handle, "rsAllocation3DData");
-    if (RS::dispatch->Allocation3DData == NULL) {
+    if (RS::dispatch->Allocation3DData == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation3DData");
         return false;
     }
     RS::dispatch->AllocationGenerateMipmaps = (AllocationGenerateMipmapsFnPtr)dlsym(handle, "rsAllocationGenerateMipmaps");
-    if (RS::dispatch->AllocationGenerateMipmaps == NULL) {
+    if (RS::dispatch->AllocationGenerateMipmaps == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationGenerateMipmaps");
         return false;
     }
     RS::dispatch->AllocationRead = (AllocationReadFnPtr)dlsym(handle, "rsAllocationRead");
-    if (RS::dispatch->AllocationRead == NULL) {
+    if (RS::dispatch->AllocationRead == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationRead");
         return false;
     }
     RS::dispatch->Allocation1DRead = (Allocation1DReadFnPtr)dlsym(handle, "rsAllocation1DRead");
-    if (RS::dispatch->Allocation1DRead == NULL) {
+    if (RS::dispatch->Allocation1DRead == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation1DRead");
         return false;
     }
     RS::dispatch->Allocation2DRead = (Allocation2DReadFnPtr)dlsym(handle, "rsAllocation2DRead");
-    if (RS::dispatch->Allocation2DRead == NULL) {
+    if (RS::dispatch->Allocation2DRead == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->Allocation2DRead");
         return false;
     }
     RS::dispatch->AllocationSyncAll = (AllocationSyncAllFnPtr)dlsym(handle, "rsAllocationSyncAll");
-    if (RS::dispatch->AllocationSyncAll == NULL) {
+    if (RS::dispatch->AllocationSyncAll == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationSyncAll");
         return false;
     }
     RS::dispatch->AllocationResize1D = (AllocationResize1DFnPtr)dlsym(handle, "rsAllocationResize1D");
-    if (RS::dispatch->AllocationResize1D == NULL) {
+    if (RS::dispatch->AllocationResize1D == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationResize1D");
         return false;
     }
     RS::dispatch->AllocationCopy2DRange = (AllocationCopy2DRangeFnPtr)dlsym(handle, "rsAllocationCopy2DRange");
-    if (RS::dispatch->AllocationCopy2DRange == NULL) {
+    if (RS::dispatch->AllocationCopy2DRange == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCopy2DRange");
         return false;
     }
     RS::dispatch->AllocationCopy3DRange = (AllocationCopy3DRangeFnPtr)dlsym(handle, "rsAllocationCopy3DRange");
-    if (RS::dispatch->AllocationCopy3DRange == NULL) {
+    if (RS::dispatch->AllocationCopy3DRange == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationCopy3DRange");
         return false;
     }
     RS::dispatch->SamplerCreate = (SamplerCreateFnPtr)dlsym(handle, "rsSamplerCreate");
-    if (RS::dispatch->SamplerCreate == NULL) {
+    if (RS::dispatch->SamplerCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->SamplerCreate");
         return false;
     }
     RS::dispatch->ScriptBindAllocation = (ScriptBindAllocationFnPtr)dlsym(handle, "rsScriptBindAllocation");
-    if (RS::dispatch->ScriptBindAllocation == NULL) {
+    if (RS::dispatch->ScriptBindAllocation == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptBindAllocation");
         return false;
     }
     RS::dispatch->ScriptSetTimeZone = (ScriptSetTimeZoneFnPtr)dlsym(handle, "rsScriptSetTimeZone");
-    if (RS::dispatch->ScriptSetTimeZone == NULL) {
+    if (RS::dispatch->ScriptSetTimeZone == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetTimeZone");
         return false;
     }
     RS::dispatch->ScriptInvoke = (ScriptInvokeFnPtr)dlsym(handle, "rsScriptInvoke");
-    if (RS::dispatch->ScriptInvoke == NULL) {
+    if (RS::dispatch->ScriptInvoke == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptInvoke");
         return false;
     }
     RS::dispatch->ScriptInvokeV = (ScriptInvokeVFnPtr)dlsym(handle, "rsScriptInvokeV");
-    if (RS::dispatch->ScriptInvokeV == NULL) {
+    if (RS::dispatch->ScriptInvokeV == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptInvokeV");
         return false;
     }
     RS::dispatch->ScriptForEach = (ScriptForEachFnPtr)dlsym(handle, "rsScriptForEach");
-    if (RS::dispatch->ScriptForEach == NULL) {
+    if (RS::dispatch->ScriptForEach == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptForEach");
         return false;
     }
     RS::dispatch->ScriptSetVarI = (ScriptSetVarIFnPtr)dlsym(handle, "rsScriptSetVarI");
-    if (RS::dispatch->ScriptSetVarI == NULL) {
+    if (RS::dispatch->ScriptSetVarI == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarI");
         return false;
     }
     RS::dispatch->ScriptSetVarObj = (ScriptSetVarObjFnPtr)dlsym(handle, "rsScriptSetVarObj");
-    if (RS::dispatch->ScriptSetVarObj == NULL) {
+    if (RS::dispatch->ScriptSetVarObj == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarObj");
         return false;
     }
     RS::dispatch->ScriptSetVarJ = (ScriptSetVarJFnPtr)dlsym(handle, "rsScriptSetVarJ");
-    if (RS::dispatch->ScriptSetVarJ == NULL) {
+    if (RS::dispatch->ScriptSetVarJ == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarJ");
         return false;
     }
     RS::dispatch->ScriptSetVarF = (ScriptSetVarFFnPtr)dlsym(handle, "rsScriptSetVarF");
-    if (RS::dispatch->ScriptSetVarF == NULL) {
+    if (RS::dispatch->ScriptSetVarF == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarF");
         return false;
     }
     RS::dispatch->ScriptSetVarD = (ScriptSetVarDFnPtr)dlsym(handle, "rsScriptSetVarD");
-    if (RS::dispatch->ScriptSetVarD == NULL) {
+    if (RS::dispatch->ScriptSetVarD == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarD");
         return false;
     }
     RS::dispatch->ScriptSetVarV = (ScriptSetVarVFnPtr)dlsym(handle, "rsScriptSetVarV");
-    if (RS::dispatch->ScriptSetVarV == NULL) {
+    if (RS::dispatch->ScriptSetVarV == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarV");
         return false;
     }
     RS::dispatch->ScriptGetVarV = (ScriptGetVarVFnPtr)dlsym(handle, "rsScriptGetVarV");
-    if (RS::dispatch->ScriptGetVarV == NULL) {
+    if (RS::dispatch->ScriptGetVarV == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptGetVarV");
         return false;
     }
     RS::dispatch->ScriptSetVarVE = (ScriptSetVarVEFnPtr)dlsym(handle, "rsScriptSetVarVE");
-    if (RS::dispatch->ScriptSetVarVE == NULL) {
+    if (RS::dispatch->ScriptSetVarVE == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptSetVarVE");
         return false;
     }
     RS::dispatch->ScriptCCreate = (ScriptCCreateFnPtr)dlsym(handle, "rsScriptCCreate");
-    if (RS::dispatch->ScriptCCreate == NULL) {
+    if (RS::dispatch->ScriptCCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptCCreate");
         return false;
     }
     RS::dispatch->ScriptIntrinsicCreate = (ScriptIntrinsicCreateFnPtr)dlsym(handle, "rsScriptIntrinsicCreate");
-    if (RS::dispatch->ScriptIntrinsicCreate == NULL) {
+    if (RS::dispatch->ScriptIntrinsicCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptIntrinsicCreate");
         return false;
     }
     RS::dispatch->ScriptKernelIDCreate = (ScriptKernelIDCreateFnPtr)dlsym(handle, "rsScriptKernelIDCreate");
-    if (RS::dispatch->ScriptKernelIDCreate == NULL) {
+    if (RS::dispatch->ScriptKernelIDCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptKernelIDCreate");
         return false;
     }
     RS::dispatch->ScriptFieldIDCreate = (ScriptFieldIDCreateFnPtr)dlsym(handle, "rsScriptFieldIDCreate");
-    if (RS::dispatch->ScriptFieldIDCreate == NULL) {
+    if (RS::dispatch->ScriptFieldIDCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptFieldIDCreate");
         return false;
     }
     RS::dispatch->ScriptGroupCreate = (ScriptGroupCreateFnPtr)dlsym(handle, "rsScriptGroupCreate");
-    if (RS::dispatch->ScriptGroupCreate == NULL) {
+    if (RS::dispatch->ScriptGroupCreate == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptGroupCreate");
         return false;
     }
     RS::dispatch->ScriptGroupSetOutput = (ScriptGroupSetOutputFnPtr)dlsym(handle, "rsScriptGroupSetOutput");
-    if (RS::dispatch->ScriptGroupSetOutput == NULL) {
+    if (RS::dispatch->ScriptGroupSetOutput == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptGroupSetOutput");
         return false;
     }
     RS::dispatch->ScriptGroupSetInput = (ScriptGroupSetInputFnPtr)dlsym(handle, "rsScriptGroupSetInput");
-    if (RS::dispatch->ScriptGroupSetInput == NULL) {
+    if (RS::dispatch->ScriptGroupSetInput == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptGroupSetInput");
         return false;
     }
     RS::dispatch->ScriptGroupExecute = (ScriptGroupExecuteFnPtr)dlsym(handle, "rsScriptGroupExecute");
-    if (RS::dispatch->ScriptGroupExecute == NULL) {
+    if (RS::dispatch->ScriptGroupExecute == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->ScriptGroupExecute");
         return false;
     }
     RS::dispatch->AllocationIoSend = (AllocationIoSendFnPtr)dlsym(handle, "rsAllocationIoSend");
-    if (RS::dispatch->AllocationIoSend == NULL) {
+    if (RS::dispatch->AllocationIoSend == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationIoSend");
         return false;
     }
     RS::dispatch->AllocationIoReceive = (AllocationIoReceiveFnPtr)dlsym(handle, "rsAllocationIoReceive");
-    if (RS::dispatch->AllocationIoReceive == NULL) {
+    if (RS::dispatch->AllocationIoReceive == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationIoReceive");
         return false;
     }
     RS::dispatch->AllocationGetPointer = (AllocationGetPointerFnPtr)dlsym(handle, "rsAllocationGetPointer");
-    if (RS::dispatch->AllocationGetPointer == NULL) {
+    if (RS::dispatch->AllocationGetPointer == nullptr) {
         ALOGV("Couldn't initialize RS::dispatch->AllocationGetPointer");
         //return false;
     }
@@ -418,7 +418,7 @@
 // because that's when we changed libRS to extern "C" entry points
 static bool loadSO(const char* filename) {
     void* handle = dlopen(filename, RTLD_LAZY | RTLD_LOCAL);
-    if (handle == NULL) {
+    if (handle == nullptr) {
         ALOGV("couldn't dlopen %s, %s", filename, dlerror());
         return false;
     }
@@ -507,7 +507,7 @@
 
     pid_t mNativeMessageThreadId;
 
-    int status = pthread_create(&mMessageThreadId, NULL, threadProc, this);
+    int status = pthread_create(&mMessageThreadId, nullptr, threadProc, this);
     if (status) {
         ALOGE("Failed to start RS message thread.");
         return false;
@@ -567,7 +567,7 @@
         case RS_MESSAGE_TO_CLIENT_ERROR:
             ALOGE("RS Error %s", (const char *)rbuf);
             rs->throwError(RS_ERROR_RUNTIME_ERROR, "Error returned from runtime");
-            if(rs->mMessageFunc != NULL) {
+            if(rs->mErrorFunc != nullptr) {  // check the handler that is actually called below
                 rs->mErrorFunc(usrID, (const char *)rbuf);
             }
             break;
@@ -581,7 +581,7 @@
             usleep(1000);
             break;
         case RS_MESSAGE_TO_CLIENT_USER:
-            if(rs->mMessageFunc != NULL) {
+            if(rs->mMessageFunc != nullptr) {
                 rs->mMessageFunc(usrID, rbuf, receiveLen);
             } else {
                 ALOGE("Received a message from the script with no message handler installed.");
@@ -597,7 +597,7 @@
         free(rbuf);
     }
     ALOGV("RS Message thread exiting.");
-    return NULL;
+    return nullptr;
 }
 
 void RS::setErrorHandler(ErrorHandlerFunc_t func) {
diff --git a/cpp/Sampler.cpp b/cpp/Sampler.cpp
index 767d626..bf99125 100644
--- a/cpp/Sampler.cpp
+++ b/cpp/Sampler.cpp
@@ -56,10 +56,10 @@
 }
 
 #define CREATE_SAMPLER(N, MIN, MAG, WRAPS, WRAPT) sp<const Sampler> Sampler::N(sp<RS> rs) { \
-        if (rs->mSamplers.N == NULL) {                                  \
+        if (rs->mSamplers.N == nullptr) {                                \
             rs->mSamplers.N = (create(rs, MIN, MAG, WRAPS, WRAPT, 0.f)); \
-        }                                                               \
-        return rs->mSamplers.N;                                         \
+        }                                                                \
+        return rs->mSamplers.N;                                          \
     }
 
 CREATE_SAMPLER(CLAMP_NEAREST, RS_SAMPLER_CLAMP, RS_SAMPLER_CLAMP, RS_SAMPLER_NEAREST, RS_SAMPLER_NEAREST);
diff --git a/cpp/Script.cpp b/cpp/Script.cpp
index 8e1af54..889bb02 100644
--- a/cpp/Script.cpp
+++ b/cpp/Script.cpp
@@ -28,12 +28,12 @@
 
 void Script::forEach(uint32_t slot, sp<const Allocation> ain, sp<const Allocation> aout,
                        const void *usr, size_t usrLen) const {
-    if ((ain == NULL) && (aout == NULL)) {
+    if ((ain == nullptr) && (aout == nullptr)) {
         mRS->throwError(RS_ERROR_INVALID_PARAMETER, "At least one of ain or aout is required to be non-null.");
     }
     void *in_id = BaseObj::getObjID(ain);
     void *out_id = BaseObj::getObjID(aout);
-    tryDispatch(mRS, RS::dispatch->ScriptForEach(mRS->getContext(), getID(), slot, in_id, out_id, usr, usrLen, NULL, 0));
+    tryDispatch(mRS, RS::dispatch->ScriptForEach(mRS->getContext(), getID(), slot, in_id, out_id, usr, usrLen, nullptr, 0));
 }
 
 
@@ -47,7 +47,7 @@
 
 
 void Script::setVar(uint32_t index, sp<const BaseObj> o) const {
-    tryDispatch(mRS, RS::dispatch->ScriptSetVarObj(mRS->getContext(), getID(), index, (o == NULL) ? 0 : o->getID()));
+    tryDispatch(mRS, RS::dispatch->ScriptSetVarObj(mRS->getContext(), getID(), index, (o == nullptr) ? 0 : o->getID()));
 }
 
 void Script::setVar(uint32_t index, const void *v, size_t len) const {
diff --git a/cpp/ScriptC.cpp b/cpp/ScriptC.cpp
index 69d3bd5..d431355 100644
--- a/cpp/ScriptC.cpp
+++ b/cpp/ScriptC.cpp
@@ -23,7 +23,7 @@
                  const void *codeTxt, size_t codeLength,
                  const char *cachedName, size_t cachedNameLength,
                  const char *cacheDir, size_t cacheDirLength)
-: Script(NULL, rs) {
+: Script(nullptr, rs) {
     mID = RS::dispatch->ScriptCCreate(rs->getContext(), cachedName, cachedNameLength,
                                       rs->mCacheDir.c_str(), rs->mCacheDir.length(), (const char *)codeTxt, codeLength);
 }
diff --git a/cpp/ScriptIntrinsics.cpp b/cpp/ScriptIntrinsics.cpp
index c5013b6..e3457a7 100644
--- a/cpp/ScriptIntrinsics.cpp
+++ b/cpp/ScriptIntrinsics.cpp
@@ -23,7 +23,7 @@
 using namespace RSC;
 
 ScriptIntrinsic::ScriptIntrinsic(sp<RS> rs, int id, sp<const Element> e)
-    : Script(NULL, rs) {
+    : Script(nullptr, rs) {
     mID = createDispatch(rs, RS::dispatch->ScriptIntrinsicCreate(rs->getContext(), id, e->getID()));
     mElement = e;
 }
@@ -35,7 +35,7 @@
 sp<ScriptIntrinsic3DLUT> ScriptIntrinsic3DLUT::create(sp<RS> rs, sp<const Element> e) {
     if (e->isCompatible(Element::U8_4(rs)) == false) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
-        return NULL;
+        return nullptr;
     }
     return new ScriptIntrinsic3DLUT(rs, e);
 }
@@ -50,7 +50,7 @@
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "3DLUT forEach element mismatch");
         return;
     }
-    Script::forEach(0, ain, aout, NULL, 0);
+    Script::forEach(0, ain, aout, nullptr, 0);
 }
 void ScriptIntrinsic3DLUT::setLUT(sp<Allocation> lut) {
     sp<const Type> t = lut->getType();
@@ -69,7 +69,7 @@
 sp<ScriptIntrinsicBlend> ScriptIntrinsicBlend::create(sp<RS> rs, sp<const Element> e) {
     if (e->isCompatible(Element::U8_4(rs)) == false) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Element not supported for intrinsic");
-        return NULL;
+        return nullptr;
     }
     return new ScriptIntrinsicBlend(rs, e);
 }
@@ -83,7 +83,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(0, in, out, NULL, 0);
+    Script::forEach(0, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSrc(sp<Allocation> in, sp<Allocation> out) {
@@ -91,7 +91,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(1, in, out, NULL, 0);
+    Script::forEach(1, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachDst(sp<Allocation> in, sp<Allocation> out) {
@@ -99,7 +99,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(2, in, out, NULL, 0);
+    Script::forEach(2, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSrcOver(sp<Allocation> in, sp<Allocation> out) {
@@ -107,7 +107,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(3, in, out, NULL, 0);
+    Script::forEach(3, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachDstOver(sp<Allocation> in, sp<Allocation> out) {
@@ -115,7 +115,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(4, in, out, NULL, 0);
+    Script::forEach(4, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSrcIn(sp<Allocation> in, sp<Allocation> out) {
@@ -123,7 +123,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(5, in, out, NULL, 0);
+    Script::forEach(5, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachDstIn(sp<Allocation> in, sp<Allocation> out) {
@@ -131,7 +131,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(6, in, out, NULL, 0);
+    Script::forEach(6, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSrcOut(sp<Allocation> in, sp<Allocation> out) {
@@ -139,7 +139,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(7, in, out, NULL, 0);
+    Script::forEach(7, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachDstOut(sp<Allocation> in, sp<Allocation> out) {
@@ -147,7 +147,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(8, in, out, NULL, 0);
+    Script::forEach(8, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSrcAtop(sp<Allocation> in, sp<Allocation> out) {
@@ -155,7 +155,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(9, in, out, NULL, 0);
+    Script::forEach(9, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachDstAtop(sp<Allocation> in, sp<Allocation> out) {
@@ -163,7 +163,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(10, in, out, NULL, 0);
+    Script::forEach(10, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachXor(sp<Allocation> in, sp<Allocation> out) {
@@ -171,7 +171,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(11, in, out, NULL, 0);
+    Script::forEach(11, in, out, nullptr, 0);
 }
 
 // Numbering jumps here
@@ -180,7 +180,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(14, in, out, NULL, 0);
+    Script::forEach(14, in, out, nullptr, 0);
 }
 
 // Numbering jumps here
@@ -189,7 +189,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(34, in, out, NULL, 0);
+    Script::forEach(34, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlend::forEachSubtract(sp<Allocation> in, sp<Allocation> out) {
@@ -197,7 +197,7 @@
         out->getType()->getElement()->isCompatible(mElement) == false) {
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blend");
     }
-    Script::forEach(35, in, out, NULL, 0);
+    Script::forEach(35, in, out, nullptr, 0);
 }
 
 
@@ -207,7 +207,7 @@
     if ((e->isCompatible(Element::U8_4(rs)) == false) &&
         (e->isCompatible(Element::U8(rs)) == false)) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blur");
-        return NULL;
+        return nullptr;
     }
     return new ScriptIntrinsicBlur(rs, e);
 }
@@ -230,7 +230,7 @@
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element in blur output");
         return;
     }
-    Script::forEach(0, NULL, out, NULL, 0);
+    Script::forEach(0, nullptr, out, nullptr, 0);
 }
 
 void ScriptIntrinsicBlur::setRadius(float radius) {
@@ -279,7 +279,7 @@
         return;
     }
 
-    Script::forEach(0, in, out, NULL, 0);
+    Script::forEach(0, in, out, nullptr, 0);
 }
 
 void ScriptIntrinsicColorMatrix::setAdd(float* add) {
@@ -346,7 +346,7 @@
         !(e->isCompatible(Element::F32_3(rs))) &&
         !(e->isCompatible(Element::F32_4(rs)))) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element for Convolve3x3");
-        return NULL;
+        return nullptr;
     }
 
     return new ScriptIntrinsicConvolve3x3(rs, e);
@@ -370,7 +370,7 @@
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Element mismatch in Convolve3x3");
         return;
     }
-    Script::forEach(0, NULL, out, NULL, 0);
+    Script::forEach(0, nullptr, out, nullptr, 0);
 }
 
 void ScriptIntrinsicConvolve3x3::setCoefficients(float* v) {
@@ -387,7 +387,7 @@
         !(e->isCompatible(Element::F32_3(rs))) &&
         !(e->isCompatible(Element::F32_4(rs)))) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element for Convolve5x5");
-        return NULL;
+        return nullptr;
     }
 
     return new ScriptIntrinsicConvolve5x5(rs, e);
@@ -412,7 +412,7 @@
         return;
     }
 
-    Script::forEach(0, NULL, out, NULL, 0);
+    Script::forEach(0, nullptr, out, nullptr, 0);
 }
 
 void ScriptIntrinsicConvolve5x5::setCoefficients(float* v) {
@@ -420,7 +420,7 @@
 }
 
 sp<ScriptIntrinsicHistogram> ScriptIntrinsicHistogram::create(sp<RS> rs) {
-    return new ScriptIntrinsicHistogram(rs, NULL);
+    return new ScriptIntrinsicHistogram(rs, nullptr);
 }
 
 ScriptIntrinsicHistogram::ScriptIntrinsicHistogram(sp<RS> rs, sp<const Element> e)
@@ -483,7 +483,7 @@
         return;
     }
 
-    Script::forEach(0, ain, NULL, NULL, 0);
+    Script::forEach(0, ain, nullptr, nullptr, 0);
 }
 
 
@@ -501,13 +501,13 @@
         return;
     }
 
-    Script::forEach(1, ain, NULL, NULL, 0);
+    Script::forEach(1, ain, nullptr, nullptr, 0);
 }
 
 sp<ScriptIntrinsicLUT> ScriptIntrinsicLUT::create(sp<RS> rs, sp<const Element> e) {
     if (!(e->isCompatible(Element::U8_4(rs)))) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element for LUT");
-        return NULL;
+        return nullptr;
     }
     return new ScriptIntrinsicLUT(rs, e);
 }
@@ -534,7 +534,7 @@
         mRS->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element for LUT");
         return;
     }
-    Script::forEach(0, ain, aout, NULL, 0);
+    Script::forEach(0, ain, aout, nullptr, 0);
 
 }
 
@@ -572,7 +572,7 @@
 sp<ScriptIntrinsicYuvToRGB> ScriptIntrinsicYuvToRGB::create(sp<RS> rs, sp<const Element> e) {
     if (!(e->isCompatible(Element::U8_4(rs)))) {
         rs->throwError(RS_ERROR_INVALID_ELEMENT, "Invalid element for YuvToRGB");
-        return NULL;
+        return nullptr;
     }
     return new ScriptIntrinsicYuvToRGB(rs, e);
 }
@@ -596,5 +596,7 @@
         return;
     }
 
-    Script::forEach(0, NULL, out, NULL, 0);
+    Script::forEach(0, nullptr, out, nullptr, 0);
 }
+
+
diff --git a/cpp/Type.cpp b/cpp/Type.cpp
index d053730..bc28165 100644
--- a/cpp/Type.cpp
+++ b/cpp/Type.cpp
@@ -72,7 +72,7 @@
     mDimZ = 0;
     mDimMipmaps = false;
     mDimFaces = false;
-    mElement = NULL;
+    mElement = nullptr;
     mYuvFormat = RS_YUV_NONE;
 }
 
diff --git a/cpp/rsCppInternal.h b/cpp/rsCppInternal.h
index ae43d47..81b690f 100644
--- a/cpp/rsCppInternal.h
+++ b/cpp/rsCppInternal.h
@@ -26,7 +26,7 @@
     }
 
 #define createDispatch(rs, dispatch) \
-    rs->getError() == RS_SUCCESS ? dispatch : NULL
+    rs->getError() == RS_SUCCESS ? dispatch : nullptr
 
 #undef LOG_TAG
 #undef LOG_NDEBUG
diff --git a/cpp/util/RefBase.h b/cpp/util/RefBase.h
index 5993e28..01c0b5f 100644
--- a/cpp/util/RefBase.h
+++ b/cpp/util/RefBase.h
@@ -374,7 +374,7 @@
 wp<T>& wp<T>::operator = (const sp<T>& other)
 {
     weakref_type* newRefs =
-        other != NULL ? other->createWeak(this) : 0;
+        other != NULL ? other->createWeak(this) : NULL;
     T* otherPtr(other.m_ptr);
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = otherPtr;
@@ -386,7 +386,7 @@
 wp<T>& wp<T>::operator = (U* other)
 {
     weakref_type* newRefs =
-        other ? other->createWeak(this) : 0;
+        other ? other->createWeak(this) : NULL;
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = other;
     m_refs = newRefs;
@@ -409,7 +409,7 @@
 wp<T>& wp<T>::operator = (const sp<U>& other)
 {
     weakref_type* newRefs =
-        other != NULL ? other->createWeak(this) : 0;
+        other != NULL ? other->createWeak(this) : NULL;
     U* otherPtr(other.m_ptr);
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = otherPtr;
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bd276bf..a91e373 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -1,7 +1,8 @@
 
 LOCAL_PATH:=$(call my-dir)
 
-rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable -fno-exceptions
+rs_base_CFLAGS := -Werror -Wall -Wno-unused-parameter -Wno-unused-variable \
+                  -fno-exceptions -std=c++11
 ifeq ($(TARGET_BUILD_PDK), true)
   rs_base_CFLAGS += -D__RS_PDK__
 endif
@@ -50,6 +51,9 @@
     rsCpuIntrinsics_advsimd_YuvToRGB.S
 #    rsCpuIntrinsics_advsimd_Blend.S \
 
+# Clang does not compile rsCpuIntrinsics_advsimd_3DLUT.S.
+LOCAL_CLANG_ASFLAGS_arm64 += -no-integrated-as
+
 ifeq ($(ARCH_ARM_HAVE_NEON),true)
     LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_NEON
 endif
@@ -65,12 +69,14 @@
     rsCpuIntrinsics_neon_YuvToRGB.S \
 
     LOCAL_ASFLAGS_arm := -mfpu=neon
+    # Clang does not compile rsCpuIntrinsics_neon_3DLUT.S.
+    LOCAL_CLANG_ASFLAGS_arm += -no-integrated-as
 endif
 
 ifeq ($(ARCH_X86_HAVE_SSSE3),true)
     LOCAL_CFLAGS += -DARCH_X86_HAVE_SSSE3
     LOCAL_SRC_FILES+= \
-    rsCpuIntrinsics_x86.c
+    rsCpuIntrinsics_x86.cpp
 endif
 
 LOCAL_SHARED_LIBRARIES += libRS libcutils libutils liblog libsync libc++
diff --git a/cpu_ref/linkloader/android/librsloader.cpp b/cpu_ref/linkloader/android/librsloader.cpp
index 7fbaa7c..fa74a7d 100644
--- a/cpu_ref/linkloader/android/librsloader.cpp
+++ b/cpu_ref/linkloader/android/librsloader.cpp
@@ -53,12 +53,12 @@
                                         void *find_symbol_context) {
   RSExecRef object = rsloaderLoadExecutable(buf, buf_size);
   if (!object) {
-    return NULL;
+    return nullptr;
   }
 
   if (!rsloaderRelocateExecutable(object, find_symbol, find_symbol_context)) {
     rsloaderDisposeExec(object);
-    return NULL;
+    return nullptr;
   }
 
   return object;
@@ -75,7 +75,7 @@
 #endif
   if (!object) {
     ALOGE("Unable to load the ELF object.");
-    return NULL;
+    return nullptr;
   }
 
   return wrap(object.release());
@@ -156,7 +156,7 @@
 #endif
 
   if (!symtab) {
-    return NULL;
+    return nullptr;
   }
 
 #if defined(__LP64__) || defined(__x86_64__)
@@ -167,7 +167,7 @@
 
   if (!symbol) {
     ALOGV("Symbol not found: %s\n", name);
-    return NULL;
+    return nullptr;
   }
 
   int machine = object->getHeader()->getMachine();
diff --git a/cpu_ref/linkloader/include/ELFHeader.h b/cpu_ref/linkloader/include/ELFHeader.h
index b8b9340..1251c78 100644
--- a/cpu_ref/linkloader/include/ELFHeader.h
+++ b/cpu_ref/linkloader/include/ELFHeader.h
@@ -150,19 +150,19 @@
   static ELFHeader *read(Archiver &AR) {
     if (!AR) {
       // Archiver is in bad state before calling read function.
-      // Return NULL and do nothing.
-      return 0;
+      // Return nullptr and do nothing.
+      return nullptr;
     }
 
     std::unique_ptr<ELFHeader> header(new ELFHeader());
     if (!header->serialize(AR)) {
-      // Unable to read the structure.  Return NULL.
-      return 0;
+      // Unable to read the structure.  Return nullptr.
+      return nullptr;
     }
 
     if (!header->isValid()) {
-      // Header read from archiver is not valid.  Return NULL.
-      return 0;
+      // Header read from archiver is not valid.  Return nullptr.
+      return nullptr;
     }
 
     return header.release();
diff --git a/cpu_ref/linkloader/include/ELFObject.h b/cpu_ref/linkloader/include/ELFObject.h
index a3c2596..d750d9e 100644
--- a/cpu_ref/linkloader/include/ELFObject.h
+++ b/cpu_ref/linkloader/include/ELFObject.h
@@ -55,7 +55,7 @@
   }
 
 private:
-  ELFObject() : SHNCommonDataPtr(NULL), missingSymbols(false) { }
+  ELFObject() : SHNCommonDataPtr(nullptr), missingSymbols(false) { }
 
 public:
   template <typename Archiver>
@@ -93,7 +93,7 @@
 
     // Ensure the free size is sufficient
     if (SHNCommonDataFreeSize < size) {
-      return NULL;
+      return nullptr;
     }
 
     // Allcoate
@@ -141,6 +141,11 @@
                     void *context,
                     ELFSectionRelTableTy *reltab,
                     ELFSectionProgBitsTy *text);
+
+  void relocateMIPS64(void *(*find_sym)(void *context, char const *name),
+                      void *context,
+                      ELFSectionRelTableTy *reltab,
+                      ELFSectionProgBitsTy *text);
 };
 
 #include "impl/ELFObject.hxx"
diff --git a/cpu_ref/linkloader/include/ELFReloc.h b/cpu_ref/linkloader/include/ELFReloc.h
index 84754b9..a6d7f5e 100644
--- a/cpu_ref/linkloader/include/ELFReloc.h
+++ b/cpu_ref/linkloader/include/ELFReloc.h
@@ -134,13 +134,27 @@
 
 public:
   xword_t getSymTabIndex() const {
+#if defined(__mips__)
+/*
+ * Packed r_info on MIPS is:
+ * r_sym (4) - r_ssym (1) - r_type3 (1) - r_type2 (1) - r_type (1)
+ * Each entry represents up to three actual relocations.
+ * Thus, the macros look different.
+ */
+#define ELF64_R_SYM(i)    ((i)&0xffffffffL)
+#else
 #define ELF64_R_SYM(i)    ((i)>>32)
+#endif
     return ELF64_R_SYM(this->r_info);
 #undef ELF64_R_SYM
   }
 
   xword_t getType() const {
+#if defined(__mips__)
+#define ELF64_R_TYPE(i)   ((i)>>32)
+#else
 #define ELF64_R_TYPE(i)   ((i)&0xffffffffL)
+#endif
     return ELF64_R_TYPE(this->r_info);
 #undef ELF64_R_TYPE
   }
diff --git a/cpu_ref/linkloader/include/ELFSectionBits.h b/cpu_ref/linkloader/include/ELFSectionBits.h
index b6e4590..33b4259 100644
--- a/cpu_ref/linkloader/include/ELFSectionBits.h
+++ b/cpu_ref/linkloader/include/ELFSectionBits.h
@@ -28,7 +28,7 @@
   MemChunk chunk;
 
 protected:
-  ELFSectionBits() : sh(NULL) { }
+  ELFSectionBits() : sh(nullptr) { }
 
 public:
   virtual void print() const;
diff --git a/cpu_ref/linkloader/include/ELFSectionProgBits.h b/cpu_ref/linkloader/include/ELFSectionProgBits.h
index a642b16..9c1d428 100644
--- a/cpu_ref/linkloader/include/ELFSectionProgBits.h
+++ b/cpu_ref/linkloader/include/ELFSectionProgBits.h
@@ -64,7 +64,7 @@
       break;
 
     default:
-      stubs = NULL;
+      stubs = nullptr;
     }
   }
 
diff --git a/cpu_ref/linkloader/include/GOT.h b/cpu_ref/linkloader/include/GOT.h
index b72bf66..0b689fe 100644
--- a/cpu_ref/linkloader/include/GOT.h
+++ b/cpu_ref/linkloader/include/GOT.h
@@ -20,12 +20,18 @@
 #include "utils/rsl_assert.h"
 #include "ELF.h"
 
-#define GP_OFFSET	((int)0x8000)
-#define GOT_SIZE	(1 << 16)	// bytes
-#define GOT_ENTRY_SIZE	4	// bytes
-#define NUM_OF_GOT_ENTRY	(GOT_SIZE/GOT_ENTRY_SIZE)
+#define GP_OFFSET    ((int)0x8000)
+#ifdef __LP64__
+#define GOT_SIZE    (1 << 17) // bytes
+#define GOT_ENTRY_SIZE    8   // bytes
+#else
+#define GOT_SIZE    (1 << 16) // bytes
+#define GOT_ENTRY_SIZE    4   // bytes
+#endif
+#define NUM_OF_GOT_ENTRY  (GOT_SIZE/GOT_ENTRY_SIZE)
 
 void *got_address();
 int search_got(int symbol_index, void *addr, uint8_t bind_type);
+void set_got_address(void *addr);
 
 #endif // GOT_H
diff --git a/cpu_ref/linkloader/include/impl/ELFObject.hxx b/cpu_ref/linkloader/include/impl/ELFObject.hxx
index 81736b5..78933e3 100644
--- a/cpu_ref/linkloader/include/impl/ELFObject.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFObject.hxx
@@ -380,7 +380,7 @@
     int16_t* inst16 = reinterpret_cast<int16_t*>(inst);
     Inst_t P = (Inst_t)(int64_t)inst;
     Inst_t A = 0;
-    Inst_t S = (Inst_t)(int64_t)sym->getAddress(EM_ARM);
+    Inst_t S = (Inst_t)(int64_t)sym->getAddress(EM_AARCH64);
     Inst_t Page_P = P & ~0xfff;         // Page address.
 
     if (S == 0 && sym->getType() == STT_NOTYPE) {
@@ -1022,6 +1022,167 @@
   }
 }
 
+template <unsigned Bitwidth>  // Walks reltab and patches text; each entry may carry up to three relocation types.
+inline void ELFObject<Bitwidth>::
+relocateMIPS64(void *(*find_sym)(void *context, char const *name),  // resolver called for symbols whose address is 0
+               void *context,  // passed unchanged as the first argument of find_sym
+               ELFSectionRelTableTy *reltab,  // relocation entries to process, in table order
+               ELFSectionProgBitsTy *text) {  // section whose contents are patched in place
+  ELFSectionSymTabTy *symtab =
+    static_cast<ELFSectionSymTabTy *>(getSectionByName(".symtab"));
+  rsl_assert(symtab && "Symtab is required.");
+
+  int64_t calculatedValue = 0;  // result of the most recent relocation calculation
+  bool applyRelocation = true;  // whether the current result gets written to *inst
+  bool useCalculatedValue;  // whether the previous result replaces S/A for the current step
+
+  for (size_t i = 0; i < reltab->size(); ++i) {
+    ELFRelocTy *rel = (*reltab)[i];
+    ELFSymbolTy *sym = (*symtab)[rel->getSymTabIndex()];
+
+    typedef int64_t Inst_t;
+    Inst_t *inst = (Inst_t *)&(*text)[rel->getOffset()];  // location inside text that this entry patches
+    Inst_t P = (Inst_t)(uintptr_t)inst;  // address of the patched location
+    Inst_t A = (Inst_t)rel->getAddend();  // addend stored in the relocation entry
+    Inst_t S = (Inst_t)(uintptr_t)sym->getAddress(EM_MIPS);  // symbol address, 0 if not known yet
+
+    if (S == 0) {
+      S = (Inst_t)(uintptr_t)find_sym(context, sym->getName());  // fall back to the caller-supplied resolver
+      if (!S) {
+        missingSymbols = true;  // recorded on the object; processing continues with S == 0
+      }
+      sym->setAddress((void *)S);  // stored even when the lookup returned null
+    }
+
+    uint8_t rtype[3];  // the three type bytes packed into getType(), processed in index order
+    rtype[0] = (rel->getType() >> 24) & 0xFF;  // bits 31..24
+    rtype[1] = (rel->getType() >> 16) & 0xFF;  // bits 23..16
+    rtype[2] = (rel->getType() >> 8) & 0xFF;  // bits 15..8
+
+    for (size_t j = 0; j < 3; ++j) {
+      useCalculatedValue = !applyRelocation;  // previous step was not written, so chain from its result
+      if (j < 2) {
+        applyRelocation = (rtype[j+1] == R_MIPS_NONE);  // write now only if no further type follows in this entry
+      } else if ((i + 1) < reltab->size()) {  // NOTE(review): on the last entry applyRelocation keeps its j == 1 value - confirm intended
+        // Enter here if there are more relocations left in the table
+        // and check if the next one affects the same instruction.
+        ELFRelocTy *next_rel = (*reltab)[i + 1];
+        Inst_t *next_inst = (Inst_t *)&(*text)[next_rel->getOffset()];
+        applyRelocation = (inst != next_inst);  // defer the write when the next entry targets the same location
+      }
+
+      if (useCalculatedValue) {
+        S = 0;
+        A = calculatedValue;  // continue from the previous result instead of symbol + addend
+      }
+
+      switch (rtype[j]) {
+      default:
+        rsl_assert(0 && "Not implemented relocation type.");
+        break;
+
+      case R_MIPS_NONE:  // nothing to compute for this slot
+        break;
+
+      case R_MIPS_64:
+        calculatedValue = S + A;
+        if (applyRelocation) {
+          *inst = calculatedValue;  // overwrites the whole 64-bit word
+        }
+        break;
+
+      case R_MIPS_26:
+        if (sym->getBindingAttribute() == STB_LOCAL) {
+          // Local binding.
+          A |= ((P + 4) & 0xF0000000);  // merge in the top bits of the address following P
+          A += S;
+          calculatedValue = (A >> 2);
+          if (applyRelocation) {
+            *inst |= (calculatedValue & 0x3FFFFFF);  // only the low 26 bits are ORed in
+          }
+        } else {
+          // External binding.
+          A += S;
+          calculatedValue = (A >> 2);
+          if (applyRelocation) {
+            *inst |= (calculatedValue & 0x3FFFFFF);  // only the low 26 bits are ORed in
+          }
+        }
+        break;
+
+      case R_MIPS_CALL16:
+      case R_MIPS_GOT_PAGE:
+      case R_MIPS_GOT_DISP: {
+        A = A & 0xFFFF;
+        int got_index = search_got((int)rel->getSymTabIndex(),
+                                   (void *)(S + A),
+                                   sym->getBindingAttribute());
+        calculatedValue = (got_index << 3) - 0x7FF0;  // index * 8 minus 0x7FF0; presumably a gp-relative offset of an 8-byte GOT slot - confirm
+        if (applyRelocation) {
+          *inst |= (calculatedValue & 0xFFFF);
+        }
+        break;
+      }
+
+      case R_MIPS_GPREL32:
+        calculatedValue = A + S - ((int64_t)got_address() + 0x7FF0);  // offset from got_address() + 0x7FF0
+        if (applyRelocation) {
+          *inst |= calculatedValue;  // ORed in without masking
+        }
+        break;
+
+      case R_MIPS_GOT_OFST:
+        calculatedValue = (S + A) & 0xFFFF;  // low 16 bits of S + A
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_GPREL16:
+        calculatedValue = A + S - ((int64_t)got_address() + 0x7FF0);  // same base as GPREL32, masked to 16 bits on write
+        if (applyRelocation) {
+          *inst |= (calculatedValue & 0xFFFF);
+        }
+        break;
+
+      case R_MIPS_SUB:
+        calculatedValue = S - A;
+        if (applyRelocation) {
+          *inst = calculatedValue;  // overwrites the whole 64-bit word
+        }
+        break;
+
+      case R_MIPS_HI16:
+        calculatedValue = ((S + A + 0x8000) >> 16) & 0xFFFF;  // bits 31..16 after adding 0x8000
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_LO16:
+        calculatedValue = (S + A) & 0xFFFF;  // low 16 bits of S + A
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_HIGHER:
+        calculatedValue = ((S + A + 0x80008000) >> 32) & 0xFFFF;  // bits 47..32 after adding 0x80008000
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+
+      case R_MIPS_HIGHEST:
+        calculatedValue = ((S + A + 0x800080008000) >> 48) & 0xFFFF;  // bits 63..48 after adding 0x800080008000
+        if (applyRelocation) {
+          *inst |= calculatedValue;
+        }
+        break;
+      }
+    }
+  }
+}
 
 // TODO: Refactor all relocations.
 template <unsigned Bitwidth>
@@ -1121,7 +1282,11 @@
         relocateX86_64(find_sym, context, reltab, need_rel);
         break;
       case EM_MIPS:
-        relocateMIPS(find_sym, context, reltab, need_rel);
+        if (getHeader()->getClass() == ELFCLASS64) {
+          relocateMIPS64(find_sym, context, reltab, need_rel);
+        } else {
+          relocateMIPS(find_sym, context, reltab, need_rel);
+        }
         break;
 
       default:
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx b/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
index 93919c8..e18c9c3 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
@@ -18,6 +18,7 @@
 #define ELF_SECTION_PROGBITS_HXX
 
 #include "ELFTypes.h"
+#include "GOT.h"
 #include "StubLayout.h"
 
 #include <llvm/Support/Format.h>
@@ -65,6 +66,13 @@
     alloc_size += stub_table_size;
   }
 
+#if defined(__mips__)
+  // Add GOT section after the text section.
+  if (strcmp(sh->getName(), ".text") == 0) {
+    alloc_size += GOT_SIZE;
+  }
+#endif
+
   // Allocate text section
   if (!result->chunk.allocate(alloc_size)) {
     return NULL;
@@ -75,6 +83,12 @@
                          max_num_stubs);
   }
 
+#if defined(__mips__)
+  if (strcmp(sh->getName(), ".text") == 0) {
+    set_got_address(result->chunk.getBuffer() + alloc_size - GOT_SIZE);
+  }
+#endif
+
   result->sh = sh;
 
   if (!result->serialize(AR)) {
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx b/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
index 42c7a7a..43c4980 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
@@ -90,11 +90,10 @@
 template <unsigned Bitwidth>
 size_t ELFSectionRelTable<Bitwidth>::
 getMaxNumStubs(ELFObjectTy const *obj) const {
+  std::set<uint32_t> sym_index_set;
   switch (obj->getHeader()->getMachine()) {
   case EM_ARM:
     {
-      std::set<uint32_t> sym_index_set;
-
       for (size_t i = 0; i < size(); ++i) {
         ELFRelocTy *rel = table[i];
 
@@ -113,8 +112,6 @@
 
   case EM_AARCH64:
     {
-      std::set<uint32_t> sym_index_set;
-
       for (size_t i = 0; i < size(); ++i) {
         ELFRelocTy *rel = table[i];
 
@@ -131,8 +128,6 @@
 
   case EM_MIPS:
     {
-      std::set<uint32_t> sym_index_set;
-
       for (size_t i = 0; i < size(); ++i) {
         ELFRelocTy *rel = table[i];
 
@@ -145,8 +140,34 @@
     }
 
   case EM_386:
+    {
+      for (size_t i = 0; i < size(); ++i) {
+        ELFRelocTy *rel = table[i];
+
+        switch (rel->getType()) {
+        case R_386_PC32:
+          sym_index_set.insert(rel->getSymTabIndex());
+          break;
+        }
+      }
+
+      return sym_index_set.size();
+    }
+
   case EM_X86_64:
-    return 0;
+    {
+      for (size_t i = 0; i < size(); ++i) {
+        ELFRelocTy *rel = table[i];
+
+        switch (rel->getType()) {
+        case R_X86_64_PC32:
+          sym_index_set.insert(rel->getSymTabIndex());
+          break;
+        }
+      }
+
+      return sym_index_set.size();
+    }
 
   default:
     rsl_assert(0 && "Only support ARM, MIPS, X86, and X86_64 relocation.");
diff --git a/cpu_ref/linkloader/lib/GOT.cpp b/cpu_ref/linkloader/lib/GOT.cpp
index 3f523c5..3372cfa 100644
--- a/cpu_ref/linkloader/lib/GOT.cpp
+++ b/cpu_ref/linkloader/lib/GOT.cpp
@@ -17,15 +17,21 @@
 #include <stdio.h>
 #include "GOT.h"
 
-void *got_symbol_addresses[NUM_OF_GOT_ENTRY];
-int got_symbol_indexes[NUM_OF_GOT_ENTRY];
-size_t got_symbol_count = 0;
+static void **got_symbol_addresses = NULL;
+static int got_symbol_indexes[NUM_OF_GOT_ENTRY];
+static size_t got_symbol_count = 0;
 
 void *got_address()
 {
   return &got_symbol_addresses[0];
 }
 
+void set_got_address(void *addr)  // rebinds the GOT to the storage at addr
+{
+  got_symbol_count = 0;  // the newly bound table starts with no entries recorded
+  got_symbol_addresses = static_cast<void **>(addr);
+}
+
 int search_got(int symbol_index, void *addr, uint8_t bind_type)
 {
   size_t i;
@@ -33,7 +39,12 @@
   // For local symbols (R_MIPS_GOT16), we only store the high 16-bit value
   // after adding 0x8000.
   if (bind_type == STB_LOCAL)
+#ifdef __LP64__
+    addr = (void *)(((intptr_t)addr + 0x8000) & 0xFFFFFFFFFFFF0000);
+#else
     addr = (void *)(((intptr_t)addr + 0x8000) & 0xFFFF0000);
+#endif
+
 
   for (i = 0; i < got_symbol_count; i++) {
     if (got_symbol_indexes[i] == symbol_index) {
diff --git a/cpu_ref/linkloader/lib/MemChunk.cpp b/cpu_ref/linkloader/lib/MemChunk.cpp
index 5d6c102..8a9b548 100644
--- a/cpu_ref/linkloader/lib/MemChunk.cpp
+++ b/cpu_ref/linkloader/lib/MemChunk.cpp
@@ -41,10 +41,10 @@
 static uintptr_t StartAddr = 0x7e000000UL;
 #endif
 
-AllocFunc MemChunk::VendorAlloc = NULL;
-FreeFunc MemChunk::VendorFree = NULL;
+AllocFunc MemChunk::VendorAlloc = nullptr;
+FreeFunc MemChunk::VendorFree = nullptr;
 
-MemChunk::MemChunk() : buf(NULL), buf_size(0), bVendorBuf(true) {
+MemChunk::MemChunk() : buf(nullptr), buf_size(0), bVendorBuf(true) {
 }
 
 MemChunk::~MemChunk() {
diff --git a/cpu_ref/linkloader/lib/StubLayout.cpp b/cpu_ref/linkloader/lib/StubLayout.cpp
index dd4b139..1ff9e16 100644
--- a/cpu_ref/linkloader/lib/StubLayout.cpp
+++ b/cpu_ref/linkloader/lib/StubLayout.cpp
@@ -23,7 +23,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-StubLayout::StubLayout() : table(NULL), count(0) {
+StubLayout::StubLayout() : table(nullptr), count(0) {
 }
 
 void StubLayout::initStubTable(unsigned char *table_, size_t count_) {
@@ -42,7 +42,7 @@
   // We have to create a new stub
   if (count == 0) {
     // No free stub slot is available
-    return NULL;
+    return nullptr;
   }
 
   // Initialize the stub
diff --git a/cpu_ref/linkloader/main.cpp b/cpu_ref/linkloader/main.cpp
index 072595f..011e1d7 100644
--- a/cpu_ref/linkloader/main.cpp
+++ b/cpu_ref/linkloader/main.cpp
@@ -54,7 +54,7 @@
 
   // Open the file
   int fd = -1;
-  unsigned char const *image = NULL;
+  unsigned char const *image = nullptr;
   size_t image_size = 0;
 
   if (!open_mmap_file(filename, fd, image, image_size)) {
@@ -135,7 +135,7 @@
   }
 
   assert(0 && "Can't find symbol.");
-  return 0;
+  return nullptr;
 }
 
 template <unsigned Bitwidth, typename Archiver>
diff --git a/cpu_ref/linkloader/utils/serialize.h b/cpu_ref/linkloader/utils/serialize.h
index 3d15158..49ba400 100644
--- a/cpu_ref/linkloader/utils/serialize.h
+++ b/cpu_ref/linkloader/utils/serialize.h
@@ -66,21 +66,21 @@
   bool good;
 
 public:
-  ArchiveReader(unsigned char const *buf = NULL, size_t size = 0)
+  ArchiveReader(unsigned char const *buf = nullptr, size_t size = 0)
   : buf_begin(buf), buf_end(buf + size),
-    cursor(buf), cursor_base(NULL), good(buf != NULL) {
+    cursor(buf), cursor_base(nullptr), good(buf != nullptr) {
   }
 
   void prologue(size_t size) {
-    rsl_assert(cursor_base == NULL);
+    rsl_assert(cursor_base == nullptr);
     cursor_base = cursor;
   }
 
   void epilogue(size_t size) {
-    rsl_assert(cursor_base != NULL);
+    rsl_assert(cursor_base != nullptr);
     rsl_assert(cursor_base + size >= cursor);
     cursor = cursor_base + size;
-    cursor_base = NULL;
+    cursor_base = nullptr;
   }
 
   void seek(off_t off, bool from_begin = false) {
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index f09e334..31cf6f8 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -48,9 +48,8 @@
 using namespace android::renderscript;
 
 typedef void (*outer_foreach_t)(
-    const android::renderscript::RsForEachStubParamStruct *,
-    uint32_t x1, uint32_t x2,
-    uint32_t instep, uint32_t outstep);
+    const android::renderscript::RsExpandKernelParams *,
+    uint32_t x1, uint32_t x2, uint32_t outstep);
 
 
 static pthread_key_t gThreadTLSKey = 0;
@@ -73,11 +72,11 @@
 
     RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
     if (!cpu) {
-        return NULL;
+        return nullptr;
     }
     if (!cpu->init(version_major, version_minor, lfn, slfn)) {
         delete cpu;
-        return NULL;
+        return nullptr;
     }
 
 #ifndef RS_COMPATIBILITY_LIB
@@ -117,9 +116,9 @@
     memset(&mTlsStruct, 0, sizeof(mTlsStruct));
     mExit = false;
 #ifndef RS_COMPATIBILITY_LIB
-    mLinkRuntimeCallback = NULL;
-    mSelectRTCallback = NULL;
-    mSetupCompilerCallback = NULL;
+    mLinkRuntimeCallback = nullptr;
+    mSelectRTCallback = nullptr;
+    mSetupCompilerCallback = nullptr;
 #endif
 }
 
@@ -161,7 +160,7 @@
     }
 
     //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
-    return NULL;
+    return nullptr;
 }
 
 void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
@@ -247,7 +246,7 @@
 
     lockMutex();
     if (!gThreadTLSKeyCount) {
-        int status = pthread_key_create(&gThreadTLSKey, NULL);
+        int status = pthread_key_create(&gThreadTLSKey, nullptr);
         if (status) {
             ALOGE("Failed to init thread tls key.");
             unlockMutex();
@@ -258,7 +257,7 @@
     unlockMutex();
 
     mTlsStruct.mContext = mRSC;
-    mTlsStruct.mScript = NULL;
+    mTlsStruct.mScript = nullptr;
     int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
     if (status) {
         ALOGE("pthread_setspecific %i", status);
@@ -283,7 +282,7 @@
     mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
     mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
     mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
-    mWorkers.mLaunchCallback = NULL;
+    mWorkers.mLaunchCallback = nullptr;
 
     mWorkers.mCompleteSignal.init();
 
@@ -323,8 +322,8 @@
 
 RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
     mExit = true;
-    mWorkers.mLaunchData = NULL;
-    mWorkers.mLaunchCallback = NULL;
+    mWorkers.mLaunchData = nullptr;
+    mWorkers.mLaunchCallback = nullptr;
     mWorkers.mRunningCount = mWorkers.mCount;
     __sync_synchronize();
     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
@@ -350,153 +349,132 @@
 }
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void (*walk_loop_t)(MTLaunchStruct*,
+                            RsExpandKernelParams&,
+                            outer_foreach_t);
 
-static void wc_xy(void *usr, uint32_t idx) {
+
+static void walk_wrapper(void* usr, uint32_t idx, walk_loop_t walk_loop) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
 
-    outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
-        uint32_t yEnd = yStart + mtls->mSliceSize;
-        yEnd = rsMin(yEnd, mtls->yEnd);
-        if (yEnd <= yStart) {
-            return;
-        }
+    uint32_t inLen = mtls->fep.inLen;
 
-        //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
+    RsExpandKernelParams kparams;
+    kparams.takeFields(mtls->fep);
 
-        for (p.y = yStart; p.y < yEnd; p.y++) {
-            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
-                    (mtls->fep.eStrideOut * mtls->xStart);
-            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
-                   (mtls->fep.eStrideIn * mtls->xStart);
-            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+    // Used by CpuScriptGroup, IntrinsicBlur, and IntrinsicHistogram
+    kparams.lid = idx;
+
+    if (inLen > 0) {
+        // Allocate space for our input base pointers.
+        kparams.ins = (const void**)alloca(inLen * sizeof(void*));
+
+        // Allocate space for our input stride information.
+        kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
+
+        // Fill our stride information.
+        for (int inIndex = inLen; --inIndex >= 0;) {
+          kparams.inEStrides[inIndex] = mtls->fep.inStrides[inIndex].eStride;
         }
     }
-}
-
-static void wc_x(void *usr, uint32_t idx) {
-    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
-    RsForEachStubParamStruct p;
-    memcpy(&p, &mtls->fep, sizeof(p));
-    p.lid = idx;
-    uint32_t sig = mtls->sig;
 
     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-    while (1) {
-        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
-        uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
-        uint32_t xEnd = xStart + mtls->mSliceSize;
-        xEnd = rsMin(xEnd, mtls->xEnd);
-        if (xEnd <= xStart) {
-            return;
-        }
 
-        //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
-        //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
-
-        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
-        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
-    }
+    walk_loop(mtls, kparams, fn);
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
-                                     const RsScriptCall *sc, MTLaunchStruct *mtls) {
+static void walk_2d(void *usr, uint32_t idx) {
+    walk_wrapper(usr, idx, [](MTLaunchStruct *mtls,
+                              RsExpandKernelParams &kparams,
+                              outer_foreach_t fn) {
 
-    //android::StopWatch kernel_time("kernel time");
+        while (1) {
+            uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
+            uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+            uint32_t yEnd   = yStart + mtls->mSliceSize;
 
-    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
-        const size_t targetByteChunk = 16 * 1024;
-        mInForEach = true;
-        if (mtls->fep.dimY > 1) {
-            uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            yEnd = rsMin(yEnd, mtls->yEnd);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
-
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
+            if (yEnd <= yStart) {
+                return;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
-        } else {
-            uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
-            uint32_t s2 = 0;
+            for (kparams.y = yStart; kparams.y < yEnd; kparams.y++) {
+                kparams.out = mtls->fep.outPtr +
+                              (mtls->fep.outStride.yStride * kparams.y) +
+                              (mtls->fep.outStride.eStride * mtls->xStart);
 
-            // This chooses our slice size to rate limit atomic ops to
-            // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
-            } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
-            }
-            mtls->mSliceSize = rsMin(s1, s2);
+                for (int inIndex = mtls->fep.inLen; --inIndex >= 0;) {
+                    StridePair &strides = mtls->fep.inStrides[inIndex];
 
-            if(mtls->mSliceSize < 1) {
-                mtls->mSliceSize = 1;
-            }
-
-            launchThreads(wc_x, mtls);
-        }
-        mInForEach = false;
-
-        //ALOGE("launch 1");
-    } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
-
-        //ALOGE("launch 3");
-        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
-                                      mtls->fep.dimY * p.z + p.y;
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
-                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
-                           (mtls->fep.eStrideIn * mtls->xStart);
-                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+                    kparams.ins[inIndex] =
+                      mtls->fep.inPtrs[inIndex] +
+                      (strides.yStride * kparams.y) +
+                      (strides.eStride * mtls->xStart);
                 }
+
+                fn(&kparams, mtls->xStart, mtls->xEnd,
+                   mtls->fep.outStride.eStride);
             }
         }
-    }
+    });
 }
 
-void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
-                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+static void walk_1d(void *usr, uint32_t idx) {
+    // Worker body for 1-D launches: claim x-slices until the range is used up.
+    walk_wrapper(usr, idx, [](MTLaunchStruct *launch,
+                              RsExpandKernelParams &params,
+                              outer_foreach_t kernel) {
+        for (;;) {
+            // Atomically take the next slice number from the launch struct.
+            const uint32_t sliceNo =
+                (uint32_t)__sync_fetch_and_add(&launch->mSliceNum, 1);
+            const uint32_t xBegin = launch->xStart + sliceNo * launch->mSliceSize;
+            uint32_t xLimit = xBegin + launch->mSliceSize;
+            xLimit = rsMin(xLimit, launch->xEnd);
+            if (xLimit <= xBegin) {
+                return;  // this slice is empty, so there is nothing left to do
+            }
+
+            // Point the output and every input at the first cell of the slice.
+            params.out = launch->fep.outPtr +
+                         (launch->fep.outStride.eStride * xBegin);
+
+            const int inputCount = launch->fep.inLen;
+            for (int i = 0; i < inputCount; ++i) {
+                params.ins[i] = launch->fep.inPtrs[i] +
+                                (launch->fep.inStrides[i].eStride * xBegin);
+            }
+
+            kernel(&params, xBegin, xLimit, launch->fep.outStride.eStride);
+        }
+    });
+}
+
+
+void RsdCpuReferenceImpl::launchThreads(const Allocation ** ains,
+                                        uint32_t inLen,
+                                        Allocation* aout,
+                                        const RsScriptCall* sc,
+                                        MTLaunchStruct* mtls) {
 
     //android::StopWatch kernel_time("kernel time");
 
     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
         const size_t targetByteChunk = 16 * 1024;
         mInForEach = true;
+
         if (mtls->fep.dimY > 1) {
             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.yStrideOut) {
-                s2 = targetByteChunk / mtls->fep.yStrideOut;
+            if (mtls->fep.outStride.yStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.yStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.yStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].yStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -504,18 +482,18 @@
                 mtls->mSliceSize = 1;
             }
 
-         //   mtls->mSliceSize = 2;
-            launchThreads(wc_xy, mtls);
+            launchThreads(walk_2d, mtls);
         } else {
             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
             uint32_t s2 = 0;
 
             // This chooses our slice size to rate limit atomic ops to
             // one per 16k bytes of reads/writes.
-            if (mtls->fep.eStrideOut) {
-                s2 = targetByteChunk / mtls->fep.eStrideOut;
+            if (mtls->fep.outStride.eStride) {
+                s2 = targetByteChunk / mtls->fep.outStride.eStride;
             } else {
-                s2 = targetByteChunk / mtls->fep.eStrideIn;
+                // We know that there is either an output or an input.
+                s2 = targetByteChunk / mtls->fep.inStrides[0].eStride;
             }
             mtls->mSliceSize = rsMin(s1, s2);
 
@@ -523,62 +501,61 @@
                 mtls->mSliceSize = 1;
             }
 
-            launchThreads(wc_x, mtls);
+            launchThreads(walk_1d, mtls);
         }
         mInForEach = false;
 
-        //ALOGE("launch 1");
     } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls->fep, sizeof(p));
-        uint32_t sig = mtls->sig;
+        RsExpandKernelParams kparams;
+        kparams.takeFields(mtls->fep);
 
-        // Allocate space for our input base pointers.
-        p.ins = new const void*[inLen];
+        if (inLen > 0) {
+            // Allocate space for our input base pointers.
+            kparams.ins = (const void**)alloca(inLen * sizeof(void*));
 
-        // Allocate space for our input stride information.
-        p.eStrideIns = new uint32_t[inLen];
+            // Allocate space for our input stride information.
+            kparams.inEStrides = (uint32_t*)alloca(inLen * sizeof(uint32_t));
 
-        // Fill our stride information.
-        for (int index = inLen; --index >= 0;) {
-          p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+            // Fill our stride information.
+            for (int inIndex = inLen; --inIndex >= 0;) {
+                kparams.inEStrides[inIndex] =
+                    mtls->fep.inStrides[inIndex].eStride;
+            }
         }
 
         //ALOGE("launch 3");
         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
-        uint32_t offset_invariant = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0];
+        for (uint32_t arrayIndex = mtls->arrayStart;
+             arrayIndex < mtls->arrayEnd; arrayIndex++) {
 
-        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
-            uint32_t offset_part = offset_invariant * p.ar[0];
+            for (kparams.z = mtls->zStart; kparams.z < mtls->zEnd;
+                 kparams.z++) {
 
-            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
-                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
-                    uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
+                for (kparams.y = mtls->yStart; kparams.y < mtls->yEnd;
+                     kparams.y++) {
 
-                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
-                            (mtls->fep.eStrideOut * mtls->xStart);
+                    uint32_t offset =
+                      mtls->fep.dimY * mtls->fep.dimZ * arrayIndex +
+                      mtls->fep.dimY * kparams.z + kparams.y;
 
-                    for (int index = inLen; --index >= 0;) {
-                        StridePair &strides = mtls->fep.inStrides[index];
+                    kparams.out = mtls->fep.outPtr +
+                                  (mtls->fep.outStride.yStride * offset) +
+                                  (mtls->fep.outStride.eStride * mtls->xStart);
 
-                        p.ins[index] = mtls->fep.ptrIns[index] +
-                                       (strides.yStride * offset) +
-                                       (strides.eStride * mtls->xStart);
+                    for (int inIndex = inLen; --inIndex >= 0;) {
+                        StridePair &strides = mtls->fep.inStrides[inIndex];
+
+                        kparams.ins[inIndex] =
+                          mtls->fep.inPtrs[inIndex] +
+                          (strides.yStride * offset) +
+                          (strides.eStride * mtls->xStart);
                     }
 
-                    /*
-                     * The fourth argument is zero here because multi-input
-                     * kernels get their stride information from a member of p
-                     * that points to an array.
-                     */
-                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                    fn(&kparams, mtls->xStart, mtls->xEnd,
+                       mtls->fep.outStride.eStride);
                 }
             }
         }
-
-        // Free our arrays.
-        delete[] p.ins;
-        delete[] p.eStrideIns;
     }
 }
 
@@ -592,7 +569,7 @@
     if (sc) {
         tls->mScript = sc->getScript();
     } else {
-        tls->mScript = NULL;
+        tls->mScript = nullptr;
     }
     return old;
 }
@@ -614,7 +591,7 @@
 #endif
         )) {
         delete i;
-        return NULL;
+        return nullptr;
     }
     return i;
 }
@@ -643,7 +620,7 @@
 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                     RsScriptIntrinsicID iid, Element *e) {
 
-    RsdCpuScriptImpl *i = NULL;
+    RsdCpuScriptImpl *i = nullptr;
     switch (iid) {
     case RS_SCRIPT_INTRINSIC_ID_3DLUT:
         i = rsdIntrinsic_3DLUT(this, s, e);
@@ -687,7 +664,7 @@
     CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
     if (!sgi->init()) {
         delete sgi;
-        return NULL;
+        return nullptr;
     }
     return sgi;
 }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index c54dca2..bfd5e51 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -25,6 +25,8 @@
 
 #include <string>
 
+#define RS_KERNEL_INPUT_THRESHOLD 32
+
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -34,44 +36,80 @@
 namespace android {
 namespace renderscript {
 
-typedef struct {
+struct StridePair {
   uint32_t eStride;
   uint32_t yStride;
-} StridePair;
+};
 
-typedef struct {
-    const void *in;
-    void *out;
-    const void *usr;
-    uint32_t usrLen;
-    uint32_t x;
-    uint32_t y;
-    uint32_t z;
-    uint32_t lod;
-    RsAllocationCubemapFace face;
-    uint32_t ar[16];
+struct RsExpandKernelDriverInfo {
+    const uint8_t **inPtrs;
+    uint32_t inLen;
 
-    const void **ins;
-    uint32_t *eStrideIns;
+    uint8_t *outPtr;
 
-    uint32_t lid;
+    StridePair *inStrides;
+    StridePair  outStride;
 
     uint32_t dimX;
     uint32_t dimY;
     uint32_t dimZ;
-    uint32_t dimArray;
 
-    const uint8_t *ptrIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideIn;
-    uint32_t eStrideOut;
-    uint32_t yStrideIn;
-    uint32_t yStrideOut;
     uint32_t slot;
 
-    const uint8_t** ptrIns;
-    StridePair* inStrides;
-} RsForEachStubParamStruct;
+    const void *usr;
+    uint32_t usrLen;
+
+    bool heapAllocatedArrays;
+
+    // Start with no arrays attached so the destructor never sees garbage.
+    RsExpandKernelDriverInfo()
+        : inPtrs(nullptr), inLen(0), outPtr(nullptr), inStrides(nullptr),
+          heapAllocatedArrays(false) {}
+
+    // Frees inPtrs/inStrides only when they were flagged as heap-allocated.
+    ~RsExpandKernelDriverInfo() {
+        if (heapAllocatedArrays) {
+            // delete[] on a null pointer is a no-op, so no null checks here.
+            delete[] inPtrs;
+            delete[] inStrides;
+        }
+    }
+};
+
+struct RsExpandKernelParams {
+
+    // Per-call state consumed by kernels; not filled in by takeFields().
+    const void **ins;
+    uint32_t *inEStrides;
+    void *out;
+    uint32_t y;
+    uint32_t z;
+    uint32_t lid;
+
+    // Consumed by ScriptGroup and user kernels.
+    const void *usr;
+
+    // Consumed by intrinsics.
+    uint32_t dimX;
+    uint32_t dimY;
+    uint32_t dimZ;
+
+    /*
+     * FIXME: Only the blend intrinsic is said to need this.  Ideally that
+     *        intrinsic would be changed so the field can be dropped.
+     */
+    uint32_t slot;
+
+    /// Populate usr, slot and dimX/dimY/dimZ from the driver-side launch info.
+    void takeFields(const RsExpandKernelDriverInfo &dstruct) {
+        dimX = dstruct.dimX;
+        dimY = dstruct.dimY;
+        dimZ = dstruct.dimZ;
+
+        slot = dstruct.slot;
+        usr  = dstruct.usr;
+    }
+};
 
 extern bool gArchUseSIMD;
 
@@ -82,21 +120,21 @@
 class RsdCpuScriptImpl;
 class RsdCpuReferenceImpl;
 
-typedef struct ScriptTLSStructRec {
+struct ScriptTLSStruct {
     android::renderscript::Context * mContext;
     const android::renderscript::Script * mScript;
     RsdCpuScriptImpl *mImpl;
-} ScriptTLSStruct;
+};
 
-typedef struct {
-    RsForEachStubParamStruct fep;
+struct MTLaunchStruct {
+    RsExpandKernelDriverInfo fep;
 
     RsdCpuReferenceImpl *rsc;
     RsdCpuScriptImpl *script;
 
     ForEachFunc_t kernel;
     uint32_t sig;
-    const Allocation * ain;
+    const Allocation ** ains;
     Allocation * aout;
 
     uint32_t mSliceSize;
@@ -112,12 +150,9 @@
     uint32_t arrayStart;
     uint32_t arrayEnd;
 
-    // Multi-input data.
-    const Allocation ** ains;
-} MTLaunchStruct;
-
-
-
+    const uint8_t *inPtrsBuff[RS_KERNEL_INPUT_THRESHOLD];
+    StridePair     inStridesBuff[RS_KERNEL_INPUT_THRESHOLD];
+};
 
 class RsdCpuReferenceImpl : public RsdCpuReference {
 public:
@@ -138,9 +173,6 @@
         return mWorkers.mCount + 1;
     }
 
-    void launchThreads(const Allocation * ain, Allocation * aout,
-                       const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
                        const RsScriptCall* sc, MTLaunchStruct* mtls);
 
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 5a7fffd..8437c99 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -73,54 +73,29 @@
 }
 
 
-void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::preLaunch(uint32_t slot, const Allocation ** ains,
+                                      uint32_t inLen, Allocation * aout,
+                                      const void * usr, uint32_t usrLen,
+                                      const RsScriptCall *sc) {
 }
 
-void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsic::postLaunch(uint32_t slot, const Allocation ** ains,
+                                       uint32_t inLen, Allocation * aout,
+                                       const void * usr, uint32_t usrLen,
+                                       const RsScriptCall *sc) {
 }
 
 void RsdCpuScriptIntrinsic::invokeForEach(uint32_t slot,
-                                          const Allocation * ain,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
                                           Allocation * aout,
                                           const void * usr,
                                           uint32_t usrLen,
                                           const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    preLaunch(slot, ain, aout, usr, usrLen, sc);
 
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    mtls.script = this;
-    mtls.fep.slot = slot;
-
-    mtls.kernel = (void (*)())mRootPtr;
-    mtls.fep.usr = this;
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-
-    postLaunch(slot, ain, aout, usr, usrLen, sc);
-}
-
-void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
-                                               const Allocation ** ains,
-                                               uint32_t inLen,
-                                               Allocation * aout,
-                                               const void * usr,
-                                               uint32_t usrLen,
-                                               const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
-    /*
-     * FIXME: Possibly create new preLaunch and postLaunch functions that take
-     *        all of the input allocation pointers.
-     */
-    preLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    preLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     mtls.script = this;
@@ -133,7 +108,7 @@
     mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
     mCtx->setTLS(oldTLS);
 
-    postLaunch(slot, ains[0], aout, usr, usrLen, sc);
+    postLaunch(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index bf6a8ac..95aaa14 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -28,43 +28,42 @@
 public:
     virtual void populateScript(Script *) = 0;
 
-    virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
+    virtual void invokeFunction(uint32_t slot, const void * params,
+                                size_t paramLength);
     virtual int invokeRoot();
+
     virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+                               const Allocation ** ain,
+                               uint32_t inLen,
+                               Allocation * aout,
+                               const void * usr,
+                               uint32_t usrLen,
+                               const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                       const Allocation ** ain,
-                       uint32_t inLen,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
-
-    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
+    virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct * mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
-                           uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall * sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall * sc);
 
-    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
-    virtual void setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
-                                  const Element *e, const uint32_t *dims, size_t dimLength);
+    virtual void setGlobalVar(uint32_t slot, const void * data,
+                              size_t dataLength);
+    virtual void setGlobalVarWithElemDims(uint32_t slot, const void * data,
+                                          size_t dataLength, const Element * e,
+                                          const uint32_t * dims,
+                                          size_t dimLength);
     virtual void setGlobalBind(uint32_t slot, Allocation *data);
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
     virtual ~RsdCpuScriptIntrinsic();
-    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl *ctx, const Script *s, const Element *,
-                          RsScriptIntrinsicID iid);
+    RsdCpuScriptIntrinsic(RsdCpuReferenceImpl * ctx, const Script * s,
+                          const Element * e, RsScriptIntrinsicID iid);
 
 protected:
     RsScriptIntrinsicID mID;
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index 7eb0c01..86d0478 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -38,9 +38,9 @@
 protected:
     ObjectBaseRef<Allocation> mLUT;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -58,13 +58,13 @@
                                       int dimx, int dimy, int dimz);
 
 
-void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
-                                      uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsic3DLUT::kernel(const RsExpandKernelParams *p,
+                                        uint32_t xstart, uint32_t xend,
+                                        uint32_t outstep) {
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
     uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
+    uchar4 *in = (uchar4 *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -161,9 +161,9 @@
     }
 }
 
-RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
-                                                     const Script *s, const Element *e)
-            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
+RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(
+    RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) :
+        RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
 
     mRootPtr = &kernel;
 }
@@ -185,5 +185,3 @@
 
     return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlend.cpp b/cpu_ref/rsCpuIntrinsicBlend.cpp
index 228b887..27a02b7 100644
--- a/cpu_ref/rsCpuIntrinsicBlend.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlend.cpp
@@ -33,9 +33,8 @@
     RsdCpuScriptIntrinsicBlend(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    static void kernel(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
+    static void kernel(const RsExpandKernelParams *p, uint32_t xstart,
+                       uint32_t xend, uint32_t outstep);
 };
 
 }
@@ -96,28 +95,28 @@
 #endif
 
 #if defined(ARCH_X86_HAVE_SSSE3)
-extern "C" void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
-extern "C" void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8);
+extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8);
 #endif
 
-void RsdCpuScriptIntrinsicBlend::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlend::kernel(const RsExpandKernelParams *p,
                                         uint32_t xstart, uint32_t xend,
-                                        uint32_t instep, uint32_t outstep) {
+                                        uint32_t outstep) {
     RsdCpuScriptIntrinsicBlend *cp = (RsdCpuScriptIntrinsicBlend *)p->usr;
 
     // instep/outstep can be ignored--sizeof(uchar4) known at compile time
     uchar4 *out = (uchar4 *)p->out;
-    uchar4 *in = (uchar4 *)p->in;
+    uchar4 *in = (uchar4 *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -509,6 +508,3 @@
                                       const Script *s, const Element *e) {
     return new RsdCpuScriptIntrinsicBlend(ctx, s, e);
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 123cc9f..9c6433a 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -44,12 +44,12 @@
     int mIradius;
     ObjectBaseRef<Allocation> mAlloc;
 
-    static void kernelU4(const RsForEachStubParamStruct *p,
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
     void ComputeGaussianWeights();
 };
 
@@ -113,7 +113,7 @@
 
 
 
-static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
+static void OneVU4(const RsExpandKernelParams *p, float4 *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x*4;
@@ -131,7 +131,7 @@
     out[0] = blurredPixel;
 }
 
-static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
+static void OneVU1(const RsExpandKernelParams *p, float *out, int32_t x, int32_t y,
                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
 
     const uchar *pi = ptrIn + x;
@@ -155,9 +155,9 @@
                  size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
 
 #if defined(ARCH_X86_HAVE_SSSE3)
-extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
-extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
-extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
+extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
+extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
+extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
 #endif
 
 static void OneVFU4(float4 *out,
@@ -247,7 +247,7 @@
     }
 }
 
-static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
+static void OneHU4(const RsExpandKernelParams *p, uchar4 *out, int32_t x,
                    const float4 *ptrIn, const float* gPtr, int iradius) {
 
     float4 blurredPixel = 0;
@@ -262,7 +262,7 @@
     out->xyzw = convert_uchar4(blurredPixel);
 }
 
-static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
+static void OneHU1(const RsExpandKernelParams *p, uchar *out, int32_t x,
                    const float *ptrIn, const float* gPtr, int iradius) {
 
     float blurredPixel = 0;
@@ -278,9 +278,9 @@
 }
 
 
-void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
 
     float4 stackbuf[2048];
     float4 *buf = &stackbuf[0];
@@ -350,9 +350,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelParams *p,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
     float buf[4 * 2048];
     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
     if (!cp->mAlloc.get()) {
@@ -420,7 +420,7 @@
                                                      const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
 
-    mRootPtr = NULL;
+    mRootPtr = nullptr;
     if (e->getType() == RS_TYPE_UNSIGNED_8) {
         switch (e->getVectorSize()) {
         case 1:
@@ -470,5 +470,3 @@
 
     return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8c85277..64ce43f 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -169,10 +169,9 @@
     virtual ~RsdCpuScriptIntrinsicColorMatrix();
     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
-                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
+                           uint32_t usrLen, const RsScriptCall *sc);
 
 protected:
     float fp[16];
@@ -188,9 +187,9 @@
     FunctionTab_t mFnTab;
 #endif
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
     void updateCoeffCache(float fpMul, float addMul);
 
     Key_t mLastKey;
@@ -456,16 +455,16 @@
 #endif
 
 #if defined(ARCH_X86_HAVE_SSSE3)
-extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
+extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                   const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
+extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                   const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
+extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                   const short *coef, uint32_t count);
 
 void * selectKernel(Key_t key)
 {
-    void * kernel = NULL;
+    void * kernel = nullptr;
 
     // inType, outType float if nonzero
     if (!(key.u.inType || key.u.outType)) {
@@ -493,7 +492,7 @@
     }
 
     uint8_t *buf = mBuf;
-    uint8_t *buf2 = NULL;
+    uint8_t *buf2 = nullptr;
 
     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
     int opInit[4] = {0, 0, 0, 0};
@@ -779,7 +778,7 @@
 }
 
 
-static void One(const RsForEachStubParamStruct *p, void *out,
+static void One(const RsExpandKernelParams *p, void *out,
                 const void *py, const float* coeff, const float *add,
                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
 
@@ -880,12 +879,15 @@
     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelParams *p,
                                               uint32_t xstart, uint32_t xend,
-                                              uint32_t instep, uint32_t outstep) {
+                                              uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
+
+    uint32_t instep = p->inEStrides[0];
+
     uchar *out = (uchar *)p->out;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -899,7 +901,7 @@
     if(x2 > x1) {
         int32_t len = x2 - x1;
         if (gArchUseSIMD) {
-            if((cp->mOptKernel != NULL) && (len >= 4)) {
+            if((cp->mOptKernel != nullptr) && (len >= 4)) {
                 // The optimized kernel processes 4 pixels at once
                 // and requires a minimum of 1 chunk of 4
                 cp->mOptKernel(out, in, cp->ip, len >> 2);
@@ -933,11 +935,15 @@
     }
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
+                                                 const Allocation ** ains,
+                                                 uint32_t inLen,
+                                                 Allocation * aout,
+                                                 const void * usr,
+                                                 uint32_t usrLen,
+                                                 const RsScriptCall *sc) {
 
-    const Element *ein = ain->mHal.state.type->getElement();
+    const Element *ein = ains[0]->mHal.state.type->getElement();
     const Element *eout = aout->mHal.state.type->getElement();
 
     if (ein->getType() == eout->getType()) {
@@ -954,20 +960,20 @@
         }
     }
 
-    Key_t key = computeKey(ain->mHal.state.type->getElement(),
-                           aout->mHal.state.type->getElement());
+    Key_t key = computeKey(ein, eout);
+
 #if defined(ARCH_X86_HAVE_SSSE3)
-    if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
+    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
         // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
         mLastKey = key;
     }
 
 #else //if !defined(ARCH_X86_HAVE_SSSE3)
-    if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
+    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
         if (mBuf) munmap(mBuf, mBufSize);
-        mBuf = NULL;
-        mOptKernel = NULL;
+        mBuf = nullptr;
+        mOptKernel = nullptr;
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
         }
@@ -997,20 +1003,14 @@
 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
 }
 
-void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
-        uint32_t slot, const Allocation * ain, Allocation * aout,
-        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
-
-}
-
 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
 
     mLastKey.key = 0;
-    mBuf = NULL;
+    mBuf = nullptr;
     mBufSize = 0;
-    mOptKernel = NULL;
+    mOptKernel = nullptr;
     const static float defaultMatrix[] = {
         1.f, 0.f, 0.f, 0.f,
         0.f, 1.f, 0.f, 0.f,
@@ -1024,8 +1024,8 @@
 
 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
     if (mBuf) munmap(mBuf, mBufSize);
-    mBuf = NULL;
-    mOptKernel = NULL;
+    mBuf = nullptr;
+    mOptKernel = nullptr;
 }
 
 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a835..f9b70cc 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -42,24 +42,24 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 };
 
 }
@@ -88,7 +88,7 @@
                                           const void *y2, const short *coef, uint32_t count);
 
 
-static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void ConvolveOneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
                           const float* coeff) {
 
@@ -110,7 +110,7 @@
     *out = o;
 }
 
-static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void ConvolveOneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
                           const float* coeff) {
 
@@ -131,7 +131,7 @@
     *out = convert_uchar2(px);
 }
 
-static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void ConvolveOneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                           const uchar *py0, const uchar *py1, const uchar *py2,
                           const float* coeff) {
 
@@ -150,7 +150,7 @@
     *out = clamp(px, 0.f, 255.f);
 }
 
-static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void ConvolveOneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                           const float4 *py0, const float4 *py1, const float4 *py2,
                           const float* coeff) {
 
@@ -161,7 +161,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void ConvolveOneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                           const float2 *py0, const float2 *py1, const float2 *py2,
                           const float* coeff) {
 
@@ -172,7 +172,7 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-static void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void ConvolveOneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                           const float *py0, const float *py1, const float *py2,
                           const float* coeff) {
 
@@ -183,9 +183,9 @@
            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -230,9 +230,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -275,9 +275,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -320,9 +320,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -365,9 +365,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -409,9 +409,9 @@
         }
     }
 }
-void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -507,5 +507,3 @@
 
     return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e2a6b8b..815badf 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -42,24 +42,24 @@
     ObjectBaseRef<Allocation> alloc;
 
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF1(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelF4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelF4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 
 
 };
@@ -86,7 +86,7 @@
 }
 
 
-static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
+static void OneU4(const RsExpandKernelParams *p, uint32_t x, uchar4 *out,
                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
                   const float* coeff) {
 
@@ -129,7 +129,7 @@
     *out = convert_uchar4(px);
 }
 
-static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
+static void OneU2(const RsExpandKernelParams *p, uint32_t x, uchar2 *out,
                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
                   const float* coeff) {
 
@@ -172,7 +172,7 @@
     *out = convert_uchar2(px);
 }
 
-static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
+static void OneU1(const RsExpandKernelParams *p, uint32_t x, uchar *out,
                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
                   const float* coeff) {
 
@@ -215,7 +215,7 @@
     *out = px;
 }
 
-static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
+static void OneF4(const RsExpandKernelParams *p, uint32_t x, float4 *out,
                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
                   const float* coeff) {
 
@@ -257,7 +257,7 @@
     *out = px;
 }
 
-static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
+static void OneF2(const RsExpandKernelParams *p, uint32_t x, float2 *out,
                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
                   const float* coeff) {
 
@@ -299,7 +299,7 @@
     *out = px;
 }
 
-static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
+static void OneF1(const RsExpandKernelParams *p, uint32_t x, float *out,
                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
                   const float* coeff) {
 
@@ -346,9 +346,9 @@
                                           const void *y2, const void *y3, const void *y4,
                                           const short *coef, uint32_t count);
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -406,9 +406,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -455,9 +455,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -504,9 +504,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -553,9 +553,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -602,9 +602,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("Convolve5x5 executed without input, skipping");
@@ -705,6 +705,3 @@
 
     return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsicHistogram.cpp b/cpu_ref/rsCpuIntrinsicHistogram.cpp
index cdfe7d1..4779187 100644
--- a/cpu_ref/rsCpuIntrinsicHistogram.cpp
+++ b/cpu_ref/rsCpuIntrinsicHistogram.cpp
@@ -36,10 +36,10 @@
     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
 
 protected:
-    void preLaunch(uint32_t slot, const Allocation * ain,
+    void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                    Allocation * aout, const void * usr,
                    uint32_t usrLen, const RsScriptCall *sc);
-    void postLaunch(uint32_t slot, const Allocation * ain,
+    void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
                     Allocation * aout, const void * usr,
                     uint32_t usrLen, const RsScriptCall *sc);
 
@@ -49,31 +49,31 @@
     int *mSums;
     ObjectBaseRef<Allocation> mAllocOut;
 
-    static void kernelP1U4(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U3(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U2(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
-    static void kernelP1U1(const RsForEachStubParamStruct *p,
-                          uint32_t xstart, uint32_t xend,
-                          uint32_t instep, uint32_t outstep);
+    static void kernelP1U4(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U3(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U2(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
+    static void kernelP1U1(const RsExpandKernelParams *p,
+                           uint32_t xstart, uint32_t xend,
+                           uint32_t outstep);
 
-    static void kernelP1L4(const RsForEachStubParamStruct *p,
+    static void kernelP1L4(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L3(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L3(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L2(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L2(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
-    static void kernelP1L1(const RsForEachStubParamStruct *p,
+                           uint32_t outstep);
+    static void kernelP1L1(const RsExpandKernelParams *p,
                            uint32_t xstart, uint32_t xend,
-                           uint32_t instep, uint32_t outstep);
+                           uint32_t outstep);
 
 };
 
@@ -97,9 +97,12 @@
 
 
 
-void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
-                                      Allocation * aout, const void * usr,
-                                      uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen, Allocation * aout,
+                                          const void * usr, uint32_t usrLen,
+                                          const RsScriptCall *sc) {
 
     const uint32_t threads = mCtx->getThreadCount();
     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
@@ -123,7 +126,7 @@
         }
         break;
     case 1:
-        switch(ain->getType()->getElement()->getVectorSize()) {
+        switch(ains[0]->getType()->getElement()->getVectorSize()) {
         case 1:
             mRootPtr = &kernelP1L1;
             break;
@@ -142,9 +145,12 @@
     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
 }
 
-void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
-                                       Allocation * aout, const void * usr,
-                                       uint32_t usrLen, const RsScriptCall *sc) {
+void
+RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
+                                           const Allocation ** ains,
+                                           uint32_t inLen,  Allocation * aout,
+                                           const void * usr, uint32_t usrLen,
+                                           const RsScriptCall *sc) {
 
     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
     uint32_t threads = mCtx->getThreadCount();
@@ -160,12 +166,12 @@
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -173,47 +179,47 @@
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
         sums[(in[3] << 2) + 3] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 4 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 2)    ] ++;
         sums[(in[1] << 2) + 1] ++;
         sums[(in[2] << 2) + 2] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * 2 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[(in[0] << 1)    ] ++;
         sums[(in[1] << 1) + 1] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -222,16 +228,16 @@
                 (cp->mDotI[2] * in[2]) +
                 (cp->mDotI[3] * in[3]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
@@ -239,52 +245,52 @@
                 (cp->mDotI[1] * in[1]) +
                 (cp->mDotI[2] * in[2]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]) +
                 (cp->mDotI[1] * in[1]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         int t = (cp->mDotI[0] * in[0]);
         sums[(t + 0x7f) >> 8] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
-void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
 
     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
-    uchar *in = (uchar *)p->in;
+    uchar *in = (uchar *)p->ins[0];
     int * sums = &cp->mSums[256 * p->lid];
 
     for (uint32_t x = xstart; x < xend; x++) {
         sums[in[0]] ++;
-        in += instep;
+        in += p->inEStrides[0];
     }
 }
 
@@ -293,7 +299,7 @@
                                                      const Script *s, const Element *e)
             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
 
-    mRootPtr = NULL;
+    mRootPtr = nullptr;
     mSums = new int[256 * 4 * mCtx->getThreadCount()];
     mDot[0] = 0.299f;
     mDot[1] = 0.587f;
@@ -323,5 +329,3 @@
 
     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicLUT.cpp b/cpu_ref/rsCpuIntrinsicLUT.cpp
index 5b2adc5..b08a0e5 100644
--- a/cpu_ref/rsCpuIntrinsicLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsicLUT.cpp
@@ -38,9 +38,9 @@
 protected:
     ObjectBaseRef<Allocation> lut;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -53,13 +53,13 @@
 }
 
 
-void RsdCpuScriptIntrinsicLUT::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicLUT::kernel(const RsExpandKernelParams *p,
                                       uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep) {
+                                      uint32_t outstep) {
     RsdCpuScriptIntrinsicLUT *cp = (RsdCpuScriptIntrinsicLUT *)p->usr;
 
     uchar *out = (uchar *)p->out;
-    const uchar *in = (uchar *)p->in;
+    const uchar *in = (uchar *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -103,5 +103,3 @@
 
     return new RsdCpuScriptIntrinsicLUT(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 474f82d..fa0e8ee 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -35,8 +35,8 @@
     virtual ~RsdCpuScriptIntrinsicResize();
     RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
 
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
 
     float scaleX;
@@ -46,15 +46,15 @@
     ObjectBaseRef<const Allocation> mAlloc;
     ObjectBaseRef<const Element> mElement;
 
-    static void kernelU1(const RsForEachStubParamStruct *p,
+    static void kernelU1(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU2(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU2(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
-    static void kernelU4(const RsForEachStubParamStruct *p,
+                         uint32_t outstep);
+    static void kernelU4(const RsExpandKernelParams *p,
                          uint32_t xstart, uint32_t xend,
-                         uint32_t instep, uint32_t outstep);
+                         uint32_t outstep);
 };
 
 }
@@ -175,9 +175,9 @@
     return (uchar)p;
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -215,9 +215,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -255,9 +255,9 @@
     }
 }
 
-void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelParams *p,
                                                 uint32_t xstart, uint32_t xend,
-                                                uint32_t instep, uint32_t outstep) {
+                                                uint32_t outstep) {
     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)p->usr;
 
     if (!cp->mAlloc.get()) {
@@ -304,9 +304,11 @@
 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
 }
 
-void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot, const Allocation * ain,
-                                            Allocation * aout, const void * usr,
-                                            uint32_t usrLen, const RsScriptCall *sc)
+void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
+                                            const Allocation ** ains,
+                                            uint32_t inLen, Allocation * aout,
+                                            const void * usr, uint32_t usrLen,
+                                            const RsScriptCall *sc)
 {
     if (!mAlloc.get()) {
         ALOGE("Resize executed without input, skipping");
@@ -347,5 +349,3 @@
 
     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
 }
-
-
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index c53ef31..497e19c 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -46,9 +46,9 @@
 protected:
     ObjectBaseRef<Allocation> alloc;
 
-    static void kernel(const RsForEachStubParamStruct *p,
+    static void kernel(const RsExpandKernelParams *p,
                        uint32_t xstart, uint32_t xend,
-                       uint32_t instep, uint32_t outstep);
+                       uint32_t outstep);
 };
 
 }
@@ -101,16 +101,16 @@
 extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, size_t xend);
 extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, size_t xstart, size_t xend);
 
-void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
+void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsExpandKernelParams *p,
                                            uint32_t xstart, uint32_t xend,
-                                           uint32_t instep, uint32_t outstep) {
+                                           uint32_t outstep) {
     RsdCpuScriptIntrinsicYuvToRGB *cp = (RsdCpuScriptIntrinsicYuvToRGB *)p->usr;
     if (!cp->alloc.get()) {
         ALOGE("YuvToRGB executed without input, skipping");
         return;
     }
     const uchar *pinY = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
-    if (pinY == NULL) {
+    if (pinY == nullptr) {
         ALOGE("YuvToRGB executed without data, skipping");
         return;
     }
@@ -143,7 +143,7 @@
     //ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX, cp->alloc->mHal.drvState.lod[0].dimY);
     //ALOGE("p->dimX, %d, p->dimY, %d", p->dimX, p->dimY);
 
-    if (pinU == NULL) {
+    if (pinU == nullptr) {
         // Legacy yuv support didn't fill in uv
         v = ((uint8_t *)cp->alloc->mHal.drvState.lod[0].mallocPtr) +
             (strideY * p->dimY) +
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
index 632ef7a..6386863 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -176,12 +176,17 @@
          * same time as loading only part of a register.  So the data is loaded
          * linearly and unpacked manually at this point if necessary.
          */
-1:      uzp1        v8.16b, v8.16b, v9.16b
+1:      mov         v18.8b, v8.8b
+        uzp1        v8.8b, v18.8b, v9.8b
+        uzp2        v9.8b, v18.8b, v9.8b
   .if \interleaved
+        mov         v18.8b, v16.8b
     .if \swapuv
-        uzp1        v16.16b, v17.16b, v16.16b
+        uzp1        v16.8b, v17.8b, v18.8b
+        uzp2        v17.8b, v17.8b, v18.8b
     .else
-        uzp1        v16.16b, v16.16b, v17.16b
+        uzp1        v16.8b, v18.8b, v17.8b
+        uzp2        v17.8b, v18.8b, v17.8b
     .endif
   .endif
 
@@ -225,7 +230,7 @@
         add         x1, x1, x4
         add         x4, x3, x6
         add         x3, x2, x6
-        sub         x2, x5, x6, LSL #2
+        sub         x2, x5, x6, LSL #1
 
         sub         x6, sp, #32
         sub         sp, sp, #64
diff --git a/cpu_ref/rsCpuIntrinsics_x86.c b/cpu_ref/rsCpuIntrinsics_x86.cpp
similarity index 98%
rename from cpu_ref/rsCpuIntrinsics_x86.c
rename to cpu_ref/rsCpuIntrinsics_x86.cpp
index cc799af..cb502c6 100644
--- a/cpu_ref/rsCpuIntrinsics_x86.c
+++ b/cpu_ref/rsCpuIntrinsics_x86.cpp
@@ -76,9 +76,9 @@
 #endif
 }
 
-void rsdIntrinsicConvolve3x3_K(void *dst,
-                               const void *y0, const void *y1, const void *y2,
-                               const short *coef, uint32_t count) {
+extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
+                                          const void *y1, const void *y2,
+                                          const short *coef, uint32_t count) {
     __m128i x;
     __m128i c0, c2, c4, c6, c8;
     __m128i r0, r1, r2;
@@ -593,9 +593,10 @@
     }
 }
 
-void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
-                               const void *y2, const void *y3, const void *y4,
-                               const short *coef, uint32_t count) {
+extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
+                                          const void *y1, const void *y2,
+                                          const void *y3, const void *y4,
+                                          const short *coef, uint32_t count) {
     __m128i x;
     __m128i c0, c2, c4, c6, c8, c10, c12;
     __m128i c14, c16, c18, c20, c22, c24;
diff --git a/cpu_ref/rsCpuRuntimeMath.cpp b/cpu_ref/rsCpuRuntimeMath.cpp
index db8cee9..5e66bbd 100644
--- a/cpu_ref/rsCpuRuntimeMath.cpp
+++ b/cpu_ref/rsCpuRuntimeMath.cpp
@@ -312,7 +312,7 @@
     { "_Z6rsRandff", (void *)&SC_randf2, true },
     { "_Z6rsFracf", (void *)&SC_frac, true },
 
-    { NULL, NULL, false }
+    { nullptr, nullptr, false }
 };
 
 const RsdCpuReference::CpuSymbol * RsdCpuScriptImpl::lookupSymbolMath(const char *sym) {
@@ -324,6 +324,6 @@
         }
         syms++;
     }
-    return NULL;
+    return nullptr;
 }
 
diff --git a/cpu_ref/rsCpuRuntimeStubs.cpp b/cpu_ref/rsCpuRuntimeStubs.cpp
index 6210f33..6dc3688 100644
--- a/cpu_ref/rsCpuRuntimeStubs.cpp
+++ b/cpu_ref/rsCpuRuntimeStubs.cpp
@@ -301,14 +301,14 @@
     { "_Z7rsDebugPKcPKDv4_y", (void *)&SC_debugUL4, true },
     { "_Z7rsDebugPKcPKv", (void *)&SC_debugP, true },
 
-    { NULL, NULL, false }
+    { nullptr, nullptr, false }
 };
 
 
 void * RsdCpuScriptImpl::lookupRuntimeStub(void* pContext, char const* name) {
     RsdCpuScriptImpl *s = (RsdCpuScriptImpl *)pContext;
     const RsdCpuReference::CpuSymbol *syms = gSyms;
-    const RsdCpuReference::CpuSymbol *sym = NULL;
+    const RsdCpuReference::CpuSymbol *sym = nullptr;
 
     sym = s->mCtx->symLookup(name);
     if (!sym) {
@@ -328,7 +328,7 @@
         return sym->fnPtr;
     }
     ALOGE("ScriptC sym lookup failed for %s", name);
-    return NULL;
+    return nullptr;
 }
 
 
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda1..f4abe67 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -26,6 +26,8 @@
     #include <string.h>
     #include <sys/stat.h>
     #include <unistd.h>
+    #include <fstream>
+    #include <iostream>
 #else
     #include <bcc/BCCContext.h>
     #include <bcc/Config/Config.h>
@@ -79,24 +81,49 @@
     return false;
 }
 
+// Copy the contents of \p srcFile into \p dstFile (created or truncated).
+// Return 0 on success and -1 (after logging) if any step failed.
+static int copyFile(const char *dstFile, const char *srcFile) {
+    // Shared objects are binary data; open both streams untranslated.
+    std::ifstream srcStream(srcFile, std::ios::binary);
+    if (!srcStream) {
+        ALOGE("Could not verify or read source file: %s", srcFile);
+        return -1;
+    }
+    std::ofstream dstStream(dstFile, std::ios::binary);
+    if (!dstStream) {
+        ALOGE("Could not verify or write destination file: %s", dstFile);
+        return -1;
+    }
+    dstStream << srcStream.rdbuf();
+    // close() flushes the buffered tail; call it before the status check so
+    // a failed final write is reported instead of silently ignored.
+    dstStream.close();
+    if (!dstStream) {
+        ALOGE("Could not write destination file: %s", dstFile);
+        return -1;
+    }
+    return 0;
+}
+
 // Attempt to load the shared library from origName, but then fall back to
-// creating the symlinked shared library if necessary (to ensure instancing).
+// creating a copy of the shared library if necessary (to ensure instancing).
 // This function returns the dlopen()-ed handle if successful.
 static void *loadSOHelper(const char *origName, const char *cacheDir,
                           const char *resName) {
     // Keep track of which .so libraries have been loaded. Once a library is
-    // in the set (per-process granularity), we must instead make a symlink to
+    // in the set (per-process granularity), we must instead make a copy of
     // the original shared object (randomly named .so file) and load that one
     // instead. If we don't do this, we end up aliasing global data between
     // the various Script instances (which are supposed to be completely
     // independent).
     static std::set<std::string> LoadedLibraries;
 
-    void *loaded = NULL;
+    void *loaded = nullptr;
 
     // Skip everything if we don't even have the original library available.
     if (access(origName, F_OK) != 0) {
-        return NULL;
+        return nullptr;
     }
 
     // Common path is that we have not loaded this Script/library before.
@@ -113,25 +140,25 @@
 
     if (!ensureCacheDirExists(newName.c_str())) {
         ALOGE("Could not verify or create cache dir: %s", cacheDir);
-        return NULL;
+        return nullptr;
     }
 
-    // Construct an appropriately randomized filename for the symlink.
+    // Construct an appropriately randomized filename for the copy.
     newName.append("librs.");
     newName.append(resName);
     newName.append("#");
     newName.append(getRandomString(6));  // 62^6 potential filename variants.
     newName.append(".so");
 
-    int r = symlink(origName, newName.c_str());
+    int r = copyFile(newName.c_str(), origName);
     if (r != 0) {
-        ALOGE("Could not create symlink %s -> %s", newName.c_str(), origName);
-        return NULL;
+        ALOGE("Could not create copy %s -> %s", origName, newName.c_str());
+        return nullptr;
     }
     loaded = dlopen(newName.c_str(), RTLD_NOW | RTLD_LOCAL);
     r = unlink(newName.c_str());
     if (r != 0) {
-        ALOGE("Could not unlink symlink %s", newName.c_str());
+        ALOGE("Could not unlink copy %s", newName.c_str());
     }
     if (loaded) {
         LoadedLibraries.insert(newName.c_str());
@@ -141,13 +168,12 @@
 }
 
 // Load the shared library referred to by cacheDir and resName. If we have
-// already loaded this library, we instead create a new symlink (in the
-// cache dir) and then load that. We then immediately destroy the symlink.
+// already loaded this library, we instead create a new copy (in the
+// cache dir) and then load that. We then immediately destroy the copy.
 // This is required behavior to implement script instancing for the support
 // library, since shared objects are loaded and de-duped by name only.
 static void *loadSharedLibrary(const char *cacheDir, const char *resName) {
-    void *loaded = NULL;
-    //arc4random_stir();
+    void *loaded = nullptr;
 #ifndef RS_SERVER
     std::string scriptSOName(cacheDir);
     size_t cutPos = scriptSOName.rfind("cache");
@@ -167,7 +193,7 @@
     // location for shared libraries first.
     loaded = loadSOHelper(scriptSOName.c_str(), cacheDir, resName);
 
-    if (loaded == NULL) {
+    if (loaded == nullptr) {
         ALOGE("Unable to open shared library (%s): %s",
               scriptSOName.c_str(), dlerror());
 
@@ -176,12 +202,12 @@
         // library fallback path. Those applications don't have a private
         // library path, so they need to install to the system directly.
         // Note that this is really just a testing path.
-        android::String8 scriptSONameSystem("/system/lib/librs.");
+        std::string scriptSONameSystem("/system/lib/librs.");
         scriptSONameSystem.append(resName);
         scriptSONameSystem.append(".so");
         loaded = loadSOHelper(scriptSONameSystem.c_str(), cacheDir,
                               resName);
-        if (loaded == NULL) {
+        if (loaded == nullptr) {
             ALOGE("Unable to open system shared library (%s): %s",
                   scriptSONameSystem.c_str(), dlerror());
         }
@@ -216,9 +242,11 @@
 
 const static char *BCC_EXE_PATH = "/system/bin/bcc";
 
-static void setCompileArguments(std::vector<const char*>* args, const android::String8& bcFileName,
-                                const char* cacheDir, const char* resName, const char* core_lib,
-                                bool useRSDebugContext, const char* bccPluginName) {
+static void setCompileArguments(std::vector<const char*>* args,
+                                const std::string& bcFileName,
+                                const char* cacheDir, const char* resName,
+                                const char* core_lib, bool useRSDebugContext,
+                                const char* bccPluginName) {
     rsAssert(cacheDir && resName && core_lib);
     args->push_back(BCC_EXE_PATH);
     args->push_back("-o");
@@ -242,27 +270,27 @@
         }
     }
 
-    args->push_back(bcFileName.string());
-    args->push_back(NULL);
+    args->push_back(bcFileName.c_str());
+    args->push_back(nullptr);
 }
 
-static bool compileBitcode(const android::String8& bcFileName,
+static bool compileBitcode(const std::string &bcFileName,
                            const char *bitcode,
                            size_t bitcodeSize,
-                           const char** compileArguments,
-                           const std::string& compileCommandLine) {
+                           const char **compileArguments,
+                           const std::string &compileCommandLine) {
     rsAssert(bitcode && bitcodeSize);
 
-    FILE *bcfile = fopen(bcFileName.string(), "w");
+    FILE *bcfile = fopen(bcFileName.c_str(), "w");
     if (!bcfile) {
-        ALOGE("Could not write to %s", bcFileName.string());
+        ALOGE("Could not write to %s", bcFileName.c_str());
         return false;
     }
     size_t nwritten = fwrite(bitcode, 1, bitcodeSize, bcfile);
     fclose(bcfile);
     if (nwritten != bitcodeSize) {
         ALOGE("Could not write %zu bytes to %s", bitcodeSize,
-              bcFileName.string());
+              bcFileName.c_str());
         return false;
     }
 
@@ -316,10 +344,10 @@
 #define OBJECT_SLOT_STR "objectSlotCount: "
 
 // Copy up to a newline or size chars from str -> s, updating str
-// Returns s when successful and NULL when '\0' is finally reached.
+// Returns s when successful and nullptr when '\0' is finally reached.
 static char* strgets(char *s, int size, const char **ppstr) {
     if (!ppstr || !*ppstr || **ppstr == '\0' || size < 1) {
-        return NULL;
+        return nullptr;
     }
 
     int i;
@@ -346,27 +374,27 @@
     mScript = s;
 
 #ifdef RS_COMPATIBILITY_LIB
-    mScriptSO = NULL;
-    mInvokeFunctions = NULL;
-    mForEachFunctions = NULL;
-    mFieldAddress = NULL;
-    mFieldIsObject = NULL;
-    mForEachSignatures = NULL;
+    mScriptSO = nullptr;
+    mInvokeFunctions = nullptr;
+    mForEachFunctions = nullptr;
+    mFieldAddress = nullptr;
+    mFieldIsObject = nullptr;
+    mForEachSignatures = nullptr;
 #else
-    mCompilerContext = NULL;
-    mCompilerDriver = NULL;
-    mExecutable = NULL;
+    mCompilerContext = nullptr;
+    mCompilerDriver = nullptr;
+    mExecutable = nullptr;
 #endif
 
 
-    mRoot = NULL;
-    mRootExpand = NULL;
-    mInit = NULL;
-    mFreeChildren = NULL;
+    mRoot = nullptr;
+    mRootExpand = nullptr;
+    mInit = nullptr;
+    mFreeChildren = nullptr;
 
 
-    mBoundAllocs = NULL;
-    mIntrinsicData = NULL;
+    mBoundAllocs = nullptr;
+    mIntrinsicData = nullptr;
     mIsThreadable = true;
 }
 
@@ -381,19 +409,19 @@
 #ifndef RS_COMPATIBILITY_LIB
     bool useRSDebugContext = false;
 
-    mCompilerContext = NULL;
-    mCompilerDriver = NULL;
-    mExecutable = NULL;
+    mCompilerContext = nullptr;
+    mCompilerDriver = nullptr;
+    mExecutable = nullptr;
 
     mCompilerContext = new bcc::BCCContext();
-    if (mCompilerContext == NULL) {
+    if (mCompilerContext == nullptr) {
         ALOGE("bcc: FAILS to create compiler context (out of memory)");
         mCtx->unlockMutex();
         return false;
     }
 
     mCompilerDriver = new bcc::RSCompilerDriver();
-    if (mCompilerDriver == NULL) {
+    if (mCompilerDriver == nullptr) {
         ALOGE("bcc: FAILS to create compiler driver (out of memory)");
         mCtx->unlockMutex();
         return false;
@@ -408,7 +436,7 @@
     // Run any compiler setup functions we have been provided with.
     RSSetupCompilerCallback setupCompilerCallback =
             mCtx->getSetupCompilerCallback();
-    if (setupCompilerCallback != NULL) {
+    if (setupCompilerCallback != nullptr) {
         setupCompilerCallback(mCompilerDriver);
     }
 
@@ -426,7 +454,7 @@
         useRSDebugContext = true;
     }
 
-    android::String8 bcFileName(cacheDir);
+    std::string bcFileName(cacheDir);
     bcFileName.append("/");
     bcFileName.append(resName);
     bcFileName.append(".bc");
@@ -434,7 +462,7 @@
     std::vector<const char*> compileArguments;
     setCompileArguments(&compileArguments, bcFileName, cacheDir, resName, core_lib,
                         useRSDebugContext, bccPluginName);
-    // The last argument of compileArguments ia a NULL, so remove 1 from the size.
+    // The last argument of compileArguments is a nullptr, so remove 1 from the size.
     std::string compileCommandLine =
                 bcc::getCommandLine(compileArguments.size() - 1, compileArguments.data());
 
@@ -447,7 +475,7 @@
 
     // If we can't, it's either not there or out of date.  We compile the bit code and try loading
     // again.
-    if (mExecutable == NULL) {
+    if (mExecutable == nullptr) {
         if (!compileBitcode(bcFileName, (const char*)bitcode, bitcodeSize, compileArguments.data(),
                             compileCommandLine)) {
             ALOGE("bcc: FAILS to compile '%s'", resName);
@@ -457,7 +485,7 @@
         mExecutable = bcc::RSCompilerDriver::loadScript(cacheDir, resName, (const char*)bitcode,
                                                         bitcodeSize, compileCommandLine.c_str(),
                                                         mResolver);
-        if (mExecutable == NULL) {
+        if (mExecutable == nullptr) {
             ALOGE("bcc: FAILS to load freshly compiled executable for '%s'", resName);
             mCtx->unlockMutex();
             return false;
@@ -517,7 +545,7 @@
         }
 
         size_t varCount = 0;
-        if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
             goto error;
         }
         if (sscanf(line, EXPORT_VAR_STR "%zu", &varCount) != 1) {
@@ -531,16 +559,16 @@
             // Start by creating/zeroing this member, since we don't want to
             // accidentally clean up invalid pointers later (if we error out).
             mFieldIsObject = new bool[varCount];
-            if (mFieldIsObject == NULL) {
+            if (mFieldIsObject == nullptr) {
                 goto error;
             }
             memset(mFieldIsObject, 0, varCount * sizeof(*mFieldIsObject));
             mFieldAddress = new void*[varCount];
-            if (mFieldAddress == NULL) {
+            if (mFieldAddress == nullptr) {
                 goto error;
             }
             for (size_t i = 0; i < varCount; ++i) {
-                if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+                if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
                     goto error;
                 }
                 char *c = strrchr(line, '\n');
@@ -548,7 +576,7 @@
                     *c = '\0';
                 }
                 mFieldAddress[i] = dlsym(mScriptSO, line);
-                if (mFieldAddress[i] == NULL) {
+                if (mFieldAddress[i] == nullptr) {
                     ALOGE("Failed to find variable address for %s: %s",
                           line, dlerror());
                     // Not a critical error if we don't find a global variable.
@@ -561,7 +589,7 @@
         }
 
         size_t funcCount = 0;
-        if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
             goto error;
         }
         if (sscanf(line, EXPORT_FUNC_STR "%zu", &funcCount) != 1) {
@@ -574,11 +602,11 @@
 
         if (funcCount > 0) {
             mInvokeFunctions = new InvokeFunc_t[funcCount];
-            if (mInvokeFunctions == NULL) {
+            if (mInvokeFunctions == nullptr) {
                 goto error;
             }
             for (size_t i = 0; i < funcCount; ++i) {
-                if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+                if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
                     goto error;
                 }
                 char *c = strrchr(line, '\n');
@@ -587,7 +615,7 @@
                 }
 
                 mInvokeFunctions[i] = (InvokeFunc_t) dlsym(mScriptSO, line);
-                if (mInvokeFunctions[i] == NULL) {
+                if (mInvokeFunctions[i] == nullptr) {
                     ALOGE("Failed to get function address for %s(): %s",
                           line, dlerror());
                     goto error;
@@ -599,7 +627,7 @@
         }
 
         size_t forEachCount = 0;
-        if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
             goto error;
         }
         if (sscanf(line, EXPORT_FOREACH_STR "%zu", &forEachCount) != 1) {
@@ -610,18 +638,18 @@
         if (forEachCount > 0) {
 
             mForEachSignatures = new uint32_t[forEachCount];
-            if (mForEachSignatures == NULL) {
+            if (mForEachSignatures == nullptr) {
                 goto error;
             }
             mForEachFunctions = new ForEachFunc_t[forEachCount];
-            if (mForEachFunctions == NULL) {
+            if (mForEachFunctions == nullptr) {
                 goto error;
             }
             for (size_t i = 0; i < forEachCount; ++i) {
                 unsigned int tmpSig = 0;
                 char tmpName[MAXLINE];
 
-                if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+                if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
                     goto error;
                 }
                 if (sscanf(line, "%u - %" MAKE_STR(MAXLINE) "s",
@@ -635,7 +663,7 @@
                 mForEachSignatures[i] = tmpSig;
                 mForEachFunctions[i] =
                         (ForEachFunc_t) dlsym(mScriptSO, tmpName);
-                if (i != 0 && mForEachFunctions[i] == NULL) {
+                if (i != 0 && mForEachFunctions[i] == nullptr) {
                     // Ignore missing root.expand functions.
                     // root() is always specified at location 0.
                     ALOGE("Failed to find forEach function address for %s: %s",
@@ -649,7 +677,7 @@
         }
 
         size_t objectSlotCount = 0;
-        if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
             goto error;
         }
         if (sscanf(line, OBJECT_SLOT_STR "%zu", &objectSlotCount) != 1) {
@@ -661,7 +689,7 @@
             rsAssert(varCount > 0);
             for (size_t i = 0; i < objectSlotCount; ++i) {
                 uint32_t varNum = 0;
-                if (strgets(line, MAXLINE, &rsInfo) == NULL) {
+                if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
                     goto error;
                 }
                 if (sscanf(line, "%u", &varNum) != 1) {
@@ -726,7 +754,7 @@
 
     // If a callback has been registered to specify a library, use that.
     RSSelectRTCallback selectRTCallback = mCtx->getSelectRTCallback();
-    if (selectRTCallback != NULL) {
+    if (selectRTCallback != nullptr) {
         return selectRTCallback((const char*)bitcode, bitcodeSize);
     }
 
@@ -760,9 +788,9 @@
     script->mHal.info.exportedForeachFuncList = &mExportedForEachFuncList[0];
     script->mHal.info.exportedPragmaCount = mExecutable->getPragmaKeys().size();
     script->mHal.info.exportedPragmaKeyList =
-        const_cast<const char**>(mExecutable->getPragmaKeys().array());
+        const_cast<const char**>(&mExecutable->getPragmaKeys().front());
     script->mHal.info.exportedPragmaValueList =
-        const_cast<const char**>(mExecutable->getPragmaValues().array());
+        const_cast<const char**>(&mExecutable->getPragmaValues().front());
 
     if (mRootExpand) {
         script->mHal.info.root = mRootExpand;
@@ -789,119 +817,8 @@
 
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                                        const void * usr, uint32_t usrLen,
-                                        const RsScriptCall *sc,
-                                        MTLaunchStruct *mtls) {
-
-    memset(mtls, 0, sizeof(MTLaunchStruct));
-
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ain && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-        return;
-    }
-    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
-        return;
-    }
-
-    if (ain != NULL) {
-        const Type *inType = ain->getType();
-
-        mtls->fep.dimX = inType->getDimX();
-        mtls->fep.dimY = inType->getDimY();
-        mtls->fep.dimZ = inType->getDimZ();
-
-    } else if (aout != NULL) {
-        const Type *outType = aout->getType();
-
-        mtls->fep.dimX = outType->getDimX();
-        mtls->fep.dimY = outType->getDimY();
-        mtls->fep.dimZ = outType->getDimZ();
-
-    } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
-
-    if (ain != NULL && aout != NULL) {
-        if (!ain->hasSameDims(aout)) {
-            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
-              "Failed to launch kernel; dimensions of input and output allocations do not match.");
-
-            return;
-        }
-    }
-
-    if (!sc || (sc->xEnd == 0)) {
-        mtls->xEnd = mtls->fep.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls->fep.dimX);
-        rsAssert(sc->xEnd <= mtls->fep.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
-        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
-        if (mtls->xStart >= mtls->xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls->yEnd = mtls->fep.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls->fep.dimY);
-        rsAssert(sc->yEnd <= mtls->fep.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
-        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
-        if (mtls->yStart >= mtls->yEnd) return;
-    }
-
-    if (!sc || (sc->zEnd == 0)) {
-        mtls->zEnd = mtls->fep.dimZ;
-    } else {
-        rsAssert(sc->zStart < mtls->fep.dimZ);
-        rsAssert(sc->zEnd <= mtls->fep.dimZ);
-        rsAssert(sc->zStart < sc->zEnd);
-        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
-        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
-        if (mtls->zStart >= mtls->zEnd) return;
-    }
-
-    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
-    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
-    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
-    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
-
-    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
-    mtls->rsc = mCtx;
-    mtls->ain = ain;
-    mtls->aout = aout;
-    mtls->fep.usr = usr;
-    mtls->fep.usrLen = usrLen;
-    mtls->mSliceSize = 1;
-    mtls->mSliceNum = 0;
-
-    mtls->fep.ptrIn = NULL;
-    mtls->fep.eStrideIn = 0;
-    mtls->isThreadable = mIsThreadable;
-
-    if (ain) {
-        mtls->fep.ptrIn = (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls->fep.yStrideIn = ain->mHal.drvState.lod[0].stride;
-    }
-
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
-    }
-}
-
-void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
+                                        uint32_t inLen,
                                         Allocation * aout,
                                         const void * usr, uint32_t usrLen,
                                         const RsScriptCall *sc,
@@ -909,24 +826,28 @@
 
     memset(mtls, 0, sizeof(MTLaunchStruct));
 
-    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
-    if (ains != NULL) {
-        for (int index = inLen; --index >= 0;) {
-            const Allocation* ain = ains[index];
+    for (int index = inLen; --index >= 0;) {
+        const Allocation* ain = ains[index];
 
-            if (ain != NULL && (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
-                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
-                return;
-            }
+        // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+        if (ain != nullptr &&
+            (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == nullptr) {
+
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                         "rsForEach called with null in allocations");
+            return;
         }
     }
 
-    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+    if (aout &&
+        (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == nullptr) {
+
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null out allocations");
         return;
     }
 
-    if (ains != NULL) {
+    if (inLen > 0) {
         const Allocation *ain0   = ains[0];
         const Type       *inType = ain0->getType();
 
@@ -943,7 +864,7 @@
             }
         }
 
-    } else if (aout != NULL) {
+    } else if (aout != nullptr) {
         const Type *outType = aout->getType();
 
         mtls->fep.dimX = outType->getDimX();
@@ -951,11 +872,12 @@
         mtls->fep.dimZ = outType->getDimZ();
 
     } else {
-        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                                     "rsForEach called with null allocations");
         return;
     }
 
-    if (ains != NULL && aout != NULL) {
+    if (inLen > 0 && aout != nullptr) {
         if (!ains[0]->hasSameDims(aout)) {
             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
               "Failed to launch kernel; dimensions of input and output allocations do not match.");
@@ -1002,7 +924,7 @@
     mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
     mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
 
-    rsAssert(!ains || (ains[0]->getType()->getDimZ() == 0));
+    rsAssert(inLen == 0 || (ains[0]->getType()->getDimZ() == 0));
 
     mtls->rsc        = mCtx;
     mtls->ains       = ains;
@@ -1012,18 +934,28 @@
     mtls->mSliceSize = 1;
     mtls->mSliceNum  = 0;
 
-    mtls->fep.ptrIns    = NULL;
-    mtls->fep.eStrideIn = 0;
+    mtls->fep.inPtrs    = nullptr;
+    mtls->fep.inStrides = nullptr;
     mtls->isThreadable  = mIsThreadable;
 
-    if (ains) {
-        mtls->fep.ptrIns    = new const uint8_t*[inLen];
-        mtls->fep.inStrides = new StridePair[inLen];
+    if (inLen > 0) {
+
+        if (inLen <= RS_KERNEL_INPUT_THRESHOLD) {
+            mtls->fep.inPtrs    = (const uint8_t**)mtls->inPtrsBuff;
+            mtls->fep.inStrides = mtls->inStridesBuff;
+        } else {
+            mtls->fep.heapAllocatedArrays = true;
+
+            mtls->fep.inPtrs    = new const uint8_t*[inLen];
+            mtls->fep.inStrides = new StridePair[inLen];
+        }
+
+        mtls->fep.inLen = inLen;
 
         for (int index = inLen; --index >= 0;) {
             const Allocation *ain = ains[index];
 
-            mtls->fep.ptrIns[index] =
+            mtls->fep.inPtrs[index] =
               (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
 
             mtls->fep.inStrides[index].eStride =
@@ -1033,41 +965,27 @@
         }
     }
 
-    mtls->fep.ptrOut = NULL;
-    mtls->fep.eStrideOut = 0;
-    if (aout) {
-        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
-        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    mtls->fep.outPtr            = nullptr;
+    mtls->fep.outStride.eStride = 0;
+    mtls->fep.outStride.yStride = 0;
+    if (aout != nullptr) {
+        mtls->fep.outPtr = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+
+        mtls->fep.outStride.eStride = aout->getType()->getElementSizeBytes();
+        mtls->fep.outStride.yStride = aout->mHal.drvState.lod[0].stride;
     }
 }
 
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
-                                     const Allocation * ain,
+                                     const Allocation ** ains,
+                                     uint32_t inLen,
                                      Allocation * aout,
                                      const void * usr,
                                      uint32_t usrLen,
                                      const RsScriptCall *sc) {
 
     MTLaunchStruct mtls;
-    forEachMtlsSetup(ain, aout, usr, usrLen, sc, &mtls);
-    forEachKernelSetup(slot, &mtls);
-
-    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
-    mCtx->launchThreads(ain, aout, sc, &mtls);
-    mCtx->setTLS(oldTLS);
-}
-
-void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
-                                          const Allocation ** ains,
-                                          uint32_t inLen,
-                                          Allocation * aout,
-                                          const void * usr,
-                                          uint32_t usrLen,
-                                          const RsScriptCall *sc) {
-
-    MTLaunchStruct mtls;
 
     forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
     forEachKernelSetup(slot, &mtls);
@@ -1084,11 +1002,11 @@
     rsAssert(slot < mExecutable->getExportForeachFuncAddrs().size());
     mtls->kernel = reinterpret_cast<ForEachFunc_t>(
                       mExecutable->getExportForeachFuncAddrs()[slot]);
-    rsAssert(mtls->kernel != NULL);
+    rsAssert(mtls->kernel != nullptr);
     mtls->sig = mExecutable->getInfo().getExportForeachFuncs()[slot].second;
 #else
     mtls->kernel = reinterpret_cast<ForEachFunc_t>(mForEachFunctions[slot]);
-    rsAssert(mtls->kernel != NULL);
+    rsAssert(mtls->kernel != nullptr);
     mtls->sig = mForEachSignatures[slot];
 #endif
 }
@@ -1224,7 +1142,7 @@
         return;
     }
 
-    void *ptr = NULL;
+    void *ptr = nullptr;
     mBoundAllocs[slot] = data;
     if(data) {
         ptr = data->mHal.drvState.lod[0].mallocPtr;
@@ -1255,9 +1173,9 @@
 RsdCpuScriptImpl::~RsdCpuScriptImpl() {
 #ifndef RS_COMPATIBILITY_LIB
     if (mExecutable) {
-        Vector<void *>::const_iterator var_addr_iter =
+        std::vector<void *>::const_iterator var_addr_iter =
             mExecutable->getExportVarAddrs().begin();
-        Vector<void *>::const_iterator var_addr_end =
+        std::vector<void *>::const_iterator var_addr_end =
             mExecutable->getExportVarAddrs().end();
 
         bcc::RSInfo::ObjectSlotListTy::const_iterator is_object_iter =
@@ -1267,12 +1185,12 @@
 
         while ((var_addr_iter != var_addr_end) &&
                (is_object_iter != is_object_end)) {
-            // The field address can be NULL if the script-side has optimized
+            // The field address can be nullptr if the script-side has optimized
             // the corresponding global variable away.
             rs_object_base *obj_addr =
                 reinterpret_cast<rs_object_base *>(*var_addr_iter);
             if (*is_object_iter) {
-                if (*var_addr_iter != NULL && mCtx->getContext() != NULL) {
+                if (*var_addr_iter != nullptr && mCtx->getContext() != nullptr) {
                     rsrClearObject(mCtx->getContext(), obj_addr);
                 }
             }
@@ -1301,7 +1219,7 @@
     if (mFieldIsObject) {
         for (size_t i = 0; i < mExportedVariableCount; ++i) {
             if (mFieldIsObject[i]) {
-                if (mFieldAddress[i] != NULL) {
+                if (mFieldAddress[i] != nullptr) {
                     rs_object_base *obj_addr =
                         reinterpret_cast<rs_object_base *>(mFieldAddress[i]);
                     rsrClearObject(mCtx->getContext(), obj_addr);
@@ -1324,7 +1242,7 @@
 
 Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
     if (!ptr) {
-        return NULL;
+        return nullptr;
     }
 
     for (uint32_t ct=0; ct < mScript->mHal.info.exportedVariableCount; ct++) {
@@ -1335,20 +1253,18 @@
         }
     }
     ALOGE("rsGetAllocation, failed to find %p", ptr);
-    return NULL;
+    return nullptr;
 }
 
-void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation * ain,
-                       Allocation * aout, const void * usr,
-                       uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
+                                 uint32_t inLen, Allocation * aout,
+                                 const void * usr, uint32_t usrLen,
+                                 const RsScriptCall *sc) {}
 
-void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation * ain,
-                        Allocation * aout, const void * usr,
-                        uint32_t usrLen, const RsScriptCall *sc)
-{
-}
+void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
+                                  uint32_t inLen, Allocation * aout,
+                                  const void * usr, uint32_t usrLen,
+                                  const RsScriptCall *sc) {}
 
 
 }
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index f4ca1ed..78111ea 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -47,9 +47,9 @@
 class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
 public:
     typedef void (*outer_foreach_t)(
-        const RsForEachStubParamStruct *,
+        const RsExpandKernelParams *,
         uint32_t x1, uint32_t x2,
-        uint32_t instep, uint32_t outstep);
+        uint32_t outstep);
 #ifdef RS_COMPATIBILITY_LIB
     typedef void (* InvokeFunc_t)(void);
     typedef void (* ForEachFunc_t)(void);
@@ -59,31 +59,27 @@
 
     bool init(char const *resName, char const *cacheDir,
               uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags,
-              char const *bccPluginName = NULL);
+              char const *bccPluginName = nullptr);
     virtual void populateScript(Script *);
 
     virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength);
     virtual int invokeRoot();
-    virtual void preLaunch(uint32_t slot, const Allocation * ain,
-                           Allocation * aout, const void * usr,
+    virtual void preLaunch(uint32_t slot, const Allocation ** ains,
+                           uint32_t inLen, Allocation * aout, const void * usr,
                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void postLaunch(uint32_t slot, const Allocation * ain,
-                            Allocation * aout, const void * usr,
-                            uint32_t usrLen, const RsScriptCall *sc);
-    virtual void invokeForEach(uint32_t slot,
-                       const Allocation * ain,
-                       Allocation * aout,
-                       const void * usr,
-                       uint32_t usrLen,
-                       const RsScriptCall *sc);
+    virtual void postLaunch(uint32_t slot, const Allocation ** ains,
+                            uint32_t inLen, Allocation * aout,
+                            const void * usr, uint32_t usrLen,
+                            const RsScriptCall *sc);
 
-    virtual void invokeForEachMulti(uint32_t slot,
-                                     const Allocation** ains,
-                                     uint32_t inLen,
-                                     Allocation* aout,
-                                     const void* usr,
-                                     uint32_t usrLen,
-                                     const RsScriptCall* sc);
+    virtual void invokeForEach(uint32_t slot,
+                               const Allocation ** ains,
+                               uint32_t inLen,
+                               Allocation* aout,
+                               const void* usr,
+                               uint32_t usrLen,
+                               const RsScriptCall* sc);
+
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -100,10 +96,6 @@
 
     const Script * getScript() {return mScript;}
 
-    void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
-                          const void * usr, uint32_t usrLen,
-                          const RsScriptCall *sc, MTLaunchStruct *mtls);
-
     void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
                           Allocation * aout, const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index a9de00c..751bafb 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -44,76 +44,93 @@
 }
 
 
-typedef void (*ScriptGroupRootFunc_t)(const RsForEachStubParamStruct *p,
+typedef void (*ScriptGroupRootFunc_t)(const RsExpandKernelParams *kparams,
                                       uint32_t xstart, uint32_t xend,
-                                      uint32_t instep, uint32_t outstep);
+                                      uint32_t outstep);
 
-void CpuScriptGroupImpl::scriptGroupRoot(const RsForEachStubParamStruct *p,
+void CpuScriptGroupImpl::scriptGroupRoot(const RsExpandKernelParams *kparams,
                                          uint32_t xstart, uint32_t xend,
-                                         uint32_t instep, uint32_t outstep) {
+                                         uint32_t outstep) {
 
 
-    const ScriptList *sl = (const ScriptList *)p->usr;
-    RsForEachStubParamStruct *mp = (RsForEachStubParamStruct *)p;
-    const void *oldUsr = p->usr;
+    const ScriptList *sl           = (const ScriptList *)kparams->usr;
+    RsExpandKernelParams *mkparams = (RsExpandKernelParams *)kparams;
 
-    for(size_t ct=0; ct < sl->count; ct++) {
+    const void **oldIns  = mkparams->ins;
+    uint32_t *oldStrides = mkparams->inEStrides;
+
+    void *localIns[1];
+    uint32_t localStride[1];
+
+    mkparams->ins        = (const void**)localIns;
+    mkparams->inEStrides = localStride;
+
+    for (size_t ct = 0; ct < sl->count; ct++) {
         ScriptGroupRootFunc_t func;
-        func = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
-        mp->usr = sl->usrPtrs[ct];
-
-        mp->ptrIn = NULL;
-        mp->in = NULL;
-        mp->ptrOut = NULL;
-        mp->out = NULL;
-
-        uint32_t istep = 0;
-        uint32_t ostep = 0;
+        func          = (ScriptGroupRootFunc_t)sl->fnPtrs[ct];
+        mkparams->usr = sl->usrPtrs[ct];
 
         if (sl->ins[ct]) {
-            mp->ptrIn = (const uint8_t *)sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
-            istep = sl->ins[ct]->mHal.state.elementSizeBytes;
-            mp->in = mp->ptrIn;
+            localIns[0] = sl->ins[ct]->mHal.drvState.lod[0].mallocPtr;
+
+            localStride[0] = sl->ins[ct]->mHal.state.elementSizeBytes;
+
             if (sl->inExts[ct]) {
-                mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->ins[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->in = mp->ptrIn + sl->ins[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->y);
+
+            } else if (sl->ins[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
+                localIns[0] = (void*)
+                  ((const uint8_t *)localIns[0] +
+                   sl->ins[ct]->mHal.drvState.lod[0].stride * kparams->lid);
             }
+
+        } else {
+            localIns[0]    = nullptr;
+            localStride[0] = 0;
         }
 
+        uint32_t ostep;
         if (sl->outs[ct]) {
-            mp->ptrOut = (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
-            mp->out = mp->ptrOut;
+            mkparams->out =
+              (uint8_t *)sl->outs[ct]->mHal.drvState.lod[0].mallocPtr;
+
             ostep = sl->outs[ct]->mHal.state.elementSizeBytes;
+
             if (sl->outExts[ct]) {
-                mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->y;
-            } else {
-                if (sl->outs[ct]->mHal.drvState.lod[0].dimY > p->lid) {
-                    mp->out = mp->ptrOut + sl->outs[ct]->mHal.drvState.lod[0].stride * p->lid;
-                }
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->y;
+
+            } else if (sl->outs[ct]->mHal.drvState.lod[0].dimY > kparams->lid) {
+                mkparams->out =
+                  (uint8_t *)mkparams->out +
+                  sl->outs[ct]->mHal.drvState.lod[0].stride * kparams->lid;
             }
+        } else {
+            mkparams->out = nullptr;
+            ostep         = 0;
         }
 
-        //ALOGE("kernel %i %p,%p  %p,%p", ct, mp->ptrIn, mp->in, mp->ptrOut, mp->out);
+        //ALOGE("kernel %zu %p  %p", ct, localIns[0], mkparams->out);
-        func(p, xstart, xend, istep, ostep);
+        func(kparams, xstart, xend, ostep);
     }
     //ALOGE("script group root");
 
-    //ConvolveParams *cp = (ConvolveParams *)p->usr;
-
-    mp->usr = oldUsr;
+    mkparams->ins        = oldIns;
+    mkparams->inEStrides = oldStrides;
+    mkparams->usr        = sl;
 }
 
 
 
 void CpuScriptGroupImpl::execute() {
-    Vector<Allocation *> ins;
-    Vector<bool> inExts;
-    Vector<Allocation *> outs;
-    Vector<bool> outExts;
-    Vector<const ScriptKernelID *> kernels;
+    std::vector<Allocation *> ins;
+    std::vector<char> inExts;
+    std::vector<Allocation *> outs;
+    std::vector<char> outExts;
+    std::vector<const ScriptKernelID *> kernels;
     bool fieldDep = false;
 
     for (size_t ct=0; ct < mSG->mNodes.size(); ct++) {
@@ -136,8 +153,8 @@
 
         for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
             const ScriptKernelID *k = n->mKernels[ct2];
-            Allocation *ain = NULL;
-            Allocation *aout = NULL;
+            Allocation *ain = nullptr;
+            Allocation *aout = nullptr;
             bool inExt = false;
             bool outExt = false;
 
@@ -147,7 +164,7 @@
                     break;
                 }
             }
-            if (ain == NULL) {
+            if (ain == nullptr) {
                 for (size_t ct3=0; ct3 < mSG->mInputs.size(); ct3++) {
                     if (mSG->mInputs[ct3]->mKernel == k) {
                         ain = mSG->mInputs[ct3]->mAlloc.get();
@@ -160,13 +177,13 @@
             for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
                 if (n->mOutputs[ct3]->mSource.get() == k) {
                     aout = n->mOutputs[ct3]->mAlloc.get();
-                    if(n->mOutputs[ct3]->mDstField.get() != NULL) {
+                    if(n->mOutputs[ct3]->mDstField.get() != nullptr) {
                         fieldDep = true;
                     }
                     break;
                 }
             }
-            if (aout == NULL) {
+            if (aout == nullptr) {
                 for (size_t ct3=0; ct3 < mSG->mOutputs.size(); ct3++) {
                     if (mSG->mOutputs[ct3]->mKernel == k) {
                         aout = mSG->mOutputs[ct3]->mAlloc.get();
@@ -176,72 +193,124 @@
                 }
             }
 
-            rsAssert((k->mHasKernelOutput == (aout != NULL)) &&
-                     (k->mHasKernelInput == (ain != NULL)));
+            rsAssert((k->mHasKernelOutput == (aout != nullptr)) &&
+                     (k->mHasKernelInput == (ain != nullptr)));
 
-            ins.add(ain);
-            inExts.add(inExt);
-            outs.add(aout);
-            outExts.add(outExt);
-            kernels.add(k);
+            ins.push_back(ain);
+            inExts.push_back(inExt);
+            outs.push_back(aout);
+            outExts.push_back(outExt);
+            kernels.push_back(k);
         }
 
     }
 
     MTLaunchStruct mtls;
 
-    if(fieldDep) {
+    if (fieldDep) {
         for (size_t ct=0; ct < ins.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
             uint32_t slot = kernels[ct]->mSlot;
 
-            si->forEachMtlsSetup(ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+            uint32_t inLen;
+            const Allocation **ains;
+
+            if (ins[ct] == nullptr) {
+                inLen = 0;
+                ains  = nullptr;
+
+            } else {
+                inLen = 1;
+                ains  = const_cast<const Allocation**>(&ins[ct]);
+            }
+
+            si->forEachMtlsSetup(ains, inLen, outs[ct], nullptr, 0, nullptr, &mtls);
+
             si->forEachKernelSetup(slot, &mtls);
-            si->preLaunch(slot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
-            mCtx->launchThreads(ins[ct], outs[ct], NULL, &mtls);
-            si->postLaunch(slot, ins[ct], outs[ct], NULL, 0, NULL);
+            si->preLaunch(slot, ains, inLen, outs[ct], mtls.fep.usr,
+                          mtls.fep.usrLen, nullptr);
+
+            mCtx->launchThreads(ains, inLen, outs[ct], nullptr, &mtls);
+
+            si->postLaunch(slot, ains, inLen, outs[ct], nullptr, 0, nullptr);
         }
     } else {
         ScriptList sl;
-        sl.ins = ins.array();
-        sl.outs = outs.array();
-        sl.kernels = kernels.array();
-        sl.count = kernels.size();
 
-        Vector<const void *> usrPtrs;
-        Vector<const void *> fnPtrs;
-        Vector<uint32_t> sigs;
+        /*
+         * TODO: This is a hacky way of doing this and should be replaced by a
+         *       call to std::vector's data() member once we have a C++11
+         *       version of the STL.
+         */
+        sl.ins     = &ins.front();
+        sl.outs    = &outs.front();
+        sl.kernels = &kernels.front();
+        sl.count   = kernels.size();
+
+        uint32_t inLen;
+        const Allocation **ains;
+
+        if (ins[0] == nullptr) {
+            inLen = 0;
+            ains  = nullptr;
+
+        } else {
+            inLen = 1;
+            ains  = const_cast<const Allocation**>(&ins[0]);
+        }
+
+        std::vector<const void *> usrPtrs;
+        std::vector<const void *> fnPtrs;
+        std::vector<uint32_t> sigs;
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
 
             si->forEachKernelSetup(kernels[ct]->mSlot, &mtls);
-            fnPtrs.add((void *)mtls.kernel);
-            usrPtrs.add(mtls.fep.usr);
-            sigs.add(mtls.fep.usrLen);
-            si->preLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], mtls.fep.usr, mtls.fep.usrLen, NULL);
+            fnPtrs.push_back((void *)mtls.kernel);
+            usrPtrs.push_back(mtls.fep.usr);
+            sigs.push_back(mtls.fep.usrLen);
+
+            // Hand each kernel its own input; ains/inLen above describe
+            // kernels[0] only and are reserved for the group-wide launch.
+            const uint32_t kInLen = (ins[ct] == nullptr) ? 0 : 1;
+            const Allocation **kIns = (kInLen == 0) ? nullptr :
+                const_cast<const Allocation**>(&ins[ct]);
+
+            si->preLaunch(kernels[ct]->mSlot, kIns, kInLen, outs[ct],
+                          mtls.fep.usr, mtls.fep.usrLen, nullptr);
         }
-        sl.sigs = sigs.array();
-        sl.usrPtrs = usrPtrs.array();
-        sl.fnPtrs = fnPtrs.array();
-        sl.inExts = inExts.array();
-        sl.outExts = outExts.array();
+
+        sl.sigs    = &sigs.front();
+        sl.usrPtrs = &usrPtrs.front();
+        sl.fnPtrs  = &fnPtrs.front();
+
+        sl.inExts  = (bool*)&inExts.front();
+        sl.outExts = (bool*)&outExts.front();
 
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-        si->forEachMtlsSetup(ins[0], outs[0], NULL, 0, NULL, &mtls);
-        mtls.script = NULL;
+
+        si->forEachMtlsSetup(ains, inLen, outs[0], nullptr, 0, nullptr, &mtls);
+
+        mtls.script = nullptr;
         mtls.kernel = (void (*)())&scriptGroupRoot;
         mtls.fep.usr = &sl;
-        mCtx->launchThreads(ins[0], outs[0], NULL, &mtls);
+
+        mCtx->launchThreads(ains, inLen, outs[0], nullptr, &mtls);
 
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
-            si->postLaunch(kernels[ct]->mSlot, ins[ct], outs[ct], NULL, 0, NULL);
+
+            // Mirror preLaunch: pass this kernel's own input, not kernels[0]'s.
+            const uint32_t kInLen = (ins[ct] == nullptr) ? 0 : 1;
+            const Allocation **kIns = (kInLen == 0) ? nullptr :
+                const_cast<const Allocation**>(&ins[ct]);
+
+            si->postLaunch(kernels[ct]->mSlot, kIns, kInLen, outs[ct], nullptr, 0,
+                           nullptr);
         }
     }
 }
-
-
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 78e179d..1a4af05 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -33,9 +33,9 @@
     CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
     bool init();
 
-    static void scriptGroupRoot(const RsForEachStubParamStruct *p,
+    static void scriptGroupRoot(const RsExpandKernelParams *p,
                                 uint32_t xstart, uint32_t xend,
-                                uint32_t instep, uint32_t outstep);
+                                uint32_t outstep);
 
 protected:
     struct ScriptList {
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 0076cb9..b0e924e 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -69,21 +69,15 @@
         virtual void populateScript(Script *) = 0;
         virtual void invokeFunction(uint32_t slot, const void *params, size_t paramLength) = 0;
         virtual int invokeRoot() = 0;
+
         virtual void invokeForEach(uint32_t slot,
-                           const Allocation * ain,
-                           Allocation * aout,
-                           const void * usr,
-                           uint32_t usrLen,
-                           const RsScriptCall *sc) = 0;
-                           
-        virtual void invokeForEachMulti(uint32_t slot,
-                                         const Allocation** ains,
-                                         uint32_t inLen,
-                                         Allocation * aout,
-                                         const void * usr,
-                                         uint32_t usrLen,
-                                         const RsScriptCall *sc) = 0;
-        
+                                   const Allocation ** ains,
+                                   uint32_t inLen,
+                                   Allocation * aout,
+                                   const void * usr,
+                                   uint32_t usrLen,
+                                   const RsScriptCall *sc) = 0;
+
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
@@ -118,9 +112,9 @@
     static RsdCpuReference * create(Context *c, uint32_t version_major,
                                     uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn
 #ifndef RS_COMPATIBILITY_LIB
-                                    , bcc::RSLinkRuntimeCallback pLinkRuntimeCallback = NULL,
-                                    RSSelectRTCallback pSelectRTCallback = NULL,
-                                    const char *pBccPluginName = NULL
+                                    , bcc::RSLinkRuntimeCallback pLinkRuntimeCallback = nullptr,
+                                    RSSelectRTCallback pSelectRTCallback = nullptr,
+                                    const char *pBccPluginName = nullptr
 #endif
                                     );
     virtual ~RsdCpuReference();
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 9a40756..413f1ad 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -193,7 +193,7 @@
     if (!(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_SCRIPT)) {
         if (alloc->mHal.drvState.lod[0].mallocPtr) {
             free(alloc->mHal.drvState.lod[0].mallocPtr);
-            alloc->mHal.drvState.lod[0].mallocPtr = NULL;
+            alloc->mHal.drvState.lod[0].mallocPtr = nullptr;
         }
     }
     rsdGLCheckError(rsc, "UploadToTexture");
@@ -360,7 +360,7 @@
     // We align all allocations to a 16-byte boundary.
     uint8_t* ptr = (uint8_t *)memalign(16, allocSize);
     if (!ptr) {
-        return NULL;
+        return nullptr;
     }
     if (forceZero) {
         memset(ptr, 0, allocSize);
@@ -376,15 +376,15 @@
     alloc->mHal.drv = drv;
 
     // Calculate the object size.
-    size_t allocSize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), NULL);
+    size_t allocSize = AllocationBuildPointerTable(rsc, alloc, alloc->getType(), nullptr);
 
-    uint8_t * ptr = NULL;
+    uint8_t * ptr = nullptr;
     if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_OUTPUT) {
 
     } else if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_INPUT) {
         // Allocation is allocated when the surface is created
         // in getSurface
-    } else if (alloc->mHal.state.userProvidedPtr != NULL) {
+    } else if (alloc->mHal.state.userProvidedPtr != nullptr) {
         // user-provided allocation
         // limitations: no faces, no LOD, USAGE_SCRIPT or SCRIPT+TEXTURE only
         if (!(alloc->mHal.state.usageFlags == (RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_SHARED) ||
@@ -405,7 +405,7 @@
 
             ptr = allocAlignedMemory(allocSize, forceZero);
             if (!ptr) {
-                alloc->mHal.drv = NULL;
+                alloc->mHal.drv = nullptr;
                 free(drv);
                 return false;
             }
@@ -417,7 +417,7 @@
     } else {
         ptr = allocAlignedMemory(allocSize, forceZero);
         if (!ptr) {
-            alloc->mHal.drv = NULL;
+            alloc->mHal.drv = nullptr;
             free(drv);
             return false;
         }
@@ -456,7 +456,7 @@
     }
 
 
-    drv->readBackFBO = NULL;
+    drv->readBackFBO = nullptr;
 
     // fill out the initial state of the buffer if we couldn't use the user-provided ptr and USAGE_SHARED was accepted
     if ((alloc->mHal.state.userProvidedPtr != 0) && (drv->useUserProvidedPtr == false)) {
@@ -500,13 +500,13 @@
             !(alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_OUTPUT)) {
                 free(alloc->mHal.drvState.lod[0].mallocPtr);
         }
-        alloc->mHal.drvState.lod[0].mallocPtr = NULL;
+        alloc->mHal.drvState.lod[0].mallocPtr = nullptr;
     }
 
 #ifndef RS_COMPATIBILITY_LIB
-    if (drv->readBackFBO != NULL) {
+    if (drv->readBackFBO != nullptr) {
         delete drv->readBackFBO;
-        drv->readBackFBO = NULL;
+        drv->readBackFBO = nullptr;
     }
 
     if ((alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_OUTPUT) &&
@@ -519,15 +519,15 @@
             mapper.unlock(drv->wndBuffer->handle);
             int32_t r = nw->queueBuffer(nw, drv->wndBuffer, -1);
 
-            drv->wndSurface = NULL;
+            drv->wndSurface = nullptr;
             native_window_api_disconnect(nw, NATIVE_WINDOW_API_CPU);
-            nw->decStrong(NULL);
+            nw->decStrong(nullptr);
         }
     }
 #endif
 
     free(drv);
-    alloc->mHal.drv = NULL;
+    alloc->mHal.drv = nullptr;
 }
 
 void rsdAllocationResize(const Context *rsc, const Allocation *alloc,
@@ -542,7 +542,7 @@
     }
     void * oldPtr = alloc->mHal.drvState.lod[0].mallocPtr;
     // Calculate the object size
-    size_t s = AllocationBuildPointerTable(rsc, alloc, newType, NULL);
+    size_t s = AllocationBuildPointerTable(rsc, alloc, newType, nullptr);
     uint8_t *ptr = (uint8_t *)realloc(oldPtr, s);
     // Build the relative pointer tables.
     size_t verifySize = AllocationBuildPointerTable(rsc, alloc, newType, ptr);
@@ -571,7 +571,7 @@
     if (!drv->textureID && !drv->renderTargetID) {
         return; // nothing was rendered here yet, so nothing to sync
     }
-    if (drv->readBackFBO == NULL) {
+    if (drv->readBackFBO == nullptr) {
         drv->readBackFBO = new RsdFrameBufferObj();
         drv->readBackFBO->setColorTarget(drv, 0);
         drv->readBackFBO->setDimensions(alloc->getType()->getDimX(),
@@ -656,7 +656,7 @@
     GraphicBufferMapper &mapper = GraphicBufferMapper::get();
     Rect bounds(drv->wndBuffer->width, drv->wndBuffer->height);
 
-    void *dst = NULL;
+    void *dst = nullptr;
     mapper.lock(drv->wndBuffer->handle,
             GRALLOC_USAGE_SW_READ_NEVER | GRALLOC_USAGE_SW_WRITE_OFTEN,
             bounds, &dst);
@@ -674,7 +674,7 @@
     ANativeWindow *old = drv->wndSurface;
 
     if (nw) {
-        nw->incStrong(NULL);
+        nw->incStrong(nullptr);
     }
 
     if (alloc->mHal.state.usageFlags & RS_ALLOCATION_USAGE_GRAPHICS_RENDER_TARGET) {
@@ -689,13 +689,13 @@
         GraphicBufferMapper &mapper = GraphicBufferMapper::get();
         mapper.unlock(drv->wndBuffer->handle);
         old->cancelBuffer(old, drv->wndBuffer, -1);
-        drv->wndSurface = NULL;
+        drv->wndSurface = nullptr;
 
         native_window_api_disconnect(old, NATIVE_WINDOW_API_CPU);
-        old->decStrong(NULL);
+        old->decStrong(nullptr);
     }
 
-    if (nw != NULL) {
+    if (nw != nullptr) {
         int32_t r;
         uint32_t flags = 0;
 
@@ -747,7 +747,7 @@
  error:
 
     if (nw) {
-        nw->decStrong(NULL);
+        nw->decStrong(nullptr);
     }
 
 
@@ -1202,16 +1202,14 @@
 {
     obj->p = alloc;
 #ifdef __LP64__
-    if (alloc != NULL) {
+    if (alloc != nullptr) {
         obj->r = alloc->mHal.drvState.lod[0].mallocPtr;
         obj->v1 = alloc->mHal.drv;
         obj->v2 = (void *)alloc->mHal.drvState.lod[0].stride;
     } else {
-        obj->r = NULL;
-        obj->v1 = NULL;
-        obj->v2 = NULL;
+        obj->r = nullptr;
+        obj->v1 = nullptr;
+        obj->v2 = nullptr;
     }
 #endif
 }
-
-
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 27029cf..811fa3e 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <vector>
+
 #include "../cpu_ref/rsd_cpu.h"
 
 #include "rsdCore.h"
@@ -26,7 +28,6 @@
 #include "rsScriptC.h"
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-#include "utils/Vector.h"
 #include "utils/Timers.h"
 #include "utils/StopWatch.h"
 #endif
@@ -43,9 +44,10 @@
                      size_t bitcodeSize,
                      uint32_t flags) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
-    RsdCpuReference::CpuScript * cs = dc->mCpuRef->createScript(script, resName, cacheDir,
-                                                                bitcode, bitcodeSize, flags);
-    if (cs == NULL) {
+    RsdCpuReference::CpuScript * cs =
+        dc->mCpuRef->createScript(script, resName, cacheDir, bitcode,
+                                  bitcodeSize, flags);
+    if (cs == nullptr) {
         return false;
     }
     script->mHal.drv = cs;
@@ -53,10 +55,11 @@
     return true;
 }
 
-bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid, Element *e) {
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrinsicID iid,
+                      Element *e) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     RsdCpuReference::CpuScript * cs = dc->mCpuRef->createIntrinsic(s, iid, e);
-    if (cs == NULL) {
+    if (cs == nullptr) {
         return false;
     }
     s->mHal.drv = cs;
@@ -73,8 +76,15 @@
                             size_t usrLen,
                             const RsScriptCall *sc) {
 
-    RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEach(slot, ain, aout, usr, usrLen, sc);
+    if (ain == nullptr) {
+        rsdScriptInvokeForEachMulti(rsc, s, slot, nullptr, 0, aout, usr, usrLen,
+                                    sc);
+    } else {
+        const Allocation *ains[1] = {ain};
+
+        rsdScriptInvokeForEachMulti(rsc, s, slot, ains, 1, aout, usr, usrLen,
+                                    sc);
+    }
 }
 
 void rsdScriptInvokeForEachMulti(const Context *rsc,
@@ -88,7 +98,7 @@
                                  const RsScriptCall *sc) {
 
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
-    cs->invokeForEachMulti(slot, ains, inLen, aout, usr, usrLen, sc);
+    cs->invokeForEach(slot, ains, inLen, aout, usr, usrLen, sc);
 }
 
 
@@ -149,7 +159,7 @@
 void rsdScriptDestroy(const Context *dc, Script *s) {
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
     delete cs;
-    s->mHal.drv = NULL;
+    s->mHal.drv = nullptr;
 }
 
 
@@ -166,12 +176,12 @@
 {
     obj->p = script;
 #ifdef __LP64__
-    obj->r = NULL;
-    if (script != NULL) {
+    obj->r = nullptr;
+    if (script != nullptr) {
         obj->v1 = script->mHal.drv;
     } else {
-        obj->v1 = NULL;
+        obj->v1 = nullptr;
     }
-    obj->v2 = NULL;
+    obj->v2 = nullptr;
 #endif
 }
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index 37c9755..48959fd 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -53,7 +53,7 @@
 #ifndef RS_COMPATIBILITY_LIB
     #define NATIVE_FUNC(a) a
 #else
-    #define NATIVE_FUNC(a) NULL
+    #define NATIVE_FUNC(a) nullptr
 #endif
 
 
@@ -64,7 +64,7 @@
     NATIVE_FUNC(rsdGLSwap),
 
     Shutdown,
-    NULL,
+    nullptr,
     SetPriority,
     rsdAllocRuntimeMem,
     rsdFreeRuntimeMem,
@@ -169,7 +169,7 @@
         rsdScriptGroupSetOutput,
         rsdScriptGroupExecute,
         rsdScriptGroupDestroy,
-        NULL
+        nullptr
     },
 
     {
@@ -184,7 +184,7 @@
         rsdElementUpdateCachedObject
     },
 
-    NULL // finish
+    nullptr // finish
 };
 
 extern const RsdCpuReference::CpuSymbol * rsdLookupRuntimeStub(Context * pContext, char const* name);
@@ -209,7 +209,7 @@
                                           &rsdLookupRuntimeStub, &LookupScript);
     if (!dc->mCpuRef) {
         ALOGE("RsdCpuReference::create for driver hal failed.");
-        rsc->mHal.drv = NULL;
+        rsc->mHal.drv = nullptr;
         free(dc);
         return false;
     }
@@ -217,7 +217,7 @@
 #ifndef RS_COMPATIBILITY_LIB
     // Set a callback for compiler setup here.
     if (false) {
-        dc->mCpuRef->setSetupCompilerCallback(NULL);
+        dc->mCpuRef->setSetupCompilerCallback(nullptr);
     }
 
     // Set a callback for switching MemChunk's allocator here.
@@ -251,7 +251,7 @@
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
     delete dc->mCpuRef;
     free(dc);
-    rsc->mHal.drv = NULL;
+    rsc->mHal.drv = nullptr;
 }
 
 void* rsdAllocRuntimeMem(size_t size, uint32_t flags) {
diff --git a/driver/rsdElement.cpp b/driver/rsdElement.cpp
index c0413e4..06a0790 100644
--- a/driver/rsdElement.cpp
+++ b/driver/rsdElement.cpp
@@ -35,9 +35,9 @@
 {
     obj->p = element;
 #ifdef __LP64__
-    obj->r = NULL;
-    obj->v1 = NULL;
-    obj->v2 = NULL;
+    obj->r = nullptr;
+    obj->v1 = nullptr;
+    obj->v2 = nullptr;
 #endif
 }
 
diff --git a/driver/rsdFrameBuffer.cpp b/driver/rsdFrameBuffer.cpp
index bb07d29..f458bf9 100644
--- a/driver/rsdFrameBuffer.cpp
+++ b/driver/rsdFrameBuffer.cpp
@@ -32,8 +32,8 @@
 void setDepthAttachment(const Context *rsc, const FBOCache *fb) {
     RsdFrameBufferObj *fbo = (RsdFrameBufferObj*)fb->mHal.drv;
 
-    DrvAllocation *depth = NULL;
-    if (fb->mHal.state.depthTarget != NULL) {
+    DrvAllocation *depth = nullptr;
+    if (fb->mHal.state.depthTarget != nullptr) {
         depth = (DrvAllocation *)fb->mHal.state.depthTarget->mHal.drv;
 
         if (depth->uploadDeferred) {
@@ -48,8 +48,8 @@
     RsdFrameBufferObj *fbo = (RsdFrameBufferObj*)fb->mHal.drv;
     // Now attach color targets
     for (uint32_t i = 0; i < fb->mHal.state.colorTargetsCount; i ++) {
-        DrvAllocation *color = NULL;
-        if (fb->mHal.state.colorTargets[i] != NULL) {
+        DrvAllocation *color = nullptr;
+        if (fb->mHal.state.colorTargets[i] != nullptr) {
             color = (DrvAllocation *)fb->mHal.state.colorTargets[i]->mHal.drv;
 
             if (color->uploadDeferred) {
@@ -63,7 +63,7 @@
 
 bool rsdFrameBufferInit(const Context *rsc, const FBOCache *fb) {
     RsdFrameBufferObj *fbo = new RsdFrameBufferObj();
-    if (fbo == NULL) {
+    if (fbo == nullptr) {
         return false;
     }
     fb->mHal.drv = fbo;
@@ -93,7 +93,7 @@
 void rsdFrameBufferDestroy(const Context *rsc, const FBOCache *fb) {
     RsdFrameBufferObj *fbo = (RsdFrameBufferObj *)fb->mHal.drv;
     delete fbo;
-    fb->mHal.drv = NULL;
+    fb->mHal.drv = nullptr;
 }
 
 
diff --git a/driver/rsdFrameBufferObj.cpp b/driver/rsdFrameBufferObj.cpp
index 7731aff..1e91c98 100644
--- a/driver/rsdFrameBufferObj.cpp
+++ b/driver/rsdFrameBufferObj.cpp
@@ -35,7 +35,7 @@
     for (uint32_t i = 0; i < mColorTargetsCount; i ++) {
         mColorTargets[i] = 0;
     }
-    mDepthTarget = NULL;
+    mDepthTarget = nullptr;
     mDirty = true;
 }
 
@@ -73,7 +73,7 @@
 
 
 void RsdFrameBufferObj::setDepthAttachment() {
-    if (mDepthTarget != NULL) {
+    if (mDepthTarget != nullptr) {
         if (mDepthTarget->textureID) {
             glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT,
                                    GL_TEXTURE_2D, mDepthTarget->textureID, 0);
@@ -91,7 +91,7 @@
 void RsdFrameBufferObj::setColorAttachment() {
     // Now attach color targets
     for (uint32_t i = 0; i < mColorTargetsCount; i ++) {
-        if (mColorTargets[i] != NULL) {
+        if (mColorTargets[i] != nullptr) {
             if (mColorTargets[i]->textureID) {
                 glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i,
                                        GL_TEXTURE_2D, mColorTargets[i]->textureID, 0);
@@ -110,12 +110,12 @@
 }
 
 bool RsdFrameBufferObj::renderToFramebuffer() {
-    if (mDepthTarget != NULL) {
+    if (mDepthTarget != nullptr) {
         return false;
     }
 
     for (uint32_t i = 0; i < mColorTargetsCount; i ++) {
-        if (mColorTargets[i] != NULL) {
+        if (mColorTargets[i] != nullptr) {
             return false;
         }
     }
diff --git a/driver/rsdGL.cpp b/driver/rsdGL.cpp
index b59ee20..e58b0f2 100644
--- a/driver/rsdGL.cpp
+++ b/driver/rsdGL.cpp
@@ -152,7 +152,7 @@
 void rsdGLShutdown(const Context *rsc) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
-    rsdGLSetSurface(rsc, 0, 0, NULL);
+    rsdGLSetSurface(rsc, 0, 0, nullptr);
     dc->gl.shaderCache->cleanupAll();
     delete dc->gl.shaderCache;
     delete dc->gl.vertexArrayState;
@@ -319,7 +319,7 @@
     checkEglError("eglCreateContext");
     if (dc->gl.egl.context == EGL_NO_CONTEXT) {
         ALOGE("%p, eglCreateContext returned EGL_NO_CONTEXT", rsc);
-        rsc->setWatchdogGL(NULL, 0, NULL);
+        rsc->setWatchdogGL(nullptr, 0, nullptr);
         return false;
     }
     gGLContextCount++;
@@ -332,7 +332,7 @@
     if (dc->gl.egl.surfaceDefault == EGL_NO_SURFACE) {
         ALOGE("eglCreatePbufferSurface returned EGL_NO_SURFACE");
         rsdGLShutdown(rsc);
-        rsc->setWatchdogGL(NULL, 0, NULL);
+        rsc->setWatchdogGL(nullptr, 0, nullptr);
         return false;
     }
 
@@ -343,7 +343,7 @@
         ALOGE("eglMakeCurrent returned EGL_FALSE");
         checkEglError("eglMakeCurrent", ret);
         rsdGLShutdown(rsc);
-        rsc->setWatchdogGL(NULL, 0, NULL);
+        rsc->setWatchdogGL(nullptr, 0, nullptr);
         return false;
     }
 
@@ -358,7 +358,7 @@
     //ALOGV("GL Renderer %s", mGL.mRenderer);
     //ALOGV("GL Extensions %s", mGL.mExtensions);
 
-    const char *verptr = NULL;
+    const char *verptr = nullptr;
     if (strlen((const char *)dc->gl.gl.version) > 9) {
         if (!memcmp(dc->gl.gl.version, "OpenGL ES-CM", 12)) {
             verptr = (const char *)dc->gl.gl.version + 12;
@@ -371,7 +371,7 @@
     if (!verptr) {
         ALOGE("Error, OpenGL ES Lite not supported");
         rsdGLShutdown(rsc);
-        rsc->setWatchdogGL(NULL, 0, NULL);
+        rsc->setWatchdogGL(nullptr, 0, nullptr);
         return false;
     } else {
         sscanf(verptr, " %i.%i", &dc->gl.gl.majorVersion, &dc->gl.gl.minorVersion);
@@ -387,14 +387,14 @@
     glGetIntegerv(GL_MAX_TEXTURE_IMAGE_UNITS, &dc->gl.gl.maxFragmentTextureImageUnits);
     glGetIntegerv(GL_MAX_FRAGMENT_UNIFORM_VECTORS, &dc->gl.gl.maxFragmentUniformVectors);
 
-    dc->gl.gl.OES_texture_npot = NULL != strstr((const char *)dc->gl.gl.extensions,
+    dc->gl.gl.OES_texture_npot = nullptr != strstr((const char *)dc->gl.gl.extensions,
                                                 "GL_OES_texture_npot");
-    dc->gl.gl.IMG_texture_npot = NULL != strstr((const char *)dc->gl.gl.extensions,
+    dc->gl.gl.IMG_texture_npot = nullptr != strstr((const char *)dc->gl.gl.extensions,
                                                    "GL_IMG_texture_npot");
-    dc->gl.gl.NV_texture_npot_2D_mipmap = NULL != strstr((const char *)dc->gl.gl.extensions,
+    dc->gl.gl.NV_texture_npot_2D_mipmap = nullptr != strstr((const char *)dc->gl.gl.extensions,
                                                             "GL_NV_texture_npot_2D_mipmap");
     dc->gl.gl.EXT_texture_max_aniso = 1.0f;
-    bool hasAniso = NULL != strstr((const char *)dc->gl.gl.extensions,
+    bool hasAniso = nullptr != strstr((const char *)dc->gl.gl.extensions,
                                    "GL_EXT_texture_filter_anisotropic");
     if (hasAniso) {
         glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &dc->gl.gl.EXT_texture_max_aniso);
@@ -407,11 +407,11 @@
     dc->gl.shaderCache = new RsdShaderCache();
     dc->gl.vertexArrayState = new RsdVertexArrayState();
     dc->gl.vertexArrayState->init(dc->gl.gl.maxVertexAttribs);
-    dc->gl.currentFrameBuffer = NULL;
+    dc->gl.currentFrameBuffer = nullptr;
     dc->mHasGraphics = true;
 
     ALOGV("%p initGLThread end", rsc);
-    rsc->setWatchdogGL(NULL, 0, NULL);
+    rsc->setWatchdogGL(nullptr, 0, nullptr);
     return true;
 }
 
@@ -420,7 +420,7 @@
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
     EGLBoolean ret;
-    if (dc->gl.egl.surface != NULL) {
+    if (dc->gl.egl.surface != nullptr) {
         rsc->setWatchdogGL("eglMakeCurrent", __LINE__, __FILE__);
         ret = eglMakeCurrent(dc->gl.egl.display, dc->gl.egl.surfaceDefault,
                              dc->gl.egl.surfaceDefault, dc->gl.egl.context);
@@ -430,20 +430,20 @@
         ret = eglDestroySurface(dc->gl.egl.display, dc->gl.egl.surface);
         checkEglError("eglDestroySurface", ret);
 
-        dc->gl.egl.surface = NULL;
+        dc->gl.egl.surface = nullptr;
     }
 
-    if (dc->gl.currentWndSurface != NULL) {
-        dc->gl.currentWndSurface->decStrong(NULL);
+    if (dc->gl.currentWndSurface != nullptr) {
+        dc->gl.currentWndSurface->decStrong(nullptr);
     }
 
     dc->gl.currentWndSurface = (ANativeWindow *)sur;
-    if (dc->gl.currentWndSurface != NULL) {
-        dc->gl.currentWndSurface->incStrong(NULL);
+    if (dc->gl.currentWndSurface != nullptr) {
+        dc->gl.currentWndSurface->incStrong(nullptr);
 
         rsc->setWatchdogGL("eglCreateWindowSurface", __LINE__, __FILE__);
         dc->gl.egl.surface = eglCreateWindowSurface(dc->gl.egl.display, dc->gl.egl.config,
-                                                    dc->gl.currentWndSurface, NULL);
+                                                    dc->gl.currentWndSurface, nullptr);
         checkEglError("eglCreateWindowSurface");
         if (dc->gl.egl.surface == EGL_NO_SURFACE) {
             ALOGE("eglCreateWindowSurface returned EGL_NO_SURFACE");
@@ -454,23 +454,23 @@
                              dc->gl.egl.surface, dc->gl.egl.context);
         checkEglError("eglMakeCurrent", ret);
     }
-    rsc->setWatchdogGL(NULL, 0, NULL);
+    rsc->setWatchdogGL(nullptr, 0, nullptr);
     return true;
 }
 
 bool rsdGLSetSurface(const Context *rsc, uint32_t w, uint32_t h, RsNativeWindow sur) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
-    if (dc->gl.wndSurface != NULL) {
-        dc->gl.wndSurface->decStrong(NULL);
-        dc->gl.wndSurface = NULL;
+    if (dc->gl.wndSurface != nullptr) {
+        dc->gl.wndSurface->decStrong(nullptr);
+        dc->gl.wndSurface = nullptr;
     }
     if(w && h) {
         // WAR: Some drivers fail to handle 0 size surfaces correctly. Use the
         // pbuffer to avoid this pitfall.
         dc->gl.wndSurface = (ANativeWindow *)sur;
-        if (dc->gl.wndSurface != NULL) {
-            dc->gl.wndSurface->incStrong(NULL);
+        if (dc->gl.wndSurface != nullptr) {
+            dc->gl.wndSurface->incStrong(nullptr);
         }
     }
 
diff --git a/driver/rsdGL.h b/driver/rsdGL.h
index 419c317..48c4c4e 100644
--- a/driver/rsdGL.h
+++ b/driver/rsdGL.h
@@ -20,7 +20,7 @@
 #include <rs_hal.h>
 #include <EGL/egl.h>
 
-#define RSD_CALL_GL(x, ...) rsc->setWatchdogGL(#x, __LINE__, __FILE__); x(__VA_ARGS__); rsc->setWatchdogGL(NULL, 0, NULL)
+#define RSD_CALL_GL(x, ...) rsc->setWatchdogGL(#x, __LINE__, __FILE__); x(__VA_ARGS__); rsc->setWatchdogGL(nullptr, 0, nullptr)
 
 class RsdShaderCache;
 class RsdVertexArrayState;
diff --git a/driver/rsdMesh.cpp b/driver/rsdMesh.cpp
index 50daf3e..01d2140 100644
--- a/driver/rsdMesh.cpp
+++ b/driver/rsdMesh.cpp
@@ -28,7 +28,7 @@
 using namespace android::renderscript;
 
 bool rsdMeshInit(const Context *rsc, const Mesh *m) {
-    RsdMeshObj *drv = NULL;
+    RsdMeshObj *drv = nullptr;
     if(m->mHal.drv) {
         drv = (RsdMeshObj*)m->mHal.drv;
         delete drv;
diff --git a/driver/rsdMeshObj.cpp b/driver/rsdMeshObj.cpp
index 66c3b18..ac1780c 100644
--- a/driver/rsdMeshObj.cpp
+++ b/driver/rsdMeshObj.cpp
@@ -32,9 +32,9 @@
 RsdMeshObj::RsdMeshObj(const Context *rsc, const Mesh *rsMesh) {
     mRSMesh = rsMesh;
 
-    mAttribs = NULL;
-    mAttribAllocationIndex = NULL;
-    mGLPrimitives = NULL;
+    mAttribs = nullptr;
+    mAttribAllocationIndex = nullptr;
+    mGLPrimitives = nullptr;
 
     mAttribCount = 0;
 }
@@ -86,8 +86,8 @@
     if (mAttribs) {
         delete [] mAttribs;
         delete [] mAttribAllocationIndex;
-        mAttribs = NULL;
-        mAttribAllocationIndex = NULL;
+        mAttribs = nullptr;
+        mAttribAllocationIndex = nullptr;
     }
     if (!mAttribCount) {
         return false;
@@ -112,9 +112,9 @@
             mAttribs[userNum].type = rsdTypeToGLType(f->mHal.state.dataType);
             mAttribs[userNum].normalized = f->mHal.state.dataType != RS_TYPE_FLOAT_32;
             mAttribs[userNum].stride = stride;
-            String8 tmp(RS_SHADER_ATTR);
+            std::string tmp(RS_SHADER_ATTR);
             tmp.append(elem->mHal.state.fieldNames[fieldI]);
-            mAttribs[userNum].name.setTo(tmp.string());
+            mAttribs[userNum].name = tmp.c_str();
 
             // Remember which allocation this attribute came from
             mAttribAllocationIndex[userNum] = ct;
@@ -148,7 +148,7 @@
 
         if (drvAlloc->bufferID) {
             mAttribs[ct].buffer = drvAlloc->bufferID;
-            mAttribs[ct].ptr = NULL;
+            mAttribs[ct].ptr = nullptr;
         } else {
             mAttribs[ct].buffer = 0;
             mAttribs[ct].ptr = (const uint8_t*)alloc->mHal.drvState.lod[0].mallocPtr;
diff --git a/driver/rsdPath.cpp b/driver/rsdPath.cpp
index ee5e3ad..1a9fa21 100644
--- a/driver/rsdPath.cpp
+++ b/driver/rsdPath.cpp
@@ -71,13 +71,13 @@
 
 bool rsdPathInitStatic(const Context *rsc, const Path *m,
                        const Allocation *vtx, const Allocation *loops) {
-    DrvPathStatic *drv = NULL;
+    DrvPathStatic *drv = nullptr;
     cleanup(rsc, m);
 
     DrvPathStatic *dps = new DrvPathStatic(vtx, loops);
     //LOGE("init path m %p,  %p", m, dps);
     m->mHal.drv = dps;
-    return dps != NULL;
+    return dps != nullptr;
 }
 
 bool rsdPathInitDynamic(const Context *rsc, const Path *m) {
@@ -97,7 +97,7 @@
 
 void rsdPathDestroy(const Context *rsc, const Path *m) {
     cleanup(rsc, m);
-    m->mHal.drv = NULL;
+    m->mHal.drv = nullptr;
 }
 
 
diff --git a/driver/rsdProgram.cpp b/driver/rsdProgram.cpp
index 132f7be..f2b5452 100644
--- a/driver/rsdProgram.cpp
+++ b/driver/rsdProgram.cpp
@@ -67,7 +67,7 @@
 void rsdProgramVertexDestroy(const Context *rsc, const ProgramVertex *pv) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
-    RsdShader *drv = NULL;
+    RsdShader *drv = nullptr;
     if(pv->mHal.drv) {
         drv = (RsdShader*)pv->mHal.drv;
         if (rsc->props.mLogShaders) {
@@ -101,7 +101,7 @@
 void rsdProgramFragmentDestroy(const Context *rsc, const ProgramFragment *pf) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
-    RsdShader *drv = NULL;
+    RsdShader *drv = nullptr;
     if(pf->mHal.drv) {
         drv = (RsdShader*)pf->mHal.drv;
         if (rsc->props.mLogShaders) {
diff --git a/driver/rsdProgramStore.cpp b/driver/rsdProgramStore.cpp
index c1295e8..79d2fdd 100644
--- a/driver/rsdProgramStore.cpp
+++ b/driver/rsdProgramStore.cpp
@@ -39,7 +39,7 @@
 
 bool rsdProgramStoreInit(const Context *rsc, const ProgramStore *ps) {
     DrvProgramStore *drv = (DrvProgramStore *)calloc(1, sizeof(DrvProgramStore));
-    if (drv == NULL) {
+    if (drv == nullptr) {
         return false;
     }
 
@@ -149,7 +149,7 @@
 
 error:
     free(drv);
-    ps->mHal.drv = NULL;
+    ps->mHal.drv = nullptr;
     return false;
 }
 
@@ -198,7 +198,7 @@
 
 void rsdProgramStoreDestroy(const Context *rsc, const ProgramStore *ps) {
     free(ps->mHal.drv);
-    ps->mHal.drv = NULL;
+    ps->mHal.drv = nullptr;
 }
 
 
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index 7456d2f..4dc1801 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -562,7 +562,8 @@
                             android::renderscript::rs_allocation in,
                             android::renderscript::rs_allocation out) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, NULL, 0, NULL);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               nullptr, 0, nullptr);
 }
 #else
 static void SC_ForEach_SAA(android::renderscript::rs_script *target,
@@ -579,7 +580,8 @@
                             android::renderscript::rs_allocation out,
                             const void *usr) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, usr, 0, NULL);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               usr, 0, nullptr);
 }
 #else
 static void SC_ForEach_SAAU(android::renderscript::rs_script *target,
@@ -618,7 +620,8 @@
                              const void *usr,
                              uint32_t usrLen) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, usr, usrLen, NULL);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               usr, usrLen, nullptr);
 }
 #else
 static void SC_ForEach_SAAUL(android::renderscript::rs_script *target,
@@ -703,7 +706,7 @@
 
 static uint32_t SC_ToClient(int cmdID) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    return rsrToClient(rsc, cmdID, (const void *)NULL, 0);
+    return rsrToClient(rsc, cmdID, (const void *)nullptr, 0);
 }
 
 static uint32_t SC_ToClientBlocking2(int cmdID, const void *data, uint32_t len) {
@@ -713,7 +716,7 @@
 
 static uint32_t SC_ToClientBlocking(int cmdID) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    return rsrToClientBlocking(rsc, cmdID, (const void *)NULL, 0);
+    return rsrToClientBlocking(rsc, cmdID, (const void *)nullptr, 0);
 }
 
 
@@ -726,20 +729,20 @@
     if (x >= t->getLODDimX(0)) {
         sprintf(buf, "Out range ElementAt X %i of %i", x, t->getLODDimX(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (vecSize > 0) {
         if (vecSize != e->getVectorSize()) {
             sprintf(buf, "Vector size mismatch for ElementAt %i of %i", vecSize, e->getVectorSize());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
 
         if (dt != e->getType()) {
             sprintf(buf, "Data type mismatch for ElementAt %i of %i", dt, e->getType());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
     }
 
@@ -757,26 +760,26 @@
     if (x >= t->getLODDimX(0)) {
         sprintf(buf, "Out range ElementAt X %i of %i", x, t->getLODDimX(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (y >= t->getLODDimY(0)) {
         sprintf(buf, "Out range ElementAt Y %i of %i", y, t->getLODDimY(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (vecSize > 0) {
         if (vecSize != e->getVectorSize()) {
             sprintf(buf, "Vector size mismatch for ElementAt %i of %i", vecSize, e->getVectorSize());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
 
         if (dt != e->getType()) {
             sprintf(buf, "Data type mismatch for ElementAt %i of %i", dt, e->getType());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
     }
 
@@ -795,32 +798,32 @@
     if (x >= t->getLODDimX(0)) {
         sprintf(buf, "Out range ElementAt X %i of %i", x, t->getLODDimX(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (y >= t->getLODDimY(0)) {
         sprintf(buf, "Out range ElementAt Y %i of %i", y, t->getLODDimY(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (z >= t->getLODDimZ(0)) {
         sprintf(buf, "Out range ElementAt Z %i of %i", z, t->getLODDimZ(0));
         rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-        return NULL;
+        return nullptr;
     }
 
     if (vecSize > 0) {
         if (vecSize != e->getVectorSize()) {
             sprintf(buf, "Vector size mismatch for ElementAt %i of %i", vecSize, e->getVectorSize());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
 
         if (dt != e->getType()) {
             sprintf(buf, "Data type mismatch for ElementAt %i of %i", dt, e->getType());
             rsc->setError(RS_ERROR_FATAL_DEBUG, buf);
-            return NULL;
+            return nullptr;
         }
     }
 
@@ -844,7 +847,7 @@
     const Type *t = ((Allocation*)a.p)->getType();
     const Element *e = t->getElement();
     void *tmp = ElementAt1D((Allocation*)a.p, RS_TYPE_UNSIGNED_8, 0, x);
-    if (tmp != NULL) {
+    if (tmp != nullptr) {
         memcpy(tmp, ptr, e->getSizeBytes());
     }
 }
@@ -852,7 +855,7 @@
     const Type *t = ((Allocation*)a.p)->getType();
     const Element *e = t->getElement();
     void *tmp = ElementAt2D((Allocation*)a.p, RS_TYPE_UNSIGNED_8, 0, x, y);
-    if (tmp != NULL) {
+    if (tmp != nullptr) {
         memcpy(tmp, ptr, e->getSizeBytes());
     }
 }
@@ -860,41 +863,41 @@
     const Type *t = ((Allocation*)a.p)->getType();
     const Element *e = t->getElement();
     void *tmp = ElementAt3D((Allocation*)a.p, RS_TYPE_UNSIGNED_8, 0, x, y, z);
-    if (tmp != NULL) {
+    if (tmp != nullptr) {
         memcpy(tmp, ptr, e->getSizeBytes());
     }
 }
 
 #define ELEMENT_AT(T, DT, VS)                                               \
     static void SC_SetElementAt1_##T(android::renderscript::rs_allocation a, const T *val, uint32_t x) { \
-        void *r = ElementAt1D((Allocation*)a.p, DT, VS, x);               \
-        if (r != NULL) ((T *)r)[0] = *val;                               \
+        void *r = ElementAt1D((Allocation*)a.p, DT, VS, x);             \
+        if (r != nullptr) ((T *)r)[0] = *val;                           \
         else ALOGE("Error from %s", __PRETTY_FUNCTION__);               \
     }                                                                   \
     static void SC_SetElementAt2_##T(android::renderscript::rs_allocation a, const T * val, uint32_t x, uint32_t y) { \
-        void *r = ElementAt2D((Allocation*)a.p, DT, VS, x, y);            \
-        if (r != NULL) ((T *)r)[0] = *val;                               \
+        void *r = ElementAt2D((Allocation*)a.p, DT, VS, x, y);          \
+        if (r != nullptr) ((T *)r)[0] = *val;                           \
         else ALOGE("Error from %s", __PRETTY_FUNCTION__);               \
     }                                                                   \
     static void SC_SetElementAt3_##T(android::renderscript::rs_allocation a, const T * val, uint32_t x, uint32_t y, uint32_t z) { \
-        void *r = ElementAt3D((Allocation*)a.p, DT, VS, x, y, z);         \
-        if (r != NULL) ((T *)r)[0] = *val;                               \
+        void *r = ElementAt3D((Allocation*)a.p, DT, VS, x, y, z);       \
+        if (r != nullptr) ((T *)r)[0] = *val;                           \
         else ALOGE("Error from %s", __PRETTY_FUNCTION__);               \
     }                                                                   \
     static void SC_GetElementAt1_##T(android::renderscript::rs_allocation a, T *val, uint32_t x) {                  \
-        void *r = ElementAt1D((Allocation*)a.p, DT, VS, x);               \
-        if (r != NULL) *val = ((T *)r)[0];                              \
-        else ALOGE("Error from %s", __PRETTY_FUNCTION__);                    \
+        void *r = ElementAt1D((Allocation*)a.p, DT, VS, x);             \
+        if (r != nullptr) *val = ((T *)r)[0];                           \
+        else ALOGE("Error from %s", __PRETTY_FUNCTION__);               \
     }                                                                   \
     static void SC_GetElementAt2_##T(android::renderscript::rs_allocation a, T *val, uint32_t x, uint32_t y) {      \
-        void *r = ElementAt2D((Allocation*)a.p, DT, VS, x, y);            \
-        if (r != NULL) *val = ((T *)r)[0];                              \
-        else ALOGE("Error from %s", __PRETTY_FUNCTION__);                    \
-    }                                                                   \
+        void *r = ElementAt2D((Allocation*)a.p, DT, VS, x, y);           \
+        if (r != nullptr) *val = ((T *)r)[0];                            \
+        else ALOGE("Error from %s", __PRETTY_FUNCTION__);                \
+    }                                                                    \
     static void SC_GetElementAt3_##T(android::renderscript::rs_allocation a, T *val, uint32_t x, uint32_t y, uint32_t z) { \
-        void *r = ElementAt3D((Allocation*)a.p, DT, VS, x, y, z);         \
-        if (r != NULL) *val = ((T *)r)[0];                              \
-        else ALOGE("Error from %s", __PRETTY_FUNCTION__);                    \
+        void *r = ElementAt3D((Allocation*)a.p, DT, VS, x, y, z);        \
+        if (r != nullptr) *val = ((T *)r)[0];                            \
+        else ALOGE("Error from %s", __PRETTY_FUNCTION__);                \
     }
 
 ELEMENT_AT(char, RS_TYPE_SIGNED_8, 1)
@@ -1397,7 +1400,7 @@
     { "_Z9rsgFinishv", (void *)&SC_Finish, false },
 #endif
 
-    { NULL, NULL, false }
+    { nullptr, nullptr, false }
 };
 
 #ifdef RS_COMPATIBILITY_LIB
@@ -1408,7 +1411,7 @@
 
 #define IS_CLEAR_SET_OBJ(t) \
     bool rsIsObject(t src) { \
-        return src.p != NULL; \
+        return src.p != nullptr; \
     } \
     void __attribute__((overloadable)) rsClearObject(t *dst) { \
         return SC_ClearObject(reinterpret_cast<rs_object_base *>(dst)); \
@@ -1428,7 +1431,8 @@
                            ::rs_allocation in,
                            ::rs_allocation out) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, NULL, 0, NULL);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               nullptr, 0, nullptr);
 }
 
 static void SC_ForEach_SAAUS(::rs_script target,
@@ -1437,7 +1441,8 @@
                              const void *usr,
                              const RsScriptCall *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, usr, 0, call);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               usr, 0, call);
 }
 
 static void SC_ForEach_SAAUL(::rs_script target,
@@ -1446,7 +1451,8 @@
                              const void *usr,
                              uint32_t usrLen) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, usr, usrLen, NULL);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               usr, usrLen, nullptr);
 }
 
 static void SC_ForEach_SAAULS(::rs_script target,
@@ -1456,7 +1462,8 @@
                               uint32_t usrLen,
                               const RsScriptCall *call) {
     Context *rsc = RsdCpuReference::getTlsContext();
-    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p, usr, usrLen, call);
+    rsrForEach(rsc, (Script*)target.p, (Allocation*)in.p, (Allocation*)out.p,
+               usr, usrLen, call);
 }
 
 static const Allocation * SC_GetAllocation(const void *ptr) {
@@ -1933,7 +1940,7 @@
 extern const RsdCpuReference::CpuSymbol * rsdLookupRuntimeStub(Context * pContext, char const* name) {
     ScriptC *s = (ScriptC *)pContext;
     const RsdCpuReference::CpuSymbol *syms = gSyms;
-    const RsdCpuReference::CpuSymbol *sym = NULL;
+    const RsdCpuReference::CpuSymbol *sym = nullptr;
 
     if (!sym) {
         while (syms->fnPtr) {
@@ -1944,5 +1951,5 @@
         }
     }
 
-    return NULL;
+    return nullptr;
 }
diff --git a/driver/rsdSampler.cpp b/driver/rsdSampler.cpp
index 1a72fd4..28b594f 100644
--- a/driver/rsdSampler.cpp
+++ b/driver/rsdSampler.cpp
@@ -47,13 +47,13 @@
 {
     obj->p = alloc;
 #ifdef __LP64__
-    obj->r = NULL;
-    if (alloc != NULL) {
+    obj->r = nullptr;
+    if (alloc != nullptr) {
         obj->v1 = alloc->mHal.drv;
     } else {
-        obj->v1 = NULL;
+        obj->v1 = nullptr;
     }
-    obj->v2 = NULL;
+    obj->v2 = nullptr;
 #endif
 }
 
diff --git a/driver/rsdScriptGroup.cpp b/driver/rsdScriptGroup.cpp
index 6162c11..a7b2e77 100644
--- a/driver/rsdScriptGroup.cpp
+++ b/driver/rsdScriptGroup.cpp
@@ -32,7 +32,7 @@
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
     sg->mHal.drv = dc->mCpuRef->createScriptGroup(sg);
-    return sg->mHal.drv != NULL;
+    return sg->mHal.drv != nullptr;
 }
 
 void rsdScriptGroupSetInput(const Context *rsc, const ScriptGroup *sg,
@@ -59,13 +59,13 @@
 {
     obj->p = sg;
 #ifdef __LP64__
-    obj->r = NULL;
-    if (sg != NULL) {
+    obj->r = nullptr;
+    if (sg != nullptr) {
         obj->v1 = sg->mHal.drv;
     } else {
-        obj->v1 = NULL;
+        obj->v1 = nullptr;
     }
-    obj->v2 = NULL;
+    obj->v2 = nullptr;
 #endif
 }
 
diff --git a/driver/rsdShader.cpp b/driver/rsdShader.cpp
index 0b182ff..7511883 100644
--- a/driver/rsdShader.cpp
+++ b/driver/rsdShader.cpp
@@ -33,7 +33,7 @@
                      const char * shaderText, size_t shaderLength,
                      const char** textureNames, size_t textureNamesCount,
                      const size_t *textureNamesLength) {
-    mUserShader.setTo(shaderText, shaderLength);
+    mUserShader.replace(0, shaderLength, shaderText);
     mRSProgram = p;
     mType = type;
     initMemberVars();
@@ -41,13 +41,14 @@
     init(textureNames, textureNamesCount, textureNamesLength);
 
     for(size_t i=0; i < textureNamesCount; i++) {
-        mTextureNames.push(String8(textureNames[i], textureNamesLength[i]));
+        mTextureNames.push_back(std::string(textureNames[i],
+                                            textureNamesLength[i]));
     }
 }
 
 RsdShader::~RsdShader() {
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        StateBasedKey *state = mStateBasedShaders.itemAt(i);
+        StateBasedKey *state = mStateBasedShaders[i];
         if (state->mShaderID) {
             glDeleteShader(state->mShaderID);
         }
@@ -64,19 +65,19 @@
     mAttribCount = 0;
     mUniformCount = 0;
 
-    mAttribNames = NULL;
-    mUniformNames = NULL;
-    mUniformArraySizes = NULL;
-    mCurrentState = NULL;
+    mAttribNames = nullptr;
+    mUniformNames = nullptr;
+    mUniformArraySizes = nullptr;
+    mCurrentState = nullptr;
 
     mIsValid = false;
 }
 
 RsdShader::StateBasedKey *RsdShader::getExistingState() {
-    RsdShader::StateBasedKey *returnKey = NULL;
+    RsdShader::StateBasedKey *returnKey = nullptr;
 
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        returnKey = mStateBasedShaders.itemAt(i);
+        returnKey = mStateBasedShaders[i];
 
         for (uint32_t ct = 0; ct < mRSProgram->mHal.state.texturesCount; ct ++) {
             uint32_t texType = 0;
@@ -91,7 +92,7 @@
                 texType = GL_TEXTURE_CUBE_MAP;
             }
             if (texType != returnKey->mTextureTargets[ct]) {
-                returnKey = NULL;
+                returnKey = nullptr;
                 break;
             }
         }
@@ -101,14 +102,14 @@
 
 uint32_t RsdShader::getStateBasedShaderID(const Context *rsc) {
     StateBasedKey *state = getExistingState();
-    if (state != NULL) {
+    if (state != nullptr) {
         mCurrentState = state;
         return mCurrentState->mShaderID;
     }
     // We have not created a shader for this particular state yet
     state = new StateBasedKey(mTextureCount);
     mCurrentState = state;
-    mStateBasedShaders.add(state);
+    mStateBasedShaders.push_back(state);
     createShader();
     loadShader(rsc);
     return mCurrentState->mShaderID;
@@ -120,7 +121,7 @@
     uint32_t uniformCount = 0;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.inputElementsCount; ct++) {
         initAddUserElement(mRSProgram->mHal.state.inputElements[ct], mAttribNames,
-                           NULL, &attribCount, RS_SHADER_ATTR);
+                           nullptr, &attribCount, RS_SHADER_ATTR);
     }
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.constantsCount; ct++) {
         initAddUserElement(mRSProgram->mHal.state.constantTypes[ct]->getElement(),
@@ -129,15 +130,15 @@
 
     mTextureUniformIndexStart = uniformCount;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.texturesCount; ct++) {
-        mUniformNames[uniformCount].setTo("UNI_");
+        mUniformNames[uniformCount] = "UNI_";
         mUniformNames[uniformCount].append(textureNames[ct], textureNamesLength[ct]);
         mUniformArraySizes[uniformCount] = 1;
         uniformCount++;
     }
 }
 
-String8 RsdShader::getGLSLInputString() const {
-    String8 s;
+std::string RsdShader::getGLSLInputString() const {
+    std::string s;
     for (uint32_t ct=0; ct < mRSProgram->mHal.state.inputElementsCount; ct++) {
         const Element *e = mRSProgram->mHal.state.inputElements[ct];
         for (uint32_t field=0; field < e->mHal.state.fieldsCount; field++) {
@@ -237,12 +238,12 @@
 
     if (rsc->props.mLogShaders) {
         ALOGV("Loading shader type %x, ID %i", mType, mCurrentState->mShaderID);
-        ALOGV("%s", mShader.string());
+        ALOGV("%s", mShader.c_str());
     }
 
     if (mCurrentState->mShaderID) {
-        const char * ss = mShader.string();
-        RSD_CALL_GL(glShaderSource, mCurrentState->mShaderID, 1, &ss, NULL);
+        const char * ss = mShader.c_str();
+        RSD_CALL_GL(glShaderSource, mCurrentState->mShaderID, 1, &ss, nullptr);
         RSD_CALL_GL(glCompileShader, mCurrentState->mShaderID);
 
         GLint compiled = 0;
@@ -253,7 +254,7 @@
             if (infoLen) {
                 char* buf = (char*) malloc(infoLen);
                 if (buf) {
-                    RSD_CALL_GL(glGetShaderInfoLog, mCurrentState->mShaderID, infoLen, NULL, buf);
+                    RSD_CALL_GL(glGetShaderInfoLog, mCurrentState->mShaderID, infoLen, nullptr, buf);
                     rsc->setError(RS_ERROR_FATAL_PROGRAM_LINK, buf);
                     free(buf);
                 }
@@ -299,7 +300,9 @@
 
             mShader.append(fn);
             if (e->mHal.state.fieldArraySizes[field] > 1) {
-                mShader.appendFormat("[%d]", e->mHal.state.fieldArraySizes[field]);
+                mShader += "[";
+                mShader += std::to_string(e->mHal.state.fieldArraySizes[field]);
+                mShader += "]";
             }
             mShader.append(";\n");
         }
@@ -585,27 +588,28 @@
     mUniformCount += mRSProgram->mHal.state.texturesCount;
 
     if (mAttribCount) {
-        mAttribNames = new String8[mAttribCount];
+        mAttribNames = new std::string[mAttribCount];
     }
     if (mUniformCount) {
-        mUniformNames = new String8[mUniformCount];
+        mUniformNames = new std::string[mUniformCount];
         mUniformArraySizes = new uint32_t[mUniformCount];
     }
 
     mTextureCount = mRSProgram->mHal.state.texturesCount;
 }
 
-void RsdShader::initAddUserElement(const Element *e, String8 *names, uint32_t *arrayLengths,
-                                   uint32_t *count, const char *prefix) {
+void RsdShader::initAddUserElement(const Element *e, std::string *names,
+                                   uint32_t *arrayLengths, uint32_t *count,
+                                   const char *prefix) {
     rsAssert(e->mHal.state.fieldsCount);
     for (uint32_t ct=0; ct < e->mHal.state.fieldsCount; ct++) {
         const Element *ce = e->mHal.state.fields[ct];
         if (ce->mHal.state.fieldsCount) {
             initAddUserElement(ce, names, arrayLengths, count, prefix);
         } else {
-            String8 tmp(prefix);
+            std::string tmp(prefix);
             tmp.append(e->mHal.state.fieldNames[ct]);
-            names[*count].setTo(tmp.string());
+            names[*count] = tmp;
             if (arrayLengths) {
                 arrayLengths[*count] = e->mHal.state.fieldArraySizes[ct];
             }
diff --git a/driver/rsdShader.h b/driver/rsdShader.h
index fba1790..0dc5102 100644
--- a/driver/rsdShader.h
+++ b/driver/rsdShader.h
@@ -17,7 +17,7 @@
 #ifndef ANDROID_RSD_SHADER_H
 #define ANDROID_RSD_SHADER_H
 
-#include <utils/String8.h>
+#include <string>
 
 // ---------------------------------------------------------------------------
 namespace android {
@@ -49,16 +49,16 @@
     // Add ability to get all ID's to clean up the cached program objects
     uint32_t getStateBasedIDCount() const { return mStateBasedShaders.size(); }
     uint32_t getStateBasedID(uint32_t index) const {
-        return mStateBasedShaders.itemAt(index)->mShaderID;
+        return mStateBasedShaders[index]->mShaderID;
     }
 
     uint32_t getAttribCount() const {return mAttribCount;}
     uint32_t getUniformCount() const {return mUniformCount;}
-    const android::String8 & getAttribName(uint32_t i) const {return mAttribNames[i];}
-    const android::String8 & getUniformName(uint32_t i) const {return mUniformNames[i];}
+    const std::string & getAttribName(uint32_t i) const {return mAttribNames[i];}
+    const std::string & getUniformName(uint32_t i) const {return mUniformNames[i];}
     uint32_t getUniformArraySize(uint32_t i) const {return mUniformArraySizes[i];}
 
-    android::String8 getGLSLInputString() const;
+    std::string getGLSLInputString() const;
 
     bool isValid() const {return mIsValid;}
     void forceDirty() const {mDirty = true;}
@@ -91,7 +91,7 @@
     void setupUserConstants(const android::renderscript::Context *rsc,
                             RsdShaderCache *sc, bool isFragment);
     void initAddUserElement(const android::renderscript::Element *e,
-                            android::String8 *names, uint32_t *arrayLengths,
+                            std::string *names, uint32_t *arrayLengths,
                             uint32_t *count, const char *prefix);
     void setupTextures(const android::renderscript::Context *rsc, RsdShaderCache *sc);
     void setupSampler(const android::renderscript::Context *rsc,
@@ -104,21 +104,21 @@
     void initAttribAndUniformArray();
 
     mutable bool mDirty;
-    android::String8 mShader;
-    android::String8 mUserShader;
+    std::string mShader;
+    std::string mUserShader;
     uint32_t mType;
 
     uint32_t mTextureCount;
     StateBasedKey *mCurrentState;
     uint32_t mAttribCount;
     uint32_t mUniformCount;
-    android::String8 *mAttribNames;
-    android::String8 *mUniformNames;
+    std::string *mAttribNames;
+    std::string *mUniformNames;
     uint32_t *mUniformArraySizes;
 
-    android::Vector<android::String8> mTextureNames;
+    std::vector<std::string> mTextureNames;
 
-    android::Vector<StateBasedKey*> mStateBasedShaders;
+    std::vector<StateBasedKey*> mStateBasedShaders;
 
     int32_t mTextureUniformIndexStart;
 
@@ -133,7 +133,3 @@
 };
 
 #endif //ANDROID_RSD_SHADER_H
-
-
-
-
diff --git a/driver/rsdShaderCache.cpp b/driver/rsdShaderCache.cpp
index 69b43fc..608922c 100644
--- a/driver/rsdShaderCache.cpp
+++ b/driver/rsdShaderCache.cpp
@@ -29,7 +29,7 @@
 
 
 RsdShaderCache::RsdShaderCache() {
-    mEntries.setCapacity(16);
+    mEntries.reserve(16);
     mVertexDirty = true;
     mFragmentDirty = true;
 }
@@ -38,9 +38,13 @@
     cleanupAll();
 }
 
-void RsdShaderCache::updateUniformArrayData(const Context *rsc, RsdShader *prog, uint32_t linkedID,
-                                         UniformData *data, const char* logTag,
-                                         UniformQueryData **uniformList, uint32_t uniListSize) {
+void RsdShaderCache::updateUniformArrayData(const Context *rsc,
+                                            RsdShader *prog,
+                                            uint32_t linkedID,
+                                            UniformData *data,
+                                            const char* logTag,
+                                            UniformQueryData **uniformList,
+                                            uint32_t uniListSize) {
 
     for (uint32_t ct=0; ct < prog->getUniformCount(); ct++) {
         if (data[ct].slot >= 0 && data[ct].arraySize > 1) {
@@ -55,14 +59,17 @@
 
         if (rsc->props.mLogShaders) {
              ALOGV("%s U, %s = %d, arraySize = %d\n", logTag,
-                  prog->getUniformName(ct).string(), data[ct].slot, data[ct].arraySize);
+                   prog->getUniformName(ct).c_str(), data[ct].slot,
+                   data[ct].arraySize);
         }
     }
 }
 
-void RsdShaderCache::populateUniformData(RsdShader *prog, uint32_t linkedID, UniformData *data) {
+void RsdShaderCache::populateUniformData(RsdShader *prog, uint32_t linkedID,
+                                         UniformData *data) {
     for (uint32_t ct=0; ct < prog->getUniformCount(); ct++) {
-       data[ct].slot = glGetUniformLocation(linkedID, prog->getUniformName(ct));
+       data[ct].slot = glGetUniformLocation(linkedID,
+                                            prog->getUniformName(ct).c_str());
        data[ct].arraySize = prog->getUniformArraySize(ct);
     }
 }
@@ -132,7 +139,7 @@
     ProgramEntry *e = new ProgramEntry(vtx->getAttribCount(),
                                        vtx->getUniformCount(),
                                        frag->getUniformCount());
-    mEntries.push(e);
+    mEntries.push_back(e);
     mCurrent = e;
     e->vtx = vID;
     e->frag = fID;
@@ -159,7 +166,7 @@
             if (bufLength) {
                 char* buf = (char*) malloc(bufLength);
                 if (buf) {
-                    glGetProgramInfoLog(pgm, bufLength, NULL, buf);
+                    glGetProgramInfoLog(pgm, bufLength, nullptr, buf);
                     rsc->setError(RS_ERROR_FATAL_PROGRAM_LINK, buf);
                     free(buf);
                 }
@@ -169,10 +176,12 @@
         }
 
         for (uint32_t ct=0; ct < e->vtxAttrCount; ct++) {
-            e->vtxAttrs[ct].slot = glGetAttribLocation(pgm, vtx->getAttribName(ct));
-            e->vtxAttrs[ct].name = vtx->getAttribName(ct).string();
+            e->vtxAttrs[ct].slot =
+                glGetAttribLocation(pgm, vtx->getAttribName(ct).c_str());
+            e->vtxAttrs[ct].name = vtx->getAttribName(ct).c_str();
             if (rsc->props.mLogShaders) {
-                ALOGV("vtx A %i, %s = %d\n", ct, vtx->getAttribName(ct).string(), e->vtxAttrs[ct].slot);
+                ALOGV("vtx A %i, %s = %d\n", ct,
+                      vtx->getAttribName(ct).c_str(), e->vtxAttrs[ct].slot);
             }
         }
 
@@ -180,7 +189,7 @@
         populateUniformData(frag, pgm, e->fragUniforms);
 
         // Only populate this list if we have arrays in our uniforms
-        UniformQueryData **uniformList = NULL;
+        UniformQueryData **uniformList = nullptr;
         GLint numUniforms = 0;
         bool hasArrays = hasArrayUniforms(vtx, frag);
         if (hasArrays) {
@@ -212,12 +221,12 @@
                                uniformList, (uint32_t)numUniforms);
 
         // Clean up the uniform data from GL
-        if (uniformList != NULL) {
+        if (uniformList != nullptr) {
             for (uint32_t ct = 0; ct < (uint32_t)numUniforms; ct++) {
                 delete uniformList[ct];
             }
             delete[] uniformList;
-            uniformList = NULL;
+            uniformList = nullptr;
         }
     }
 
@@ -228,7 +237,7 @@
     return true;
 }
 
-int32_t RsdShaderCache::vtxAttribSlot(const String8 &attrName) const {
+int32_t RsdShaderCache::vtxAttribSlot(const std::string &attrName) const {
     for (uint32_t ct=0; ct < mCurrent->vtxAttrCount; ct++) {
         if (attrName == mCurrent->vtxAttrs[ct].name) {
             return mCurrent->vtxAttrs[ct].slot;
@@ -238,46 +247,45 @@
 }
 
 void RsdShaderCache::cleanupVertex(RsdShader *s) {
-    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
-        for (int32_t ct = 0; ct < numEntries; ct ++) {
-            if (mEntries[ct]->vtx == id) {
-                glDeleteProgram(mEntries[ct]->program);
 
-                delete mEntries[ct];
-                mEntries.removeAt(ct);
-                numEntries = (int32_t)mEntries.size();
-                ct --;
+        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
+            if ((*entry)->vtx == id) {
+                glDeleteProgram((*entry)->program);
+
+                delete *entry;
+                entry = mEntries.erase(entry);
+            } else {
+                entry++;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupFragment(RsdShader *s) {
-    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
-        for (int32_t ct = 0; ct < numEntries; ct ++) {
-            if (mEntries[ct]->frag == id) {
-                glDeleteProgram(mEntries[ct]->program);
 
-                delete mEntries[ct];
-                mEntries.removeAt(ct);
-                numEntries = (int32_t)mEntries.size();
-                ct --;
+        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
+            if ((*entry)->frag == id) {
+                glDeleteProgram((*entry)->program);
+
+                delete *entry;
+                entry = mEntries.erase(entry);
+            } else {
+                entry++;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupAll() {
-    for (uint32_t ct=0; ct < mEntries.size(); ct++) {
-        glDeleteProgram(mEntries[ct]->program);
-        free(mEntries[ct]);
+    for (auto entry : mEntries) {
+        glDeleteProgram(entry->program);
+        delete entry;
     }
     mEntries.clear();
 }
-
diff --git a/driver/rsdShaderCache.h b/driver/rsdShaderCache.h
index 6de1d63..de195e6 100644
--- a/driver/rsdShaderCache.h
+++ b/driver/rsdShaderCache.h
@@ -17,6 +17,9 @@
 #ifndef ANDROID_RSD_SHADER_CACHE_H
 #define ANDROID_RSD_SHADER_CACHE_H
 
+#include <string>
+#include <vector>
+
 namespace android {
 namespace renderscript {
 
@@ -25,10 +28,7 @@
 }
 }
 
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-#include <utils/String8.h>
-#include <utils/Vector.h>
-#else
+#if defined(RS_SERVER) || defined(RS_COMPATIBILITY_LIB)
 #include "rsUtils.h"
 #endif
 class RsdShader;
@@ -58,7 +58,7 @@
 
     void cleanupAll();
 
-    int32_t vtxAttribSlot(const android::String8 &attrName) const;
+    int32_t vtxAttribSlot(const std::string &attrName) const;
     int32_t vtxUniformSlot(uint32_t a) const {return mCurrent->vtxUniforms[a].slot;}
     uint32_t vtxUniformSize(uint32_t a) const {return mCurrent->vtxUniforms[a].arraySize;}
     int32_t fragUniformSlot(uint32_t a) const {return mCurrent->fragUniforms[a].slot;}
@@ -78,16 +78,16 @@
         int32_t arraySize;
         uint32_t type;
         UniformQueryData(uint32_t maxName) {
-            name = NULL;
+            name = nullptr;
             nameLength = maxName;
             if (nameLength > 0 ) {
                 name = new char[nameLength];
             }
         }
         ~UniformQueryData() {
-            if (name != NULL) {
+            if (name != nullptr) {
                 delete[] name;
-                name = NULL;
+                name = nullptr;
             }
         }
     };
@@ -119,19 +119,19 @@
         ~ProgramEntry() {
             if (vtxAttrs) {
                 delete[] vtxAttrs;
-                vtxAttrs = NULL;
+                vtxAttrs = nullptr;
             }
             if (vtxUniforms) {
                 delete[] vtxUniforms;
-                vtxUniforms = NULL;
+                vtxUniforms = nullptr;
             }
             if (fragUniforms) {
                 delete[] fragUniforms;
-                fragUniforms = NULL;
+                fragUniforms = nullptr;
             }
             if (fragUniformIsSTO) {
                 delete[] fragUniformIsSTO;
-                fragUniformIsSTO = NULL;
+                fragUniformIsSTO = nullptr;
             }
         }
         uint32_t vtx;
@@ -143,7 +143,7 @@
         UniformData *fragUniforms;
         bool *fragUniformIsSTO;
     };
-    android::Vector<ProgramEntry*> mEntries;
+    std::vector<ProgramEntry*> mEntries;
     ProgramEntry *mCurrent;
 
     bool hasArrayUniforms(RsdShader *vtx, RsdShader *frag);
@@ -156,7 +156,3 @@
 
 
 #endif //ANDROID_RSD_SHADER_CACHE_H
-
-
-
-
diff --git a/driver/rsdType.cpp b/driver/rsdType.cpp
index fa7b46c..869a86d 100644
--- a/driver/rsdType.cpp
+++ b/driver/rsdType.cpp
@@ -46,9 +46,9 @@
 {
     obj->p = t;
 #ifdef __LP64__
-    obj->r = NULL;
-    obj->v1 = NULL;
-    obj->v2 = NULL;
+    obj->r = nullptr;
+    obj->v1 = nullptr;
+    obj->v2 = nullptr;
 #endif
 }
 
diff --git a/driver/rsdVertexArray.cpp b/driver/rsdVertexArray.cpp
index 4e293f6..c18a062 100644
--- a/driver/rsdVertexArray.cpp
+++ b/driver/rsdVertexArray.cpp
@@ -46,9 +46,9 @@
     type = 0;
     size = 0;
     stride = 0;
-    ptr = NULL;
+    ptr = nullptr;
     normalized = false;
-    name.setTo("");
+    name = "";
 }
 
 void RsdVertexArray::Attrib::set(uint32_t type, uint32_t size, uint32_t stride,
@@ -60,7 +60,7 @@
     this->offset = offset;
     this->normalized = normalized;
     this->stride = stride;
-    this->name.setTo(name);
+    this->name = name;
 }
 
 void RsdVertexArray::logAttrib(uint32_t idx, uint32_t slot) const {
@@ -69,7 +69,7 @@
     }
     ALOGV("va %i: slot=%i name=%s buf=%i ptr=%p size=%i  type=0x%x  stride=0x%x  norm=%i  offset=0x%p",
           idx, slot,
-          mAttribs[idx].name.string(),
+          mAttribs[idx].name.c_str(),
           mAttribs[idx].buffer,
           mAttribs[idx].ptr,
           mAttribs[idx].size,
@@ -118,14 +118,14 @@
 }
 ////////////////////////////////////////////
 RsdVertexArrayState::RsdVertexArrayState() {
-    mAttrsEnabled = NULL;
+    mAttrsEnabled = nullptr;
     mAttrsEnabledSize = 0;
 }
 
 RsdVertexArrayState::~RsdVertexArrayState() {
     if (mAttrsEnabled) {
         delete[] mAttrsEnabled;
-        mAttrsEnabled = NULL;
+        mAttrsEnabled = nullptr;
     }
 }
 void RsdVertexArrayState::init(uint32_t maxAttrs) {
@@ -135,4 +135,3 @@
         mAttrsEnabled[ct] = false;
     }
 }
-
diff --git a/driver/rsdVertexArray.h b/driver/rsdVertexArray.h
index 975121b..1bafe3b 100644
--- a/driver/rsdVertexArray.h
+++ b/driver/rsdVertexArray.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_RSD_VERTEX_ARRAY_H
 #define ANDROID_RSD_VERTEX_ARRAY_H
 
+#include <string>
+
 #include "rsUtils.h"
 
 namespace android {
@@ -39,7 +41,7 @@
         uint32_t size;
         uint32_t stride;
         bool normalized;
-        android::String8 name;
+        std::string name;
 
         Attrib();
         void clear();
@@ -74,6 +76,3 @@
 
 
 #endif //ANDROID_RSD_VERTEX_ARRAY_H
-
-
-
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index fe45420..a79ad2a 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -548,7 +548,7 @@
 
 extern float __attribute__((overloadable)) rootn(float v, int r) {
     if (r == 0) {
-        return posinf(0);
+        return posinf();
     }
 
     if (iszero(v)) {
diff --git a/driver/runtime/rs_structs.h b/driver/runtime/rs_structs.h
index e35cd54..40f740a 100644
--- a/driver/runtime/rs_structs.h
+++ b/driver/runtime/rs_structs.h
@@ -275,7 +275,7 @@
             void **vertexBuffers;
             uint32_t vertexBuffersCount;
 
-            // indexBuffers[i] could be NULL, in which case only primitives[i] is used
+            // indexBuffers[i] could be nullptr, in which case only primitives[i] is used
             void **indexBuffers;
             uint32_t indexBuffersCount;
             rs_primitive *primitives;
diff --git a/java/tests/HelloComputeNDK/Android.mk b/java/tests/HelloComputeNDK/Android.mk
index 58b95aa..5f1bd17 100644
--- a/java/tests/HelloComputeNDK/Android.mk
+++ b/java/tests/HelloComputeNDK/Android.mk
@@ -17,6 +17,8 @@
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
 
+LOCAL_CFLAGS := -std=c++11
+
 LOCAL_MODULE_TAGS := tests
 
 LOCAL_SRC_FILES := $(call all-java-files-under, src) \
diff --git a/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk b/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
index 928accb..9a012a1 100644
--- a/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
+++ b/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
@@ -17,6 +17,7 @@
 #
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 LOCAL_CLANG := true
 
 LOCAL_MODULE := libhellocomputendk
@@ -26,10 +27,11 @@
 LOCAL_C_INCLUDES := $(JNI_H_INCLUDE)
 LOCAL_C_INCLUDES += frameworks/rs/cpp
 LOCAL_C_INCLUDES += frameworks/rs
-LOCAL_C_INCLUDES += external/stlport/stlport bionic/ bionic/libstdc++/include
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS := -Wl,-Bsymbolic
 LOCAL_SHARED_LIBRARIES := libdl liblog libjnigraphics
 LOCAL_STATIC_LIBRARIES := libRScpp_static
 
+include external/stlport/libstlport.mk
 include $(BUILD_SHARED_LIBRARY)
diff --git a/java/tests/HelloComputeNDK/libhellocomputendk/helloComputeNDK.cpp b/java/tests/HelloComputeNDK/libhellocomputendk/helloComputeNDK.cpp
index 4985664..1aab056 100644
--- a/java/tests/HelloComputeNDK/libhellocomputendk/helloComputeNDK.cpp
+++ b/java/tests/HelloComputeNDK/libhellocomputendk/helloComputeNDK.cpp
@@ -27,13 +27,13 @@
                                                                        )
 {
 
-    void* inputPtr = NULL;
-    void* outputPtr = NULL;
+    void* inputPtr = nullptr;
+    void* outputPtr = nullptr;
 
     AndroidBitmap_lockPixels(env, jbitmapIn, &inputPtr);
     AndroidBitmap_lockPixels(env, jbitmapOut, &outputPtr);
 
-    const char * path = env->GetStringUTFChars(pathObj, NULL);
+    const char * path = env->GetStringUTFChars(pathObj, nullptr);
     sp<RS> rs = new RS();
     rs->init(path);
     env->ReleaseStringUTFChars(pathObj, path);
diff --git a/java/tests/ImageProcessing2/Android.mk b/java/tests/ImageProcessing2/Android.mk
index a6d5ec9..9c53d91 100644
--- a/java/tests/ImageProcessing2/Android.mk
+++ b/java/tests/ImageProcessing2/Android.mk
@@ -25,7 +25,7 @@
 LOCAL_STATIC_JAVA_LIBRARIES := android-support-v8-renderscript
 
 LOCAL_PACKAGE_NAME := ImageProcessing2
-LOCAL_SDK_VERSION := 8
+LOCAL_SDK_VERSION := 14
 LOCAL_RENDERSCRIPT_TARGET_API := 18
 LOCAL_RENDERSCRIPT_COMPATIBILITY := 18
 LOCAL_RENDERSCRIPT_INCLUDES_OVERRIDE := $(TOPDIR)external/clang/lib/Headers \
diff --git a/java/tests/ImageProcessing2/AndroidManifest.xml b/java/tests/ImageProcessing2/AndroidManifest.xml
index 0129fa8..0f89c35 100644
--- a/java/tests/ImageProcessing2/AndroidManifest.xml
+++ b/java/tests/ImageProcessing2/AndroidManifest.xml
@@ -3,11 +3,19 @@
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="com.android.rs.image2">
     <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
-    <uses-sdk android:minSdkVersion="8" />
-    <application android:label="IP GB">
+    <uses-sdk android:minSdkVersion="14" />
+    <application android:label="IP-Compat"
+                 android:hardwareAccelerated="true"
+                 android:theme="@android:style/Theme.Holo.Light">
         <activity android:name="ImageProcessingActivity2">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
+            </intent-filter>
+        </activity>
+
+        <activity class=".IPControls" android:name="IPControls">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
                 <category android:name="android.intent.category.LAUNCHER" />
             </intent-filter>
         </activity>
diff --git a/java/tests/ImageProcessing2/res/drawable-hdpi/ic_action_settings.png b/java/tests/ImageProcessing2/res/drawable-hdpi/ic_action_settings.png
new file mode 100644
index 0000000..54eecde
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-hdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-mdpi/ic_action_settings.png b/java/tests/ImageProcessing2/res/drawable-mdpi/ic_action_settings.png
new file mode 100644
index 0000000..25c36db
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-mdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/city.png b/java/tests/ImageProcessing2/res/drawable-nodpi/city.png
deleted file mode 100644
index 856eeff..0000000
--- a/java/tests/ImageProcessing2/res/drawable-nodpi/city.png
+++ /dev/null
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720a.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720a.jpg
new file mode 100644
index 0000000..ff09574
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720a.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720b.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720b.jpg
new file mode 100644
index 0000000..e9f6aa4
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img1280x720b.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067.jpg
deleted file mode 100644
index 05d3ee2..0000000
--- a/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067.jpg
+++ /dev/null
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067b.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067b.jpg
deleted file mode 100644
index aed0781..0000000
--- a/java/tests/ImageProcessing2/res/drawable-nodpi/img1600x1067b.jpg
+++ /dev/null
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080a.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080a.jpg
new file mode 100644
index 0000000..80b16ab
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080a.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080b.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080b.jpg
new file mode 100644
index 0000000..b4883d6
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img1920x1080b.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450a.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450a.jpg
new file mode 100644
index 0000000..6d5b623
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450a.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450b.jpg b/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450b.jpg
new file mode 100644
index 0000000..2013e07
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-nodpi/img800x450b.jpg
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-xhdpi/ic_action_settings.png b/java/tests/ImageProcessing2/res/drawable-xhdpi/ic_action_settings.png
new file mode 100644
index 0000000..425a8bc
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-xhdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/drawable-xxhdpi/ic_action_settings.png b/java/tests/ImageProcessing2/res/drawable-xxhdpi/ic_action_settings.png
new file mode 100644
index 0000000..fe5fec4
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/drawable-xxhdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing2/res/layout/controls.xml b/java/tests/ImageProcessing2/res/layout/controls.xml
new file mode 100644
index 0000000..6b5021e
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/layout/controls.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2014 The Android Open Source Project
+
+     Licensed under the Apache License, Version 2.0 (the "License");
+     you may not use this file except in compliance with the License.
+     You may obtain a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+-->
+
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+            android:orientation="vertical"
+            android:layout_width="fill_parent"
+            android:layout_height="fill_parent"
+            android:id="@+id/toplevel">
+
+    <ListView
+        android:id="@+id/test_list"
+        android:layout_weight="0.2"
+        android:layout_width="fill_parent"
+        android:layout_height="wrap_content"/>
+
+    <LinearLayout
+        android:orientation="horizontal"
+        android:layout_width="fill_parent" android:layout_height="wrap_content">
+        <Button
+             android:id="@+id/run"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/benchmark"
+             android:onClick="btnRun"/>
+        <Button
+             android:id="@+id/select_all"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/select_all"
+             android:onClick="btnSelAll"/>
+        <Button
+             android:id="@+id/select_none"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/select_none"
+             android:onClick="btnSelNone"/>
+    </LinearLayout>
+
+    <TextView
+        android:id="@+id/results"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"
+        android:textSize="8pt"
+        android:layout_marginLeft="10sp"
+        android:layout_marginTop="15sp"
+        android:text="@string/results"/>
+
+</LinearLayout>
+
diff --git a/java/tests/ImageProcessing2/res/layout/main.xml b/java/tests/ImageProcessing2/res/layout/main.xml
index f0a2b92..765c7b1 100644
--- a/java/tests/ImageProcessing2/res/layout/main.xml
+++ b/java/tests/ImageProcessing2/res/layout/main.xml
@@ -36,27 +36,13 @@
                 android:layout_height="wrap_content" />
             <LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
                 android:orientation="horizontal"
-                android:layout_width="fill_parent"
+                android:layout_width="wrap_content"
                 android:layout_height="wrap_content">
-                    <Button
-                        android:layout_width="wrap_content"
-                        android:layout_height="wrap_content"
-                        android:text="@string/benchmark"
-                        android:onClick="benchmark"/>
-                    <TextView
-                        android:id="@+id/benchmarkText"
-                        android:layout_width="match_parent"
-                        android:layout_height="wrap_content"
-                        android:textSize="8pt"
-                        android:text="@string/saturation"/>
             </LinearLayout>
-            <Spinner
-                android:id="@+id/filterselection"
-                android:layout_width="fill_parent"
-                android:layout_height="wrap_content"/>
+
             <Spinner
                 android:id="@+id/spinner1"
-                android:layout_width="fill_parent"
+                android:layout_width="wrap_content"
                 android:layout_height="wrap_content"/>
             <TextView
                 android:id="@+id/slider1Text"
@@ -128,11 +114,6 @@
                 android:layout_marginRight="10sp"
                 android:layout_width="match_parent"
                 android:layout_height="wrap_content"/>
-            <Button
-                android:layout_width="wrap_content"
-                android:layout_height="wrap_content"
-                android:text="@string/benchmark_all"
-                    android:onClick="benchmark_all"/>
             </LinearLayout>
     </ScrollView>
 </LinearLayout>
diff --git a/java/tests/ImageProcessing2/res/layout/spinner_layout.xml b/java/tests/ImageProcessing2/res/layout/spinner_layout.xml
index 8196bbf..7e9590e 100644
--- a/java/tests/ImageProcessing2/res/layout/spinner_layout.xml
+++ b/java/tests/ImageProcessing2/res/layout/spinner_layout.xml
@@ -18,6 +18,6 @@
 <TextView xmlns:android="http://schemas.android.com/apk/res/android"
     android:layout_width="fill_parent"
     android:layout_height="fill_parent"
-    android:padding="10dp"
-    android:textSize="16sp"
+    android:padding="2sp"
+    android:textSize="14sp"
 />
diff --git a/java/tests/ImageProcessing2/res/menu/main_activity_actions.xml b/java/tests/ImageProcessing2/res/menu/main_activity_actions.xml
new file mode 100644
index 0000000..df0159b
--- /dev/null
+++ b/java/tests/ImageProcessing2/res/menu/main_activity_actions.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2014 The Android Open Source Project
+
+     Licensed under the Apache License, Version 2.0 (the "License");
+     you may not use this file except in compliance with the License.
+     You may obtain a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+-->
+
+<menu xmlns:android="http://schemas.android.com/apk/res/android" >
+    <item android:id="@+id/action_res"
+          android:title="@string/action_res"
+          android:icon="@drawable/ic_action_settings"
+          android:showAsAction="always"
+          android:actionViewClass="android.widget.Spinner" />
+
+
+    <item android:id="@+id/action_settings"
+          android:icon="@drawable/ic_action_settings"
+          android:title="@string/action_settings"
+          android:showAsAction="always"/>
+
+</menu>
diff --git a/java/tests/ImageProcessing2/res/values/strings.xml b/java/tests/ImageProcessing2/res/values/strings.xml
index a7dd165..c8f9bc4 100644
--- a/java/tests/ImageProcessing2/res/values/strings.xml
+++ b/java/tests/ImageProcessing2/res/values/strings.xml
@@ -29,6 +29,26 @@
     <string name="gamma">Gamma</string>
     <string name="saturation">Saturation</string>
     <string name="benchmark">Benchmark</string>
-    <string name="benchmark_all">Benchmark All</string>
+
+    <string name="results">Results: not run</string>
+    <string name="length_long">Long run</string>
+    <string name="length_short">Short run</string>
+    <string name="select_all">All</string>
+    <string name="select_none">None</string>
+
+    <string name="action_settings">Setting</string>
+    <string name="action_resolution">Resolution</string>
+
+    <string name="action_res">res</string>
+    <string name="ok">Ok</string>
+    <string name="cancel">Cancel</string>
+    <string name="settings">settings</string>
+    <string-array
+        name="settings_array">
+        <item>Animate parameters during benchmark</item>
+        <item>Display output while testing</item>
+        <item>Run each test longer, 10 seconds</item>
+        <item>Pause 10 seconds between tests</item>
+    </string-array>
 
 </resources>
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Artistic1.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Artistic1.java
new file mode 100644
index 0000000..611b1c4
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Artistic1.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.support.v8.renderscript.*;
+
+public class Artistic1 extends TestBase {
+    private ScriptC_artistic1 mScript;
+    private Allocation mBlured;
+
+    public void createTest(android.content.res.Resources res) {
+        mScript = new ScriptC_artistic1(mRS);
+        mBlured = Allocation.createTyped(mRS, mInPixelsAllocation.getType());
+        mScript.set_gBlur(mBlured);
+
+        ScriptIntrinsicBlur blur = ScriptIntrinsicBlur.create(mRS, Element.U8_4(mRS));
+        blur.setRadius(20);
+        blur.setInput(mInPixelsAllocation);
+        blur.forEach(mBlured);
+    }
+
+    public void runTest() {
+        mScript.invoke_setup();
+        mScript.forEach_process(mInPixelsAllocation, mOutPixelsAllocation);
+    }
+
+}
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Blend.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Blend.java
index d81ba88..7513bd4 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Blend.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Blend.java
@@ -46,7 +46,7 @@
                     currentIntrinsic = pos;
                     if (mRS != null) {
                         runTest();
-                        act.updateDisplay();
+                        act.mProcessor.update();
                     }
                 }
 
@@ -114,10 +114,10 @@
         image2.copy2DRangeFrom(0, 0, mInPixelsAllocation2.getType().getX(), mInPixelsAllocation2.getType().getY(), mInPixelsAllocation2, 0, 0);
 
         mBlendHelper.set_alpha(image1Alpha);
-        mBlendHelper.forEach_setImageAlpha(image1);
+        mBlendHelper.forEach_setImageAlpha(image1, image1);
 
         mBlendHelper.set_alpha(image2Alpha);
-        mBlendHelper.forEach_setImageAlpha(image2);
+        mBlendHelper.forEach_setImageAlpha(image2, image2);
 
         switch (currentIntrinsic) {
         case 0:
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25.java
index fb5db18..374693b 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25.java
@@ -95,20 +95,4 @@
             mScript.forEach_vert(mOutPixelsAllocation);
         }
     }
-
-    public void setupBenchmark() {
-        if (mUseIntrinsic) {
-            mIntrinsic.setRadius(MAX_RADIUS);
-        } else {
-            mScript.invoke_setRadius(MAX_RADIUS);
-        }
-    }
-
-    public void exitBenchmark() {
-        if (mUseIntrinsic) {
-            mIntrinsic.setRadius(mRadius);
-        } else {
-            mScript.invoke_setRadius((int)mRadius);
-        }
-    }
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25G.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25G.java
index 19aa9f7..0d6939e 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25G.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Blur25G.java
@@ -76,14 +76,6 @@
         mIntrinsic.forEach(mScratchPixelsAllocation2);
     }
 
-    public void setupBenchmark() {
-        mIntrinsic.setRadius(MAX_RADIUS);
-    }
-
-    public void exitBenchmark() {
-        mIntrinsic.setRadius(mRadius);
-    }
-
     public void updateBitmap(Bitmap b) {
         mScript.forEach_toU8_4(mScratchPixelsAllocation2, mOutPixelsAllocation);
         mOutPixelsAllocation.copyTo(b);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/ColorMatrix.java b/java/tests/ImageProcessing2/src/com/android/rs/image/ColorMatrix.java
index 9a43f03..8da1615 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/ColorMatrix.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/ColorMatrix.java
@@ -51,6 +51,22 @@
         }
     }
 
+    public void animateBars(float time) {
+        Matrix4f m = new Matrix4f();
+        m.set(1, 0, (time + 0.2f) % 1.0f);
+        m.set(1, 1, (time + 0.9f) % 1.0f);
+        m.set(1, 2, (time + 0.4f) % 1.0f);
+        if (mUseIntrinsic) {
+            if (mUseGrey) {
+                return;
+            } else {
+                mIntrinsic.setColorMatrix(m);
+            }
+        } else {
+            mScript.invoke_setMatrix(m);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mInPixelsAllocation, mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve3x3.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve3x3.java
index 32c5846..89342dc 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve3x3.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve3x3.java
@@ -22,7 +22,7 @@
 import android.util.Log;
 
 public class Convolve3x3 extends TestBase {
-    private ScriptC_ip2_convolve3x3 mScript;
+    private ScriptC_convolve3x3 mScript;
     private ScriptIntrinsicConvolve3x3 mIntrinsic;
 
     private int mWidth;
@@ -33,21 +33,32 @@
         mUseIntrinsic = useIntrinsic;
     }
 
+    private float blend(float v1, float v2, float p) {
+        return (v2 * p) + (v1 * (1.f-p));
+    }
+
+    private float[] updateMatrix(float str) {
+        float f[] = new float[9];
+        float cf1 = blend(1.f / 9.f, 0.f, str);
+        float cf2 = blend(1.f / 9.f, -1.f, str);
+        float cf3 = blend(1.f / 9.f, 5.f, str);
+        f[0] =  cf1;  f[1] = cf2;   f[2] = cf1;
+        f[3] =  cf2;  f[4] = cf3;   f[5] = cf2;
+        f[6] =  cf1;  f[7] = cf2;   f[8] = cf1;
+        return f;
+    }
+
     public void createTest(android.content.res.Resources res) {
         mWidth = mInPixelsAllocation.getType().getX();
         mHeight = mInPixelsAllocation.getType().getY();
 
-        float f[] = new float[9];
-        f[0] =  0.f;    f[1] = -1.f;    f[2] =  0.f;
-        f[3] = -1.f;    f[4] =  5.f;    f[5] = -1.f;
-        f[6] =  0.f;    f[7] = -1.f;    f[8] =  0.f;
-
+        float f[] = updateMatrix(1.f);
         if (mUseIntrinsic) {
             mIntrinsic = ScriptIntrinsicConvolve3x3.create(mRS, Element.U8_4(mRS));
             mIntrinsic.setCoefficients(f);
             mIntrinsic.setInput(mInPixelsAllocation);
         } else {
-            mScript = new ScriptC_ip2_convolve3x3(mRS);
+            mScript = new ScriptC_convolve3x3(mRS);
             mScript.set_gCoeffs(f);
             mScript.set_gIn(mInPixelsAllocation);
             mScript.set_gWidth(mWidth);
@@ -55,6 +66,15 @@
         }
     }
 
+    public void animateBars(float time) {
+        float f[] = updateMatrix(time % 1.f);
+        if (mUseIntrinsic) {
+            mIntrinsic.setCoefficients(f);
+        } else {
+            mScript.set_gCoeffs(f);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve5x5.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve5x5.java
index 411e2a8..0dc4103 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve5x5.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Convolve5x5.java
@@ -33,11 +33,33 @@
         mUseIntrinsic = useIntrinsic;
     }
 
+    private float blend(float v1, float v2, float p) {
+        return (v2 * p) + (v1 * (1.f-p));
+    }
+
+    private float[] updateMatrix(float str) {
+        float f[] = new float[25];
+        final float f125 = 1.f / 25.f;
+        float cf1 = blend(f125, -1.f, str);
+        float cf2 = blend(f125, -3.f, str);
+        float cf3 = blend(f125, -4.f, str);
+        float cf4 = blend(f125, 6.f, str);
+        float cf5 = blend(f125, 20.f, str);
+        float cf6 = blend(f125, 0.f, str);
+        f[0] = cf1;  f[1] = cf2; f[2] = cf3; f[3] = cf2; f[4] = cf1;
+        f[5] = cf2;  f[6] = cf6; f[7] = cf4; f[8] = cf6; f[9] = cf2;
+        f[10]= cf3;  f[11]= cf4; f[12]= cf5; f[13]= cf4; f[14]= cf3;
+        f[15]= cf2;  f[16]= cf6; f[17]= cf4; f[18]= cf6; f[19]= cf2;
+        f[20]= cf1;  f[21]= cf2; f[22]= cf3; f[23]= cf2; f[24]= cf1;
+        return f;
+    }
+
+
     public void createTest(android.content.res.Resources res) {
         mWidth = mInPixelsAllocation.getType().getX();
         mHeight = mInPixelsAllocation.getType().getY();
 
-        float f[] = new float[25];
+        float f[] = updateMatrix(1.f);
         //f[0] = 0.012f; f[1] = 0.025f; f[2] = 0.031f; f[3] = 0.025f; f[4] = 0.012f;
         //f[5] = 0.025f; f[6] = 0.057f; f[7] = 0.075f; f[8] = 0.057f; f[9] = 0.025f;
         //f[10]= 0.031f; f[11]= 0.075f; f[12]= 0.095f; f[13]= 0.075f; f[14]= 0.031f;
@@ -50,12 +72,6 @@
         //f[15]= 4.f; f[16]= 8.f; f[17]= 0.f; f[18]= -8.f; f[19]= -4.f;
         //f[20]= 1.f; f[21]= 2.f; f[22]= 0.f; f[23]= -2.f; f[24]= -1.f;
 
-        f[0] = -1.f; f[1] = -3.f; f[2] = -4.f; f[3] = -3.f; f[4] = -1.f;
-        f[5] = -3.f; f[6] =  0.f; f[7] =  6.f; f[8] =  0.f; f[9] = -3.f;
-        f[10]= -4.f; f[11]=  6.f; f[12]= 20.f; f[13]=  6.f; f[14]= -4.f;
-        f[15]= -3.f; f[16]=  0.f; f[17]=  6.f; f[18]=  0.f; f[19]= -3.f;
-        f[20]= -1.f; f[21]= -3.f; f[22]= -4.f; f[23]= -3.f; f[24]= -1.f;
-
         if (mUseIntrinsic) {
             mIntrinsic = ScriptIntrinsicConvolve5x5.create(mRS, Element.U8_4(mRS));
             mIntrinsic.setCoefficients(f);
@@ -69,6 +85,15 @@
         }
     }
 
+    public void animateBars(float time) {
+        float f[] = updateMatrix(time % 1.f);
+        if (mUseIntrinsic) {
+            mIntrinsic.setCoefficients(f);
+        } else {
+            mScript.set_gCoeffs(f);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Fisheye.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Fisheye.java
index 80a2fb4..22bdd8e 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Fisheye.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Fisheye.java
@@ -68,6 +68,11 @@
         do_init();
     }
 
+    public void animateBars(float time) {
+        scale = time % 2.f;
+        do_init();
+    }
+
     private void do_init() {
         if (approx) {
             if (relaxed)
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Grain.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Grain.java
index 4dac265..4939ea3 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Grain.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Grain.java
@@ -40,6 +40,11 @@
         mScript.set_gNoiseStrength(s);
     }
 
+    public void animateBars(float time) {
+        mScript.set_gNoiseStrength(time % 1.f);
+    }
+
+
     private int findHighBit(int v) {
         int bit = 0;
         while (v > 1) {
@@ -84,6 +89,5 @@
         mScript.forEach_blend9(mNoise2);
         mScript.forEach_root(mInPixelsAllocation, mOutPixelsAllocation);
     }
-
 }
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/GroupTest.java b/java/tests/ImageProcessing2/src/com/android/rs/image/GroupTest.java
index a7ceebe..c224574 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/GroupTest.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/GroupTest.java
@@ -72,6 +72,14 @@
         }
     }
 
+    public void animateBars(float time) {
+        Matrix4f m = new Matrix4f();
+        m.set(1, 0, (time + 0.2f) % 1.0f);
+        m.set(1, 1, (time + 0.9f) % 1.0f);
+        m.set(1, 2, (time + 0.4f) % 1.0f);
+        mMatrix.setColorMatrix(m);
+    }
+
     public void runTest() {
         mConvolve.setInput(mInPixelsAllocation);
         if (mUseNative) {
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/IPControls.java b/java/tests/ImageProcessing2/src/com/android/rs/image/IPControls.java
new file mode 100644
index 0000000..b99c187
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/IPControls.java
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.view.Menu;
+import android.view.MenuItem;
+import android.view.MenuInflater;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.os.Handler;
+import android.graphics.Point;
+import android.view.SurfaceView;
+import android.widget.AdapterView;
+import android.widget.ArrayAdapter;
+import android.widget.ImageView;
+import android.widget.SeekBar;
+import android.widget.Spinner;
+import android.widget.ToggleButton;
+import android.widget.TextView;
+import android.widget.CompoundButton;
+import android.widget.ListView;
+import android.view.View;
+import java.util.ArrayList;
+import java.util.ListIterator;
+import android.util.Log;
+import android.content.Intent;
+
+import android.os.Environment;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+
+public class IPControls extends Activity {
+    private final String TAG = "Img";
+    public final String RESULT_FILE = "ip_compat_result.csv";
+
+    private Spinner mResSpinner;
+    private ListView mTestListView;
+    private TextView mResultView;
+
+    private ArrayAdapter<String> mTestListAdapter;
+    private ArrayList<String> mTestList = new ArrayList<String>();
+
+    private boolean mSettings[] = {true, true, true, false, false, false};
+    // Not supported in compatibility library version
+    //private static final int SETTING_USE_IO = 0;
+    private static final int SETTING_ANIMATE = 1;
+    private static final int SETTING_DISPLAY = 2;
+    private static final int SETTING_LONG_RUN = 3;
+    private static final int SETTING_PAUSE = 4;
+
+    private float mResults[];
+
+    public enum Resolutions {
+        RES_1080P(1920, 1080, "1080p (1920x1080)"),
+        RES_720P(1280, 720, "720p (1280x720)"),
+        RES_WVGA(800, 480, "WVGA (800x480)");
+
+        private final String name;
+        public final int width;
+        public final int height;
+
+        private Resolutions(int w, int h, String s) {
+            width = w;
+            height = h;
+            name = s;
+        }
+
+        // return the human-readable label for this resolution (e.g. shown by the spinner adapter)
+        public String toString() {
+            return name;
+        }
+    }
+    private Resolutions mRes;
+
+    @Override
+    public boolean onCreateOptionsMenu(Menu menu) {
+        // Inflate the menu items for use in the action bar
+        MenuInflater inflater = getMenuInflater();
+        inflater.inflate(R.menu.main_activity_actions, menu);
+
+        MenuItem searchItem = menu.findItem(R.id.action_res);
+        mResSpinner = (Spinner) searchItem.getActionView();
+
+        mResSpinner.setOnItemSelectedListener(mResSpinnerListener);
+        mResSpinner.setAdapter(new ArrayAdapter<Resolutions>(
+            this, R.layout.spinner_layout, Resolutions.values()));
+
+        // Pre-select the first listed resolution whose width is at most 120% of
+        // the longer screen dimension.
+        Point size = new Point();
+        getWindowManager().getDefaultDisplay().getSize(size);
+        int md = (size.x > size.y) ? size.x : size.y;
+        for (int ct=0; ct < Resolutions.values().length; ct++) {
+            if (Resolutions.values()[ct].width <= (int)(md * 1.2)) {
+                mResSpinner.setSelection(ct);
+                break;
+            }
+        }
+
+        return super.onCreateOptionsMenu(menu);
+    }
+
+
+    private AdapterView.OnItemSelectedListener mResSpinnerListener =
+            new AdapterView.OnItemSelectedListener() {
+                public void onItemSelected(AdapterView<?> parent, View view, int pos, long id) {
+                    mRes = Resolutions.values()[pos];
+                }
+
+                public void onNothingSelected(AdapterView parent) {
+                }
+            };
+
+    void launchDemo(int id) {
+        // Starts a single test with the "demo" extra set. id is the test's list
+        // position, which is also its index in IPTestList.TestName.values().
+        int testList[] = new int[1];
+        testList[0] = id;
+
+        Intent intent = makeBasicLaunchIntent();
+        intent.putExtra("tests", testList);
+        intent.putExtra("demo", true);
+        startActivityForResult(intent, 0);
+    }
+
+    void init() {
+        // Builds the test list UI: one row per test, in enum order, so a row's position is its test index.
+        for (IPTestList.TestName t : IPTestList.TestName.values()) {
+            mTestList.add(t.toString());
+        }
+
+        mTestListView = (ListView) findViewById(R.id.test_list);
+        mTestListAdapter = new ArrayAdapter<String>(this,
+                android.R.layout.simple_list_item_activated_1,
+                mTestList);
+
+        mTestListView.setAdapter(mTestListAdapter);
+        mTestListView.setChoiceMode(ListView.CHOICE_MODE_MULTIPLE);
+        mTestListAdapter.notifyDataSetChanged();
+
+        mResultView = (TextView) findViewById(R.id.results);
+        // Long-pressing a row hands its position to launchDemo().
+        mTestListView.setOnItemLongClickListener(new ListView.OnItemLongClickListener() {
+                public boolean onItemLongClick(AdapterView<?> arg0, View arg1,
+                        int pos, long id) {
+                    launchDemo(pos);
+                    return true;
+                }
+            });
+    }
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.controls);
+        init();
+    }
+
+    @Override
+    protected void onPause() {
+        super.onPause();
+
+        //cleanup();
+    }
+
+
+    @Override
+    protected void onResume() {
+        super.onResume();
+
+       // init();
+    }
+
+    private void checkGroup(int group) {
+        IPTestList.TestName t[] = IPTestList.TestName.values();
+        for (int i=0; i < t.length; i++) {
+            mTestListView.setItemChecked(i, group == t[i].group);
+        }
+    }
+
+    Intent makeBasicLaunchIntent() {
+        Intent intent = new Intent(this, ImageProcessingActivity2.class);
+        intent.putExtra("enable long", mSettings[SETTING_LONG_RUN]);
+        intent.putExtra("enable pause", mSettings[SETTING_PAUSE]);
+        intent.putExtra("enable animate", mSettings[SETTING_ANIMATE]);
+        intent.putExtra("enable display", mSettings[SETTING_DISPLAY]);
+        intent.putExtra("resolution X", mRes.width);
+        intent.putExtra("resolution Y", mRes.height);
+        return intent;
+    }
+
+    public void btnRun(View v) {
+        IPTestList.TestName t[] = IPTestList.TestName.values();
+
+        int count = 0;
+        for (int i = 0; i < t.length; i++) {
+            if (mTestListView.isItemChecked(i)) {
+                count++;
+            }
+        }
+        if (count == 0) {
+            return;
+        }
+
+        int testList[] = new int[count];
+        count = 0;
+        for (int i = 0; i < t.length; i++) {
+            if (mTestListView.isItemChecked(i)) {
+                testList[count++] = i;
+            }
+        }
+
+        Intent intent = makeBasicLaunchIntent();
+        intent.putExtra("tests", testList);
+        startActivityForResult(intent, 0);
+    }
+
+    float rebase(float v, IPTestList.TestName t) {
+        if (v > 0.001) {
+            v = t.baseline / v;
+        }
+        float pr = (1920.f / mRes.width) * (1080.f / mRes.height);
+        return v / pr;
+    }
+
+    private void writeResults() {
+        // Writes one CSV row per test (name, raw result, rebased result) to RESULT_FILE on external storage.
+        File externalStorage = Environment.getExternalStorageDirectory();
+        if (!externalStorage.canWrite()) {
+            Log.v(TAG, "sdcard is not writable");
+            return;
+        }
+        File resultFile = new File(externalStorage, RESULT_FILE);
+        resultFile.setWritable(true, false);
+        BufferedWriter rsWriter = null;
+        try {
+            rsWriter = new BufferedWriter(new FileWriter(resultFile));
+            java.text.DecimalFormat df = new java.text.DecimalFormat("######.##");
+            for (IPTestList.TestName t : IPTestList.TestName.values()) {
+                final float r = mResults[t.ordinal()];
+                rsWriter.write(t.toString() + ", " + df.format(r) + ", " + df.format(rebase(r, t)) + "\n");
+            }
+            rsWriter.flush();
+            Log.v(TAG, "Saved results in: " + resultFile.getAbsolutePath());
+        } catch (IOException e) {
+            Log.v(TAG, "Unable to write result file " + e.getMessage());
+        } finally {
+            // Always release the file handle, even when a write above failed.
+            try { if (rsWriter != null) rsWriter.close(); } catch (IOException e) { /* data already flushed or failure already logged */ }
+        }
+    }
+
+    protected void onActivityResult(int requestCode, int resultCode, Intent data) {
+        if (requestCode == 0) {
+            if (resultCode == RESULT_OK) {
+                java.text.DecimalFormat df = new java.text.DecimalFormat("######.#");
+                mResults = new float[IPTestList.TestName.values().length];
+
+                float r[] = data.getFloatArrayExtra("results");
+                int id[] = data.getIntArrayExtra("tests");
+
+                for (int ct=0; ct < id.length; ct++) {
+                    IPTestList.TestName t = IPTestList.TestName.values()[id[ct]];
+
+                    String s = t.toString() + "   " + df.format(rebase(r[ct], t)) +
+                            "X,   " + df.format(r[ct]) + "ms";
+                    mTestList.set(id[ct], s);
+                    mTestListAdapter.notifyDataSetChanged();
+                    mResults[id[ct]] = r[ct];
+                }
+
+                double gm[] = {1.0, 1.0, 1.0};
+                double count[] = {0, 0, 0};
+                for (int ct=0; ct < IPTestList.TestName.values().length; ct++) {
+                    IPTestList.TestName t = IPTestList.TestName.values()[ct];
+                    gm[t.group] *= rebase(mResults[ct], t);
+                    count[t.group] += 1.0;
+                }
+                gm[0] = java.lang.Math.pow(gm[0], 1.0 / count[0]);
+                gm[1] = java.lang.Math.pow(gm[1], 1.0 / count[1]);
+                gm[2] = java.lang.Math.pow(gm[2], 1.0 / count[2]);
+
+                String s = "Results:  fp full=" + df.format(gm[0]) +
+                        ",  fp relaxed=" +df.format(gm[1]) +
+                        ",  intrinsics=" + df.format(gm[2]);
+                mResultView.setText(s);
+                writeResults();
+            }
+        }
+    }
+
+    public void btnSelAll(View v) {
+        IPTestList.TestName t[] = IPTestList.TestName.values();
+        for (int i=0; i < t.length; i++) {
+            mTestListView.setItemChecked(i, true);
+        }
+    }
+
+    public boolean onOptionsItemSelected(MenuItem item) {
+        // Handle presses on the action bar items
+        switch(item.getItemId()) {
+            case R.id.action_settings:
+                IPSettings newFragment = new IPSettings(mSettings);
+                newFragment.show(getFragmentManager(), "settings");
+                return true;
+            default:
+                return super.onOptionsItemSelected(item);
+        }
+    }
+
+    public void btnSelNone(View v) {
+        checkGroup(-1);
+    }
+
+    public void btnSelHp(View v) {
+        checkGroup(0);
+    }
+
+    public void btnSelLp(View v) {
+        checkGroup(1);
+    }
+
+    public void btnSettings(View v) {
+        IPSettings newFragment = new IPSettings(mSettings);
+        newFragment.show(getFragmentManager(), "settings");
+    }
+
+    public void btnSelIntrinsic(View v) {
+        checkGroup(2);
+    }
+
+
+
+}
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/IPSettings.java b/java/tests/ImageProcessing2/src/com/android/rs/image/IPSettings.java
new file mode 100644
index 0000000..e73f542
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/IPSettings.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.DialogFragment;
+import android.app.Dialog;
+import android.content.DialogInterface;
+import android.os.Bundle;
+import android.view.View;
+
+/**
+ * Settings dialog: a multi-choice list whose check marks are written straight
+ * into the boolean array supplied by the caller.
+ *
+ * When the framework re-creates the fragment (for example after a configuration
+ * change) it uses the no-argument constructor, so the link to the caller's array
+ * is lost; the dialog then works on the array restored from saved state, or with
+ * no items checked if none was saved.
+ */
+public class IPSettings extends DialogFragment {
+    private static final String STATE_ENABLES = "enables";
+
+    private boolean[] mEnables;
+    public boolean mOk = false;
+
+    /** Required so the framework can re-instantiate the fragment; not for direct use. */
+    public IPSettings() {
+    }
+
+    /**
+     * @param enables flags shown as the initial check marks (expected to hold one
+     *                entry per item of R.array.settings_array); updated in place
+     *                as the user toggles items.
+     */
+    public IPSettings(boolean[] enables) {
+        mEnables = enables;
+    }
+
+    @Override
+    public void onSaveInstanceState(Bundle outState) {
+        super.onSaveInstanceState(outState);
+        outState.putBooleanArray(STATE_ENABLES, mEnables);
+    }
+
+    @Override
+    public Dialog onCreateDialog(Bundle savedInstanceState) {
+        if (mEnables == null && savedInstanceState != null) {
+            mEnables = savedInstanceState.getBooleanArray(STATE_ENABLES);
+        }
+
+        AlertDialog.Builder builder = new AlertDialog.Builder(getActivity());
+        builder.setTitle(R.string.settings);
+
+        // Specify the list array, the items to be selected by default (null for none),
+        // and the listener through which to receive callbacks when items are selected
+        builder.setMultiChoiceItems(R.array.settings_array, mEnables,
+                          new DialogInterface.OnMultiChoiceClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int which, boolean isChecked) {
+                       if (mEnables != null) {
+                           mEnables[which] = isChecked;
+                       }
+                   }
+               });
+
+        // Set the action buttons
+        builder.setPositiveButton(R.string.ok, new DialogInterface.OnClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int id) {
+                       mOk = true;
+                   }
+               });
+        builder.setNegativeButton(R.string.cancel, new DialogInterface.OnClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int id) {
+                       // Cancel: nothing to record; toggles already made stay in the array.
+                   }
+               });
+
+        return builder.create();
+    }
+}
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/IPTestList.java b/java/tests/ImageProcessing2/src/com/android/rs/image/IPTestList.java
new file mode 100644
index 0000000..f2e7579
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/IPTestList.java
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.app.Activity;
+import android.view.View;
+import android.util.Log;
+
+public class IPTestList {
+    private final String TAG = "Img";
+    public final String RESULT_FILE = "image_processing_result.csv";
+
+    public static final int FULL_FP = 0;
+    public static final int RELAXED_FP = 1;
+    public static final int INTRINSIC = 2;
+
+    /**
+     * Define enum type for test names
+     */
+    public enum TestName {
+        LEVELS_VEC3_RELAXED ("Levels Vec3 Relaxed", RELAXED_FP, 61.1f),
+        LEVELS_VEC4_RELAXED ("Levels Vec4 Relaxed", RELAXED_FP, 44.6f),
+        LEVELS_VEC3_FULL ("Levels Vec3 Full", FULL_FP, 61.9f),
+        LEVELS_VEC4_FULL ("Levels Vec4 Full", FULL_FP, 73.f),
+        BLUR_RADIUS_25 ("Blur radius 25", RELAXED_FP, 1103.f),
+        INTRINSIC_BLUR_RADIUS_25 ("Intrinsic Blur radius 25", INTRINSIC, 176.f),
+        GREYSCALE ("Greyscale", RELAXED_FP, 43.7f),
+        GRAIN ("Grain", RELAXED_FP, 147.4f),
+        FISHEYE_FULL ("Fisheye Full", FULL_FP, 192.f),
+        FISHEYE_RELAXED ("Fisheye Relaxed", RELAXED_FP, 181.f),
+        FISHEYE_APPROXIMATE_FULL ("Fisheye Approximate Full", FULL_FP, 193.f),
+        FISHEYE_APPROXIMATE_RELAXED ("Fisheye Approximate Relaxed", RELAXED_FP, 183.f),
+        VIGNETTE_FULL ("Vignette Full", FULL_FP, 101.f),
+        VIGNETTE_RELAXED ("Vignette Relaxed", RELAXED_FP, 116.f),
+        VIGNETTE_APPROXIMATE_FULL ("Vignette Approximate Full", FULL_FP, 85.1f),
+        VIGNETTE_APPROXIMATE_RELAXED ("Vignette Approximate Relaxed", RELAXED_FP, 96.7f),
+        GROUP_TEST_EMULATED ("Group Test (emulated)", INTRINSIC, 51.7f),
+        GROUP_TEST_NATIVE ("Group Test (native)", INTRINSIC, 52.9f),
+        CONVOLVE_3X3 ("Convolve 3x3", RELAXED_FP, 74.2f),
+        INTRINSICS_CONVOLVE_3X3 ("Intrinsics Convolve 3x3", INTRINSIC, 33.3f),
+        COLOR_MATRIX ("ColorMatrix", RELAXED_FP, 33.8f),
+        INTRINSICS_COLOR_MATRIX ("Intrinsics ColorMatrix", INTRINSIC, 21.3f),
+        INTRINSICS_COLOR_MATRIX_GREY ("Intrinsics ColorMatrix Grey", INTRINSIC, 21.4f),
+        COPY ("Copy", RELAXED_FP, 21.4f),
+        CROSS_PROCESS_USING_LUT ("CrossProcess (using LUT)", INTRINSIC, 23.1f),
+        CONVOLVE_5X5 ("Convolve 5x5", RELAXED_FP, 236.f),
+        INTRINSICS_CONVOLVE_5X5 ("Intrinsics Convolve 5x5", INTRINSIC, 39.6f),
+        MANDELBROT_FLOAT ("Mandelbrot (fp32)", FULL_FP, 117.f),
+        MANDELBROT_DOUBLE ("Mandelbrot (fp64)", FULL_FP, 136.f),
+        INTRINSICS_BLEND ("Intrinsics Blend", INTRINSIC, 105.f),
+        INTRINSICS_BLUR_25G ("Intrinsics Blur 25 uchar", INTRINSIC, 37.8f),
+        VIBRANCE ("Vibrance", RELAXED_FP, 103.f),
+        BW_FILTER ("BW Filter", RELAXED_FP, 86.f),
+        SHADOWS ("Shadows", RELAXED_FP, 130.f),
+        CONTRAST ("Contrast", RELAXED_FP, 45.4f),
+        EXPOSURE ("Exposure", RELAXED_FP, 73.4f),
+        WHITE_BALANCE ("White Balance", RELAXED_FP, 138.2f),
+        COLOR_CUBE ("Color Cube", RELAXED_FP, 83.9f),
+        COLOR_CUBE_3D_INTRINSIC ("Color Cube (3D LUT intrinsic)", INTRINSIC, 34.7f),
+        ARTISTIC1 ("Artistic 1", RELAXED_FP, 140.f),
+        RESIZE_BI_SCRIPT ("Resize BiCubic Script", RELAXED_FP, 253.f),
+        RESIZE_BI_INTRINSIC ("Resize BiCubic Intrinsic", INTRINSIC, 255.f),
+        POSTERIZE_INVOKE ("Posterize with invoke", RELAXED_FP, 215.f),
+        POSTERIZE_SET ("Posterize with set", INTRINSIC, 221.f);
+
+
+        private final String name;
+        public final int group;
+        public final float baseline;
+
+        private TestName(String s, int g, float base) {
+            name = s;
+            group = g;
+            baseline = base;
+        }
+        private TestName(String s, int g) {
+            name = s;
+            group = g;
+            baseline = 1.f;
+        }
+
+        // return the human-readable test name used for display (not quoted)
+        public String toString() {
+            return name;
+        }
+    }
+
+    static TestBase newTest(TestName testName) {
+        switch(testName) {
+        case LEVELS_VEC3_RELAXED:
+            return new LevelsV4(false, false);
+        case LEVELS_VEC4_RELAXED:
+            return new LevelsV4(false, true);
+        case LEVELS_VEC3_FULL:
+            return new LevelsV4(true, false);
+        case LEVELS_VEC4_FULL:
+            return new LevelsV4(true, true);
+        case BLUR_RADIUS_25:
+            return new Blur25(false);
+        case INTRINSIC_BLUR_RADIUS_25:
+            return new Blur25(true);
+        case GREYSCALE:
+            return new Greyscale();
+        case GRAIN:
+            return new Grain();
+        case FISHEYE_FULL:
+            return new Fisheye(false, false);
+        case FISHEYE_RELAXED:
+            return new Fisheye(false, true);
+        case FISHEYE_APPROXIMATE_FULL:
+            return new Fisheye(true, false);
+        case FISHEYE_APPROXIMATE_RELAXED:
+            return new Fisheye(true, true);
+        case VIGNETTE_FULL:
+            return new Vignette(false, false);
+        case VIGNETTE_RELAXED:
+            return new Vignette(false, true);
+        case VIGNETTE_APPROXIMATE_FULL:
+            return new Vignette(true, false);
+        case VIGNETTE_APPROXIMATE_RELAXED:
+            return new Vignette(true, true);
+        case GROUP_TEST_EMULATED:
+            return new GroupTest(false);
+        case GROUP_TEST_NATIVE:
+            return new GroupTest(true);
+        case CONVOLVE_3X3:
+            return new Convolve3x3(false);
+        case INTRINSICS_CONVOLVE_3X3:
+            return new Convolve3x3(true);
+        case COLOR_MATRIX:
+            return new ColorMatrix(false, false);
+        case INTRINSICS_COLOR_MATRIX:
+            return new ColorMatrix(true, false);
+        case INTRINSICS_COLOR_MATRIX_GREY:
+            return new ColorMatrix(true, true);
+        case COPY:
+            return new Copy();
+        case CROSS_PROCESS_USING_LUT:
+            return new CrossProcess();
+        case CONVOLVE_5X5:
+            return new Convolve5x5(false);
+        case INTRINSICS_CONVOLVE_5X5:
+            return new Convolve5x5(true);
+        case MANDELBROT_FLOAT:
+            return new Mandelbrot(false);
+        case MANDELBROT_DOUBLE:
+            return new Mandelbrot(true);
+        case INTRINSICS_BLEND:
+            return new Blend();
+        case INTRINSICS_BLUR_25G:
+            return new Blur25G();
+        case VIBRANCE:
+            return new Vibrance();
+        case BW_FILTER:
+            return new BWFilter();
+        case SHADOWS:
+            return new Shadows();
+        case CONTRAST:
+            return new Contrast();
+        case EXPOSURE:
+            return new Exposure();
+        case WHITE_BALANCE:
+            return new WhiteBalance();
+        case COLOR_CUBE:
+            return new ColorCube(false);
+        case COLOR_CUBE_3D_INTRINSIC:
+            return new ColorCube(true);
+        case ARTISTIC1:
+            return new Artistic1();
+        case RESIZE_BI_SCRIPT:
+            return new Resize(false);
+        case RESIZE_BI_INTRINSIC:
+            return new Resize(true);
+        case POSTERIZE_INVOKE:
+            return new Posterize(true);
+        case POSTERIZE_SET:
+            return new Posterize(false);
+        }
+        return null;
+    }
+}
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/ImageProcessingActivity2.java b/java/tests/ImageProcessing2/src/com/android/rs/image/ImageProcessingActivity2.java
index 4b0e2dd..0e133da 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/ImageProcessingActivity2.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/ImageProcessingActivity2.java
@@ -17,13 +17,15 @@
 package com.android.rs.image2;
 
 import android.app.Activity;
+
+import android.content.Intent;
 import android.os.Bundle;
 import android.graphics.BitmapFactory;
 import android.graphics.Bitmap;
 import android.graphics.Canvas;
 import android.support.v8.renderscript.*;
-import android.view.SurfaceView;
-import android.view.SurfaceHolder;
+import android.os.Handler;
+import android.os.Message;
 import android.widget.AdapterView;
 import android.widget.ArrayAdapter;
 import android.widget.ImageView;
@@ -31,89 +33,14 @@
 import android.widget.Spinner;
 import android.widget.TextView;
 import android.view.View;
-import android.util.Log;
-import java.lang.Math;
+import android.graphics.Point;
 
-import android.os.Environment;
-import android.app.Instrumentation;
-import android.content.Context;
-import android.content.Intent;
-import android.net.Uri;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
+import android.util.Log;
+
 
 public class ImageProcessingActivity2 extends Activity
                                        implements SeekBar.OnSeekBarChangeListener {
     private final String TAG = "Img";
-    public final String RESULT_FILE = "image_processing_result.csv";
-
-    RenderScript mRS;
-    Allocation mInPixelsAllocation;
-    Allocation mInPixelsAllocation2;
-    Allocation mOutPixelsAllocation;
-
-    /**
-     * Define enum type for test names
-     */
-    public enum TestName {
-        // totally there are 38 test cases
-        LEVELS_VEC3_RELAXED ("Levels Vec3 Relaxed"),
-        LEVELS_VEC4_RELAXED ("Levels Vec4 Relaxed"),
-        LEVELS_VEC3_FULL ("Levels Vec3 Full"),
-        LEVELS_VEC4_FULL ("Levels Vec4 Full"),
-        BLUR_RADIUS_25 ("Blur radius 25"),
-        INTRINSIC_BLUE_RADIUS_25 ("Intrinsic Blur radius 25"),
-        GREYSCALE ("Greyscale"),
-        GRAIN ("Grain"),
-        FISHEYE_FULL ("Fisheye Full"),
-        FISHEYE_RELAXED ("Fisheye Relaxed"),
-        FISHEYE_APPROXIMATE_FULL ("Fisheye Approximate Full"),
-        FISHEYE_APPROXIMATE_RELAXED ("Fisheye Approximate Relaxed"),
-        VIGNETTE_FULL ("Vignette Full"),
-        VIGNETTE_RELAXED ("Vignette Relaxed"),
-        VIGNETTE_APPROXIMATE_FULL ("Vignette Approximate Full"),
-        VIGNETTE_APPROXIMATE_RELAXED ("Vignette Approximate Relaxed"),
-        GROUP_TEST_EMULATED ("Group Test (emulated)"),
-        GROUP_TEST_NATIVE ("Group Test (native)"),
-        CONVOLVE_3X3 ("Convolve 3x3"),
-        INTRINSICS_CONVOLVE_3X3 ("Intrinsics Convolve 3x3"),
-        COLOR_MATRIX ("ColorMatrix"),
-        INTRINSICS_COLOR_MATRIX ("Intrinsics ColorMatrix"),
-        INTRINSICS_COLOR_MATRIX_GREY ("Intrinsics ColorMatrix Grey"),
-        COPY ("Copy"),
-        CROSS_PROCESS_USING_LUT ("CrossProcess (using LUT)"),
-        CONVOLVE_5X5 ("Convolve 5x5"),
-        INTRINSICS_CONVOLVE_5X5 ("Intrinsics Convolve 5x5"),
-        MANDELBROT ("Mandelbrot"),
-        INTRINSICS_BLEND ("Intrinsics Blend"),
-        INTRINSICS_BLUR_25G ("Intrinsics Blur 25 uchar"),
-        VIBRANCE ("Vibrance"),
-        BW_FILTER ("BW Filter"),
-        SHADOWS ("Shadows"),
-        CONTRAST ("Contrast"),
-        EXPOSURE ("Exposure"),
-        WHITE_BALANCE ("White Balance"),
-        COLOR_CUBE ("Color Cube"),
-        COLOR_CUBE_3D_INTRINSIC ("Color Cube (3D LUT intrinsic)");
-
-
-        private final String name;
-
-        private TestName(String s) {
-            name = s;
-        }
-
-        // return quoted string as displayed test name
-        public String toString() {
-            return name;
-        }
-    }
-
-    Bitmap mBitmapIn;
-    Bitmap mBitmapIn2;
-    Bitmap mBitmapOut;
 
     private Spinner mSpinner;
     private SeekBar mBar1;
@@ -121,47 +48,431 @@
     private SeekBar mBar3;
     private SeekBar mBar4;
     private SeekBar mBar5;
+
+    private int mBars[] = new int[5];
+    private int mBarsOld[] = new int[5];
+
     private TextView mText1;
     private TextView mText2;
     private TextView mText3;
     private TextView mText4;
     private TextView mText5;
-
-    private float mSaturation = 1.0f;
-
-    private TextView mBenchmarkResult;
-    private Spinner mTestSpinner;
-
-    private SurfaceView mSurfaceView;
     private ImageView mDisplayView;
 
+    private int mTestList[];
+    private float mTestResults[];
+
+    private boolean mToggleLong;
+    private boolean mTogglePause;
+    private boolean mToggleAnimate;
+    private boolean mToggleDisplay;
+    private int mBitmapWidth;
+    private int mBitmapHeight;
+    private boolean mDemoMode;
+
+    // Updates pending counts UI-requested updates that the worker thread has
+    // not yet picked up.  Processor.update() increments it (capped at 2) and
+    // wakes the worker.
+    //
+    // The benchmark loops do not count kernels here: they only reset it to
+    // zero and track kernels in flight with mShowsPending instead.
+    // In demo mode, each UI input increments the counter and it is zeroed
+    // when the latest settings are sent to RS for processing.
+    private int mUpdatesPending;
+
+    // Counts kernels in the RS pipeline, in both demo and benchmark mode.  It's
+    // incremented when work is submitted to RS and decremented when the RS
+    // message sent after that work arrives in MessageProcessor.run().
+    private int mShowsPending;
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+
+    // Message processor to handle notifications for when a kernel completes
+    private class MessageProcessor extends RenderScript.RSMessageHandler {
+        MessageProcessor() {
+        }
+
+        public void run() {
+            synchronized(mProcessor) {  // the monitor the Processor thread waits on
+                // Decrement the count of kernels in the pipeline and wake the
+                // Processor thread so it can enqueue more work if additional
+                // updates are blocked by a full pipeline.
+                if (mShowsPending > 0) {
+                    mShowsPending --;
+                    mProcessor.notifyAll();
+                    if (mToggleDisplay) {
+                        mProcessor.mHandler.sendMessage(Message.obtain());  // have mHandler show the result
+                    }
+                }
+            }
+        }
+    }
+
+
+    /////////////////////////////////////////////////////////////////////////
+    // Processor is a helper thread for running the work without
+    // blocking the UI thread.
+    class Processor extends Thread {
+        RenderScript mRS;
+        ScriptC_util mScriptUtils;
+        Allocation mInPixelsAllocation;
+        Allocation mInPixelsAllocation2;
+        Allocation mOutDisplayAllocation1;
+        Allocation mOutDisplayAllocation2;
+        Bitmap mBitmapOut1;
+        Bitmap mBitmapOut2;
+        int mActiveBitmap;  // 0 or 1: which output allocation/bitmap the next run targets
+
+        private float mLastResult;
+        private volatile boolean mRun = true;  // cleared by exit() from another thread; polled without a lock
+        private boolean mDoingBenchmark;
+        private TestBase mTest;  // test currently being run; replaced per benchmark entry
+
+        private boolean mBenchmarkMode;  // true: run the benchmark list once; false: demo update loop
+
+        // We don't want to call the "changed" methods excessively as this
+        // can cause extra work for drivers.  Before running a test, forward
+        // only the bars whose value differs from the one last applied.
+        void runTest() {
+            if (mBars[0] != mBarsOld[0]) {
+                mTest.onBar1Changed(mBars[0]);
+                mBarsOld[0] = mBars[0];  // remember the value just forwarded
+            }
+            if (mBars[1] != mBarsOld[1]) {
+                mTest.onBar2Changed(mBars[1]);
+                mBarsOld[1] = mBars[1];
+            }
+            if (mBars[2] != mBarsOld[2]) {
+                mTest.onBar3Changed(mBars[2]);
+                mBarsOld[2] = mBars[2];
+            }
+            if (mBars[3] != mBarsOld[3]) {
+                mTest.onBar4Changed(mBars[3]);
+                mBarsOld[3] = mBars[3];
+            }
+            if (mBars[4] != mBarsOld[4]) {
+                mTest.onBar5Changed(mBars[4]);
+                mBarsOld[4] = mBars[4];
+            }
+            mTest.runTest();  // run the current test with the up-to-date settings
+        }
+
+        Processor(RenderScript rs, boolean benchmarkMode) {
+            mRS = rs;
+            mRS.setMessageHandler(new MessageProcessor());
+            mScriptUtils = new ScriptC_util(mRS);
+            // Load both inputs and create two output bitmaps; only these three widths are handled.
+            switch(mBitmapWidth) {
+            case 1920:
+                mInPixelsAllocation = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img1920x1080a);
+                mInPixelsAllocation2 = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img1920x1080b);
+                mBitmapOut1 = Bitmap.createBitmap(1920, 1080, Bitmap.Config.ARGB_8888);
+                mBitmapOut2 = Bitmap.createBitmap(1920, 1080, Bitmap.Config.ARGB_8888);
+                break;
+            case 1280:
+                mInPixelsAllocation = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img1280x720a);
+                mInPixelsAllocation2 = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img1280x720b);
+                mBitmapOut1 = Bitmap.createBitmap(1280, 720, Bitmap.Config.ARGB_8888);
+                mBitmapOut2 = Bitmap.createBitmap(1280, 720, Bitmap.Config.ARGB_8888);
+                break;
+            case 800:
+                mInPixelsAllocation = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img800x450a);
+                mInPixelsAllocation2 = Allocation.createFromBitmapResource(
+                        mRS, getResources(), R.drawable.img800x450b);
+                mBitmapOut1 = Bitmap.createBitmap(800, 450, Bitmap.Config.ARGB_8888);
+                mBitmapOut2 = Bitmap.createBitmap(800, 450, Bitmap.Config.ARGB_8888);
+                break;
+            default: throw new IllegalArgumentException("Unsupported bitmap width " + mBitmapWidth);
+            }
+            mBitmapOut1.setHasAlpha(false);
+            mBitmapOut2.setHasAlpha(false);
+            mOutDisplayAllocation1 = Allocation.createFromBitmap(mRS, mBitmapOut1);
+            mOutDisplayAllocation2 = Allocation.createFromBitmap(mRS, mBitmapOut2);
+            mBenchmarkMode = benchmarkMode;
+            start();  // the worker thread begins running as soon as construction finishes
+        }
+
+        class Result {
+            float totalTime;  // accumulated wall-clock time of the loop, in seconds
+            int itterations;  // number of times the test was run
+        }
+
+        // Run one loop of kernels for at least the specified minimum time (in seconds).
+        // Returns the total elapsed time in seconds and the number of iterations run.
+        private Result runBenchmarkLoop(float minTime) {
+            mUpdatesPending = 0;
+            Result r = new Result();
+
+            long t = java.lang.System.currentTimeMillis();
+            do {
+                synchronized(this) {
+                    // Shows pending is used to track the number of kernels in the RS pipeline
+                    // We throttle it to 2.  This provides some buffering to allow a kernel to be started
+                    // before we are notified the previous finished.  However, larger numbers are uncommon
+                    // in interactive apps as they introduce 'lag' between user input and display.
+                    mShowsPending++;
+                    while (mShowsPending > 2 && mRun) {  // re-check: wait() may wake early; stop waiting on exit()
+                        try {
+                            this.wait();
+                        } catch(InterruptedException e) {
+                        }
+                    }
+                }
+
+                // If animations are enabled update the test state.
+                if (mToggleAnimate) {
+                    mTest.animateBars(r.totalTime);
+                }
+
+                // Run the kernel
+                if (mActiveBitmap == 0) {
+                    mTest.mOutPixelsAllocation = mOutDisplayAllocation1;
+                } else {
+                    mTest.mOutPixelsAllocation = mOutDisplayAllocation2;
+                }
+                mTest.runTest();
+                r.itterations ++;
+
+                if (mToggleDisplay) {
+                    if (mActiveBitmap == 0) {
+                        mOutDisplayAllocation1.copyTo(mBitmapOut1);
+                    } else {
+                        mOutDisplayAllocation2.copyTo(mBitmapOut2);
+                    }
+                }
+
+                // Send our RS message handler a message so we know when this work has completed
+                //mRS.sendMessage(mActiveBitmap, null);
+                mScriptUtils.invoke_utilSendMessage(mActiveBitmap);
+                mActiveBitmap ^= 1;  // alternate output buffers between iterations
+
+                long t2 = java.lang.System.currentTimeMillis();
+                r.totalTime += (t2 - t) / 1000.f;  // milliseconds -> seconds
+                t = t2;
+            } while (r.totalTime < minTime);
+
+            // Wait for any stray operations to complete and update the final time
+            mRS.finish();
+            long t2 = java.lang.System.currentTimeMillis();
+            r.totalTime += (t2 - t) / 1000.f;
+            t = t2;
+            return r;
+        }
+
+
+        // Get a benchmark result for a specific test
+        private float getBenchmark() {
+            mDoingBenchmark = true;
+            mUpdatesPending = 0;
+
+            long result = 0;
+            float runtime = 1.f;
+            if (mToggleLong) {
+                runtime = 10.f;
+            }
+
+            // We run a short bit of work before starting the actual test
+            // this is to let any power management do its job and respond
+            runBenchmarkLoop(0.3f);
+
+            // Run the actual benchmark
+            Result r = runBenchmarkLoop(runtime);
+
+            Log.v("rs", "Test: time=" + r.totalTime +"s,  frames=" + r.itterations +
+                  ", avg=" + r.totalTime / r.itterations * 1000.f);
+
+            mDoingBenchmark = false;
+            return r.totalTime / r.itterations * 1000.f;
+        }
+
+        private int mDisplayedBitmap;  // 0 or 1: which output bitmap the next message displays
+        private Handler mHandler = new Handler() {  // NOTE(review): presumably bound to the UI thread's Looper - confirm
+            @Override
+            public void handleMessage(Message msg) {
+                if (mDisplayedBitmap == 0) {
+                    mDisplayView.setImageBitmap(mBitmapOut1);
+                } else {
+                    mDisplayView.setImageBitmap(mBitmapOut2);
+                }
+                mDisplayedBitmap ^= 1;  // alternate between the two bitmaps
+                mDisplayView.invalidate();
+            }
+        };
+
+        public void run() {  // worker thread body: one benchmark pass, or the demo update loop
+            while (mRun) {
+                // Our loop for launching tests or benchmarks
+                synchronized(this) {
+                    // If we have no work to do, or we have displays pending, wait
+                    if ((mUpdatesPending == 0) || (mShowsPending != 0)) {  // single check: the demo branch below re-tests
+                        try {
+                            this.wait();
+                        } catch(InterruptedException e) {
+                        }
+                    }
+
+                    // We may have been asked to exit while waiting
+                    if (!mRun) return;
+                }
+
+                if (mBenchmarkMode) {
+                    // Loop over the tests we want to benchmark
+                    for (int ct=0; (ct < mTestList.length) && mRun; ct++) {
+
+                        // For reproducibility we wait a short time for any sporadic work
+                        // created by the user touching the screen to launch the test to pass.
+                        // Also allows for things to settle after the test changes.
+                        mRS.finish();
+                        try {
+                            sleep(250);
+                        } catch(InterruptedException e) {
+                        }
+
+                        // If we just ran a test, we destroy it here to relieve some memory pressure
+                        if (mTest != null) {
+                            mTest.destroy();
+                        }
+
+                        // Select the next test
+                        mTest = changeTest(mTestList[ct], false);
+
+                        // If the user selected the "long pause" option, wait
+                        if (mTogglePause) {
+                            for (int i=0; (i < 100) && mRun; i++) {  // up to 100 x 100ms, checking for exit between sleeps
+                                try {
+                                    sleep(100);
+                                } catch(InterruptedException e) {
+                                }
+                            }
+                        }
+
+                        // Run the test
+                        mTestResults[ct] = getBenchmark();
+                    }
+                    onBenchmarkFinish(mRun);  // reports ok only if exit() was not requested meanwhile
+                    return;  // a benchmark pass runs once, then the thread ends
+                } else {
+                    boolean update = false;
+                    synchronized(this) {
+                        // If we have updates to process and are not blocked by pending shows,
+                        // start the next kernel
+                        if ((mUpdatesPending > 0) && (mShowsPending == 0)) {
+                            mUpdatesPending = 0;
+                            update = true;
+                            mShowsPending++;
+                        }
+                    }
+
+                    if (update) {
+                        // Run the kernel
+                        if (mActiveBitmap == 0) {
+                            mTest.mOutPixelsAllocation = mOutDisplayAllocation1;
+                        } else {
+                            mTest.mOutPixelsAllocation = mOutDisplayAllocation2;
+                        }
+                        runTest();
+
+                        if (mToggleDisplay) {
+                            if (mActiveBitmap == 0) {
+                                mOutDisplayAllocation1.copyTo(mBitmapOut1);
+                            } else {
+                                mOutDisplayAllocation2.copyTo(mBitmapOut2);
+                            }
+                        }
+
+                        // Send our RS message handler a message so we know when this work has completed
+                        //mRS.sendMessage(mActiveBitmap, null);
+                        mScriptUtils.invoke_utilSendMessage(mActiveBitmap);
+                        mActiveBitmap ^= 1;  // alternate output buffers between updates
+                    }
+                }
+            }
+
+        }
+
+        public void update() {
+            // something UI related has changed, enqueue an update if one is not
+            // already pending.  Wake the worker if needed
+            synchronized(this) {
+                if (mUpdatesPending < 2) {  // cap the number of queued requests at 2
+                    mUpdatesPending++;
+                    notifyAll();
+                }
+            }
+        }
+
+        public void exit() {  // stop the worker thread, wait for it, then release the RS resources
+            mRun = false;
+
+            synchronized(this) {
+                notifyAll();  // wake the worker if it is blocked in wait()
+            }
+
+            try {
+                this.join();
+            } catch(InterruptedException e) {  // NOTE(review): if interrupted, the worker may still be running below
+            }
+
+            mInPixelsAllocation.destroy();
+            mInPixelsAllocation2.destroy();
+
+            if (mTest != null) {
+                mTest.destroy();
+                mTest = null;
+            }
+            mOutDisplayAllocation1.destroy();
+            mOutDisplayAllocation2.destroy();
+            mRS.destroy();
+
+            mInPixelsAllocation = null;
+            mInPixelsAllocation2 = null;
+            mOutDisplayAllocation1 = null;
+            mOutDisplayAllocation2 = null;
+            mRS = null;
+        }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////////////
+
     private boolean mDoingBenchmark;
+    public Processor mProcessor;
 
-    private TestBase mTest;
-    private int mRunCount;
+    TestBase changeTest(IPTestList.TestName t, boolean setupUI) {
+        TestBase tb = IPTestList.newTest(t);
 
-    public void updateDisplay() {
-            mTest.updateBitmap(mBitmapOut);
-            mDisplayView.invalidate();
+        tb.createBaseTest(this);
+        if (setupUI) {
+            setupBars(tb);
+        }
+        return tb;
+    }
+
+    TestBase changeTest(int id, boolean setupUI) {
+        IPTestList.TestName t = IPTestList.TestName.values()[id];
+        return changeTest(t, setupUI);
     }
 
     public void onProgressChanged(SeekBar seekBar, int progress, boolean fromUser) {
         if (fromUser) {
-
             if (seekBar == mBar1) {
-                mTest.onBar1Changed(progress);
+                mBars[0] = progress;
             } else if (seekBar == mBar2) {
-                mTest.onBar2Changed(progress);
+                mBars[1] = progress;
             } else if (seekBar == mBar3) {
-                mTest.onBar3Changed(progress);
+                mBars[2] = progress;
             } else if (seekBar == mBar4) {
-                mTest.onBar4Changed(progress);
+                mBars[3] = progress;
             } else if (seekBar == mBar5) {
-                mTest.onBar5Changed(progress);
+                mBars[4] = progress;
             }
-
-            mTest.runTest();
-            updateDisplay();
+            mProcessor.update();
         }
     }
 
@@ -171,191 +482,57 @@
     public void onStopTrackingTouch(SeekBar seekBar) {
     }
 
-    void setupBars() {
+    void setupBars(TestBase t) {
         mSpinner.setVisibility(View.VISIBLE);
-        mTest.onSpinner1Setup(mSpinner);
+        t.onSpinner1Setup(mSpinner);
 
         mBar1.setVisibility(View.VISIBLE);
         mText1.setVisibility(View.VISIBLE);
-        mTest.onBar1Setup(mBar1, mText1);
+        t.onBar1Setup(mBar1, mText1);
 
         mBar2.setVisibility(View.VISIBLE);
         mText2.setVisibility(View.VISIBLE);
-        mTest.onBar2Setup(mBar2, mText2);
+        t.onBar2Setup(mBar2, mText2);
 
         mBar3.setVisibility(View.VISIBLE);
         mText3.setVisibility(View.VISIBLE);
-        mTest.onBar3Setup(mBar3, mText3);
+        t.onBar3Setup(mBar3, mText3);
 
         mBar4.setVisibility(View.VISIBLE);
         mText4.setVisibility(View.VISIBLE);
-        mTest.onBar4Setup(mBar4, mText4);
+        t.onBar4Setup(mBar4, mText4);
 
         mBar5.setVisibility(View.VISIBLE);
         mText5.setVisibility(View.VISIBLE);
-        mTest.onBar5Setup(mBar5, mText5);
+        t.onBar5Setup(mBar5, mText5);
     }
 
+    void hideBars() {
+        mSpinner.setVisibility(View.INVISIBLE);
 
-    void changeTest(TestName testName) {
-        if (mTest != null) {
-            mTest.destroy();
-        }
-        switch(testName) {
-        case LEVELS_VEC3_RELAXED:
-            mTest = new LevelsV4(false, false);
-            break;
-        case LEVELS_VEC4_RELAXED:
-            mTest = new LevelsV4(false, true);
-            break;
-        case LEVELS_VEC3_FULL:
-            mTest = new LevelsV4(true, false);
-            break;
-        case LEVELS_VEC4_FULL:
-            mTest = new LevelsV4(true, true);
-            break;
-        case BLUR_RADIUS_25:
-            mTest = new Blur25(false);
-            break;
-        case INTRINSIC_BLUE_RADIUS_25:
-            mTest = new Blur25(true);
-            break;
-        case GREYSCALE:
-            mTest = new Greyscale();
-            break;
-        case GRAIN:
-            mTest = new Grain();
-            break;
-        case FISHEYE_FULL:
-            mTest = new Fisheye(false, false);
-            break;
-        case FISHEYE_RELAXED:
-            mTest = new Fisheye(false, true);
-            break;
-        case FISHEYE_APPROXIMATE_FULL:
-            mTest = new Fisheye(true, false);
-            break;
-        case FISHEYE_APPROXIMATE_RELAXED:
-            mTest = new Fisheye(true, true);
-            break;
-        case VIGNETTE_FULL:
-            mTest = new Vignette(false, false);
-            break;
-        case VIGNETTE_RELAXED:
-            mTest = new Vignette(false, true);
-            break;
-        case VIGNETTE_APPROXIMATE_FULL:
-            mTest = new Vignette(true, false);
-            break;
-        case VIGNETTE_APPROXIMATE_RELAXED:
-            mTest = new Vignette(true, true);
-            break;
-        case GROUP_TEST_EMULATED:
-            mTest = new GroupTest(false);
-            break;
-        case GROUP_TEST_NATIVE:
-            mTest = new GroupTest(true);
-            break;
-        case CONVOLVE_3X3:
-            mTest = new Convolve3x3(false);
-            break;
-        case INTRINSICS_CONVOLVE_3X3:
-            mTest = new Convolve3x3(true);
-            break;
-        case COLOR_MATRIX:
-            mTest = new ColorMatrix(false, false);
-            break;
-        case INTRINSICS_COLOR_MATRIX:
-            mTest = new ColorMatrix(true, false);
-            break;
-        case INTRINSICS_COLOR_MATRIX_GREY:
-            mTest = new ColorMatrix(true, true);
-            break;
-        case COPY:
-            mTest = new Copy();
-            break;
-        case CROSS_PROCESS_USING_LUT:
-            mTest = new CrossProcess();
-            break;
-        case CONVOLVE_5X5:
-            mTest = new Convolve5x5(false);
-            break;
-        case INTRINSICS_CONVOLVE_5X5:
-            mTest = new Convolve5x5(true);
-            break;
-        case MANDELBROT:
-            mTest = new Mandelbrot();
-            break;
-        case INTRINSICS_BLEND:
-            mTest = new Blend();
-            break;
-        case INTRINSICS_BLUR_25G:
-            mTest = new Blur25G();
-            break;
-        case VIBRANCE:
-            mTest = new Vibrance();
-            break;
-        case BW_FILTER:
-            mTest = new BWFilter();
-            break;
-        case SHADOWS:
-            mTest = new Shadows();
-            break;
-        case CONTRAST:
-            mTest = new Contrast();
-            break;
-        case EXPOSURE:
-            mTest = new Exposure();
-            break;
-        case WHITE_BALANCE:
-            mTest = new WhiteBalance();
-            break;
-        case COLOR_CUBE:
-            mTest = new ColorCube(false);
-            break;
-        case COLOR_CUBE_3D_INTRINSIC:
-            mTest = new ColorCube(true);
-            break;
-        }
+        mBar1.setVisibility(View.INVISIBLE);
+        mText1.setVisibility(View.INVISIBLE);
 
-        mTest.createBaseTest(this, mBitmapIn, mBitmapIn2, mBitmapOut);
-        setupBars();
+        mBar2.setVisibility(View.INVISIBLE);
+        mText2.setVisibility(View.INVISIBLE);
 
-        mTest.runTest();
-        updateDisplay();
-        mBenchmarkResult.setText("Result: not run");
+        mBar3.setVisibility(View.INVISIBLE);
+        mText3.setVisibility(View.INVISIBLE);
+
+        mBar4.setVisibility(View.INVISIBLE);
+        mText4.setVisibility(View.INVISIBLE);
+
+        mBar5.setVisibility(View.INVISIBLE);
+        mText5.setVisibility(View.INVISIBLE);
     }
 
-    void setupTests() {
-        mTestSpinner.setAdapter(new ArrayAdapter<TestName>(
-            this, R.layout.spinner_layout, TestName.values()));
-    }
-
-    private AdapterView.OnItemSelectedListener mTestSpinnerListener =
-            new AdapterView.OnItemSelectedListener() {
-                public void onItemSelected(AdapterView<?> parent, View view, int pos, long id) {
-                    changeTest(TestName.values()[pos]);
-                }
-
-                public void onNothingSelected(AdapterView parent) {
-
-                }
-            };
-
     @Override
     protected void onCreate(Bundle savedInstanceState) {
         super.onCreate(savedInstanceState);
         setContentView(R.layout.main);
 
-        mBitmapIn = loadBitmap(R.drawable.img1600x1067);
-        mBitmapIn2 = loadBitmap(R.drawable.img1600x1067b);
-        mBitmapOut = Bitmap.createBitmap(mBitmapIn.getWidth(), mBitmapIn.getHeight(),
-                                         mBitmapIn.getConfig());
-
-        mSurfaceView = (SurfaceView) findViewById(R.id.surface);
-
         mDisplayView = (ImageView) findViewById(R.id.display);
-        mDisplayView.setImageBitmap(mBitmapOut);
+        //mDisplayView.setImageBitmap(mBitmapOut);
 
         mSpinner = (Spinner) findViewById(R.id.spinner1);
 
@@ -376,95 +553,83 @@
         mText3 = (TextView) findViewById(R.id.slider3Text);
         mText4 = (TextView) findViewById(R.id.slider4Text);
         mText5 = (TextView) findViewById(R.id.slider5Text);
-
-        mTestSpinner = (Spinner) findViewById(R.id.filterselection);
-        mTestSpinner.setOnItemSelectedListener(mTestSpinnerListener);
-
-        mBenchmarkResult = (TextView) findViewById(R.id.benchmarkText);
-        mBenchmarkResult.setText("Result: not run");
-
-
-        mRS = RenderScript.create(this);
-        mInPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapIn);
-        mInPixelsAllocation2 = Allocation.createFromBitmap(mRS, mBitmapIn2);
-        mOutPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
-
-
-        setupTests();
-        changeTest(TestName.LEVELS_VEC3_RELAXED);
     }
 
-
-    private Bitmap loadBitmap(int resource) {
-        final BitmapFactory.Options options = new BitmapFactory.Options();
-        options.inPreferredConfig = Bitmap.Config.ARGB_8888;
-        return BitmapFactory.decodeResource(getResources(), resource, options);
+    @Override
+    protected void onPause() {
+        super.onPause();
+        mProcessor.exit();
     }
 
-    // button hook
-    public void benchmark(View v) {
-        float t = getBenchmark();
-        //long javaTime = javaFilter();
-        //mBenchmarkResult.setText("RS: " + t + " ms  Java: " + javaTime + " ms");
-        mBenchmarkResult.setText("Result: " + t + " ms");
-        Log.v(TAG, "getBenchmark: Renderscript frame time core ms " + t);
-    }
-
-    public void benchmark_all(View v) {
-        // write result into a file
-        File externalStorage = Environment.getExternalStorageDirectory();
-        if (!externalStorage.canWrite()) {
-            Log.v(TAG, "sdcard is not writable");
-            return;
+    public void onBenchmarkFinish(boolean ok) {
+        if (ok) {
+            Intent intent = new Intent();
+            intent.putExtra("tests", mTestList);
+            intent.putExtra("results", mTestResults);
+            setResult(RESULT_OK, intent);
+        } else {
+            setResult(RESULT_CANCELED);
         }
-        File resultFile = new File(externalStorage, RESULT_FILE);
-        //resultFile.setWritable(true, false);
-        try {
-            BufferedWriter rsWriter = new BufferedWriter(new FileWriter(resultFile));
-            Log.v(TAG, "Saved results in: " + resultFile.getAbsolutePath());
-            for (TestName tn: TestName.values()) {
-                changeTest(tn);
-                float t = getBenchmark();
-                String s = new String("" + tn.toString() + ", " + t);
-                rsWriter.write(s + "\n");
-                Log.v(TAG, "Test " + s + "ms\n");
+        finish();
+    }
+
+
+    void startProcessor() {
+        if (!mDemoMode) {
+            hideBars();
+        }
+
+        Point size = new Point();
+        getWindowManager().getDefaultDisplay().getSize(size);
+
+        int mScreenWidth = size.x;
+        int mScreenHeight = size.y;
+
+        int tw = mBitmapWidth;
+        int th = mBitmapHeight;
+
+        if (tw > mScreenWidth || th > mScreenHeight) {
+            float s1 = (float)tw / (float)mScreenWidth;
+            float s2 = (float)th / (float)mScreenHeight;
+
+            if (s1 > s2) {
+                tw /= s1;
+                th /= s1;
+            } else {
+                tw /= s2;
+                th /= s2;
             }
-            rsWriter.close();
-        } catch (IOException e) {
-            Log.v(TAG, "Unable to write result file " + e.getMessage());
         }
-        changeTest(TestName.LEVELS_VEC3_RELAXED);
+
+        android.util.Log.v("rs", "TV sizes " + tw + ", " + th);
+
+        mProcessor = new Processor(RenderScript.create(this), !mDemoMode);
+        if (mDemoMode) {
+            mProcessor.mTest = changeTest(mTestList[0], true);
+        }
+        mProcessor.update();
     }
 
-    // For benchmark test
-    public float getBenchmark() {
-        mDoingBenchmark = true;
+    @Override
+    protected void onResume() {
+        super.onResume();
+        Intent i = getIntent();
+        mTestList = i.getIntArrayExtra("tests");
+        mToggleLong = i.getBooleanExtra("enable long", false);
+        mTogglePause = i.getBooleanExtra("enable pause", false);
+        mToggleAnimate = i.getBooleanExtra("enable animate", false);
+        mToggleDisplay = i.getBooleanExtra("enable display", false);
+        mBitmapWidth = i.getIntExtra("resolution X", 0);
+        mBitmapHeight = i.getIntExtra("resolution Y", 0);
+        mDemoMode = i.getBooleanExtra("demo", false);
 
-        mTest.setupBenchmark();
-        long result = 0;
+        mTestResults = new float[mTestList.length];
 
-        //Log.v(TAG, "Warming");
-        long t = java.lang.System.currentTimeMillis() + 250;
-        do {
-            mTest.runTest();
-            mTest.finish();
-        } while (t > java.lang.System.currentTimeMillis());
-
-        //Log.v(TAG, "Benchmarking");
-        int ct = 0;
-        t = java.lang.System.currentTimeMillis();
-        do {
-            mTest.runTest();
-            mTest.finish();
-            ct++;
-        } while ((t+1000) > java.lang.System.currentTimeMillis());
-        t = java.lang.System.currentTimeMillis() - t;
-        float ft = (float)t;
-        ft /= ct;
-
-        mTest.exitBenchmark();
-        mDoingBenchmark = false;
-
-        return ft;
+        startProcessor();
     }
+
+    @Override
+    protected void onDestroy() {
+        super.onDestroy();  // no extra teardown here; the Processor is shut down in onPause()
+    }
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/LevelsV4.java b/java/tests/ImageProcessing2/src/com/android/rs/image/LevelsV4.java
index 199c504..1f2dd60 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/LevelsV4.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/LevelsV4.java
@@ -103,7 +103,7 @@
     public boolean onBar4Setup(SeekBar b, TextView t) {
         b.setMax(128);
         b.setProgress(128);
-        t.setText("Out White");
+        t.setText("In White");
         return true;
     }
     public boolean onBar5Setup(SeekBar b, TextView t) {
@@ -134,6 +134,12 @@
         setLevels();
     }
 
+    public void animateBars(float time) {  // Drives the saturation setting from the animation clock.
+        mSaturation = time % 2.f;  // wraps into [0, 2) for non-negative time
+        setSaturation();  // re-applies the levels with the new saturation
+    }
+
+
     public void createTest(android.content.res.Resources res) {
         mScriptR = new ScriptC_levels_relaxed(mRS);
         mScriptF = new ScriptC_levels_full(mRS);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Mandelbrot.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Mandelbrot.java
index f957df4..e2c23a1 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Mandelbrot.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Mandelbrot.java
@@ -24,6 +24,11 @@
 
 public class Mandelbrot extends TestBase {
     private ScriptC_mandelbrot mScript;
+    private boolean mUseDouble = false;
+
+    public Mandelbrot(boolean useDouble) {  // useDouble: run the rootD kernel instead of root in runTest()
+        mUseDouble = useDouble;
+    }
 
     public boolean onBar1Setup(SeekBar b, TextView t) {
         t.setText("Iterations");
@@ -84,7 +89,11 @@
     }
 
     public void runTest() {
-        mScript.forEach_root(mOutPixelsAllocation);
+        if (mUseDouble) {  // kernel choice is fixed at construction time
+            mScript.forEach_rootD(mOutPixelsAllocation);  // rootD kernel variant
+        } else {
+            mScript.forEach_root(mOutPixelsAllocation);  // original root kernel
+        }
         mRS.finish();
     }
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Posterize.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Posterize.java
new file mode 100644
index 0000000..fb1867e
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Posterize.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.support.v8.renderscript.*;
+
+/**
+ * Posterize test: runs the posterize script's root kernel five times, each
+ * pass with a different intensity band and color.
+ */
+public class Posterize extends TestBase {
+    /** One {intensHigh, intensLow} pair per pass, in execution order. */
+    private static final float[][] BANDS = {
+        {.2f, 0.f}, {.4f, 0.2f}, {.6f, 0.4f}, {.8f, 0.6f}, {1.0f, 0.8f},
+    };
+    /** One {r, g, b} triple per pass, index-aligned with BANDS. */
+    private static final int[][] COLORS = {
+        {255, 0, 0}, {0, 255, 0}, {0, 0, 255}, {255, 255, 0}, {0, 255, 255},
+    };
+
+    private ScriptC_posterize mScript;
+    boolean mUseInvokes;
+
+    /** @param useInvoke send parameters through invoke_setParams() instead of the individual setters */
+    Posterize(boolean useInvoke) {
+        mUseInvokes = useInvoke;
+    }
+
+    /** Instantiates the posterize script on mRS. */
+    public void createTest(android.content.res.Resources res) {
+        mScript = new ScriptC_posterize(mRS);
+    }
+
+    /** Pushes one pass's parameters to the script, by invoke or by setters. */
+    void setParams(float intensHigh, float intensLow, int r, int g, int b) {
+        final short red = (short) r;
+        final short green = (short) g;
+        final short blue = (short) b;
+        if (!mUseInvokes) {
+            mScript.set_intensityLow(intensLow);
+            mScript.set_intensityHigh(intensHigh);
+            mScript.set_color(new Short4(red, green, blue, (short) 255));
+            return;
+        }
+        mScript.invoke_setParams(intensHigh, intensLow, red, green, blue);
+    }
+
+    /**
+     * Runs all passes: the first reads the input allocation, every later one
+     * reads and writes the output allocation in place.
+     */
+    public void runTest() {
+        mScript.set_inputImage(mInPixelsAllocation);
+
+        Allocation source = mInPixelsAllocation;
+        for (int pass = 0; pass < BANDS.length; pass++) {
+            final float[] band = BANDS[pass];
+            final int[] rgb = COLORS[pass];
+            setParams(band[0], band[1], rgb[0], rgb[1], rgb[2]);
+            mScript.forEach_root(source, mOutPixelsAllocation);
+            source = mOutPixelsAllocation;
+        }
+    }
+
+}
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Resize.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Resize.java
new file mode 100644
index 0000000..4353324
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Resize.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image2;
+
+import android.support.v8.renderscript.*;
+
+
+/**
+ * Resize test: createTest() fills a scratch allocation at 1/32 of the input
+ * dimensions with the script's nearest kernel, and runTest() launches the
+ * bicubic kernel over the output allocation with that scratch buffer as gIn.
+ */
+public class Resize extends TestBase {
+    private ScriptC_resize mScript;
+
+    private Allocation mScratchAllocation;
+    private int mWidth;
+    private int mHeight;
+
+    /**
+     * @param useIntrinsic ignored by this implementation.
+     */
+    public Resize(boolean useIntrinsic) {
+    }
+
+    /**
+     * Creates the downscaled scratch allocation, fills it once, and leaves
+     * the script globals pointing at it for runTest().
+     */
+    public void createTest(android.content.res.Resources res) {
+        mWidth = mInPixelsAllocation.getType().getX();
+        mHeight = mInPixelsAllocation.getType().getY();
+        float scale = 1.f / 32.f;
+
+        // Type.Builder rejects dimensions below 1, so clamp: an input under
+        // 32 pixels on a side would otherwise truncate to a 0-sized buffer.
+        Type.Builder tb = new Type.Builder(mRS, mInPixelsAllocation.getElement());
+        tb.setX(Math.max(1, (int)(mWidth * scale)));
+        tb.setY(Math.max(1, (int)(mHeight * scale)));
+        Type t = tb.create();
+        mScratchAllocation = Allocation.createTyped(mRS, t);
+
+        // make small buffer
+        mScript = new ScriptC_resize(mRS);
+        mScript.set_gIn(mInPixelsAllocation);
+        mScript.set_gWidthIn(mWidth);
+        mScript.set_gHeightIn(mHeight);
+        mScript.set_scale(1.f / scale);
+        mScript.forEach_nearest(mScratchAllocation);
+
+        // setup normal ops
+        mScript.set_gIn(mScratchAllocation);
+        mScript.set_gWidthIn(t.getX());
+        mScript.set_gHeightIn(t.getY());
+        mScript.set_scale(scale);
+    }
+
+    /** Launches the bicubic kernel over the output allocation. */
+    public void runTest() {
+        mScript.forEach_bicubic(mOutPixelsAllocation);
+    }
+
+}
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/TestBase.java b/java/tests/ImageProcessing2/src/com/android/rs/image/TestBase.java
index eeabc73..240239d 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/TestBase.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/TestBase.java
@@ -18,19 +18,12 @@
 
 import android.app.Activity;
 import android.content.Context;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.graphics.Canvas;
 import android.support.v8.renderscript.*;
-import android.view.SurfaceView;
-import android.view.SurfaceHolder;
 import android.widget.ImageView;
 import android.widget.SeekBar;
 import android.widget.TextView;
 import android.view.View;
 import android.util.Log;
-import java.lang.Math;
 import android.widget.Spinner;
 
 public class TestBase  {
@@ -40,7 +33,6 @@
     protected Allocation mInPixelsAllocation;
     protected Allocation mInPixelsAllocation2;
     protected Allocation mOutPixelsAllocation;
-
     protected ImageProcessingActivity2 act;
 
     // Override to use UI elements
@@ -83,18 +75,21 @@
         return false;
     }
 
+    public void animateBars(float time) {  // No-op default; a test overrides this to animate its parameters.
+    }
+
     public boolean onSpinner1Setup(Spinner s) {
         s.setVisibility(View.INVISIBLE);
         return false;
     }
 
-    public final void createBaseTest(ImageProcessingActivity2 ipact, Bitmap b, Bitmap b2, Bitmap outb) {
+    public final void createBaseTest(ImageProcessingActivity2 ipact) {  // Binds this test to the activity's processor state, then calls createTest().
         act = ipact;
-        mRS = ipact.mRS;
+        mRS = ipact.mProcessor.mRS;  // the RenderScript context is now taken from the processor
 
-        mInPixelsAllocation = ipact.mInPixelsAllocation;
-        mInPixelsAllocation2 = ipact.mInPixelsAllocation2;
-        mOutPixelsAllocation = ipact.mOutPixelsAllocation;
+        mInPixelsAllocation = ipact.mProcessor.mInPixelsAllocation;
+        mInPixelsAllocation2 = ipact.mProcessor.mInPixelsAllocation2;
+        mOutPixelsAllocation = ipact.mProcessor.mOutDisplayAllocation1;  // output aliases the processor's mOutDisplayAllocation1
 
         createTest(act.getResources());
     }
@@ -107,22 +102,6 @@
     public void runTest() {
     }
 
-    public void finish() {
-        mRS.finish();
-    }
-
     public void destroy() {
     }
-
-    public void updateBitmap(Bitmap b) {
-        mOutPixelsAllocation.copyTo(b);
-    }
-
-    // Override to configure specific benchmark config.
-    public void setupBenchmark() {
-    }
-
-    // Override to reset after benchmark.
-    public void exitBenchmark() {
-    }
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
index e24c548..4f00304 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
@@ -68,6 +68,7 @@
         return true;
     }
 
+
     public void onBar1Changed(int progress) {
         scale = progress / 50.0f;
         do_init();
@@ -89,6 +90,11 @@
         do_init();
     }
 
+    public void animateBars(float time) {  // Drives the vignette scale from the animation clock.
+        scale = time % 2.f;  // wraps into [0, 2) for non-negative time
+        do_init();  // pushes the new scale to the active script
+    }
+
     private void do_init() {
         if (approx) {
             if (relaxed)
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/WhiteBalance.java b/java/tests/ImageProcessing2/src/com/android/rs/image/WhiteBalance.java
index 658e3b1..50aa5a2 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/WhiteBalance.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/WhiteBalance.java
@@ -18,7 +18,6 @@
 
 import java.lang.Math;
 
-import android.support.v8.renderscript.*;
 
 public class WhiteBalance extends TestBase {
     private ScriptC_wbalance mScript;
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/artistic1.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/artistic1.rs
new file mode 100644
index 0000000..2869e16
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/artistic1.rs
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+rs_allocation gBlur;
+
+static float gOverWm1;
+static float gOverHm1;
+static uchar gLutR[256];
+static uchar gLutG[256];
+static uchar gLutB[256];
+
+void setup() {  // Caches reciprocal gBlur dimensions and resets the three LUTs to the identity mapping.
+    int w = rsAllocationGetDimX(gBlur);
+    int h = rsAllocationGetDimY(gBlur);
+    gOverWm1 = 1.f / w;  // NOTE(review): the name suggests 1/(w-1), but the value is 1/w
+    gOverHm1 = 1.f / h;  // NOTE(review): likewise 1/h, not 1/(h-1)
+
+    for (int x=0; x < 256; x++) {
+        gLutR[x] = x;//255-x;
+        gLutG[x] = x;//255-x;
+        gLutB[x] = x;//255-x;
+    }
+}
+
+uchar4 RS_KERNEL process(uchar4 in, uint32_t x, uint32_t y) {  // Blends `in` with gBlur by squared distance from centre, darkens by pdist, then applies the LUTs.
+    float2 xyDist;
+    xyDist.x = (x * gOverWm1 - 0.5f);  // x scaled by the cached 1/w, shifted by -0.5
+    xyDist.y = (y * gOverHm1 - 0.5f);  // y scaled by the cached 1/h, shifted by -0.5
+
+    // color
+    float4 v1 = rsUnpackColor8888(in);
+    float4 v2 = rsUnpackColor8888(rsGetElementAt_uchar4(gBlur, x, y));  // same (x, y) sampled from gBlur
+
+    float dist = dot(xyDist, xyDist) * 1.4f;  // squared distance, scaled
+    float pdist = native_powr(dist, 2.7f * 0.5f);
+    //float pdist = powr(dist, 2.7f * 0.5f);
+
+    pdist = clamp(pdist, 0.f, 1.f);
+    v1 = mix(v1, v2, dist * 2.f);  // the mix weight itself is not clamped; only pdist is
+    v1 *= 1.f - pdist;  // scales all four channels, alpha included
+
+    // apply curve
+    uchar4 out = rsPackColorTo8888(v1);
+
+    out.r = gLutR[out.r];
+    out.g = gLutG[out.g];
+    out.b = gLutB[out.b];
+    return out;
+}
+
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/blend.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/blend.rs
index 9ec1246..96dfc96 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/blend.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/blend.rs
@@ -13,11 +13,14 @@
 // limitations under the License.
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 uchar alpha = 0x0;
 
-void setImageAlpha(uchar4 *v_out, uint32_t x, uint32_t y) {
-  v_out->rgba = convert_uchar4((convert_uint4(v_out->rgba) * alpha) >> (uint4)8);
-  v_out->a = alpha;
+uchar4 RS_KERNEL setImageAlpha(uchar4 in, uint32_t x, uint32_t y) {  // Scales the pixel by the global `alpha` and stores `alpha` as its alpha channel.
+    uchar4 out;
+    out.rgba = convert_uchar4((convert_uint4(in.rgba) * alpha) >> (uint4)8);  // each channel becomes (channel * alpha) >> 8
+    out.a = alpha;  // overrides the scaled alpha computed on the previous line
+    return out;
 }
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/bwfilter.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/bwfilter.rs
index e706d44..e211620 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/bwfilter.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/bwfilter.rs
@@ -15,7 +15,7 @@
  */
 
 #include "ip.rsh"
-//#pragma rs_fp_relaxed
+#pragma rs_fp_relaxed
 
 static float sr = 0.f;
 static float sg = 0.f;
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/colorcube.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/colorcube.rs
index c0d6ace..dbdd2f6 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/colorcube.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/colorcube.rs
@@ -32,15 +32,10 @@
 
     float4 m = (float4)(1.f / 255.f) * convert_float4(gDims - 1);
     gCoordMul = convert_int4(m * (float4)0x10000);
-
-    rsDebug("dims", gDims);
-    rsDebug("gCoordMul", gCoordMul);
 }
 
-void root(const uchar4 *in, uchar4 *out, uint32_t x, uint32_t y) {
-    //rsDebug("root", in);
-
-    int4 baseCoord = convert_int4(*in) * gCoordMul;
+uchar4 RS_KERNEL root(uchar4 in) {
+    int4 baseCoord = convert_int4(in) * gCoordMul;
     int4 coord1 = baseCoord >> (int4)16;
     int4 coord2 = min(coord1 + 1, gDims - 1);
 
@@ -67,23 +62,8 @@
     uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (uint4)16;
     uint4 v2 = (v + 0x7f) >> (uint4)8;
 
-    *out = convert_uchar4(v2);
-    out->a = 0xff;
-
-    #if 0
-    if (in->r != out->r) {
-        rsDebug("dr", in->r - out->r);
-        //rsDebug("in", convert_int4(*in));
-        //rsDebug("coord1", coord1);
-        //rsDebug("coord2", coord2);
-        //rsDebug("weight1", weight1);
-        //rsDebug("weight2", weight2);
-        //rsDebug("yz00", yz00);
-        //rsDebug("z0", z0);
-        //rsDebug("v", v);
-        //rsDebug("v2", v2);
-        //rsDebug("out", convert_int4(*out));
-    }
-    #endif
+    uchar4 o = convert_uchar4(v2);
+    o.a = 0xff;
+    return o;
 }
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.rs
similarity index 97%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.rs
index a1bcaf8..d13ac64 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/colormatrix.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 static rs_matrix4x4 Mat;
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/contrast.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/contrast.rs
index d3743d3..71cb8c6 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/contrast.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/contrast.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 static float brightM = 0.f;
 static float brightC = 0.f;
@@ -24,14 +25,10 @@
     brightC = 127.f - brightM * 127.f;
 }
 
-void contrast(const uchar4 *in, uchar4 *out)
-{
-#if 0
-    out->r = rsClamp((int)(brightM * in->r + brightC), 0, 255);
-    out->g = rsClamp((int)(brightM * in->g + brightC), 0, 255);
-    out->b = rsClamp((int)(brightM * in->b + brightC), 0, 255);
-#else
-    float3 v = convert_float3(in->rgb) * brightM + brightC;
-    out->rgb = convert_uchar3(clamp(v, 0.f, 255.f));
-#endif
+uchar4 RS_KERNEL contrast(uchar4 in) {  // Per pixel: rgb * brightM + brightC, clamped to [0, 255].
+    float3 v = convert_float3(in.rgb) * brightM + brightC;
+    uchar4 o;
+    o.rgb = convert_uchar3(clamp(v, 0.f, 255.f));
+    o.a = 0xff;  // alpha is forced opaque rather than copied from `in`
+    return o;
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/convolve3x3.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/convolve3x3.rs
new file mode 100644
index 0000000..3d811ce
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/convolve3x3.rs
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidth;
+int32_t gHeight;
+rs_allocation gIn;
+
+float gCoeffs[9];
+
+uchar4 RS_KERNEL root(uint32_t x, uint32_t y) {  // 3x3 weighted sum of gIn around (x, y), clamped to [0, 255] per channel.
+    uint32_t x1 = min((int32_t)x+1, gWidth-1);  // column x+1, clamped at the right edge
+    uint32_t x2 = max((int32_t)x-1, 0);  // column x-1, clamped at the left edge
+    uint32_t y1 = min((int32_t)y+1, gHeight-1);  // row y+1, clamped at the bottom edge
+    uint32_t y2 = max((int32_t)y-1, 0);  // row y-1, clamped at the top edge
+
+    float4 sum = convert_float4(rsGetElementAt_uchar4(gIn, x1, y1)) * gCoeffs[0];  // gCoeffs[0] pairs with (x+1, y+1): coefficients run right-to-left, bottom-to-top
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x, y1)) * gCoeffs[1];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x2, y1)) * gCoeffs[2];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x1, y)) * gCoeffs[3];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x, y)) * gCoeffs[4];  // centre tap
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x2, y)) * gCoeffs[5];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x1, y2)) * gCoeffs[6];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x, y2)) * gCoeffs[7];
+    sum += convert_float4(rsGetElementAt_uchar4(gIn, x2, y2)) * gCoeffs[8];  // gCoeffs[8] pairs with (x-1, y-1)
+
+    sum = clamp(sum, 0.f, 255.f);
+    return convert_uchar4(sum);
+}
+
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.fs
deleted file mode 100644
index 3849bd1..0000000
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.fs
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ip.rsh"
-
-int32_t gWidth;
-int32_t gHeight;
-rs_allocation gIn;
-
-float gCoeffs[25];
-
-uchar4 RS_KERNEL root(uint32_t x, uint32_t y) {
-    uint32_t x0 = max((int32_t)x-2, 0);
-    uint32_t x1 = max((int32_t)x-1, 0);
-    uint32_t x2 = x;
-    uint32_t x3 = min((int32_t)x+1, gWidth-1);
-    uint32_t x4 = min((int32_t)x+2, gWidth-1);
-
-    uint32_t y0 = max((int32_t)y-2, 0);
-    uint32_t y1 = max((int32_t)y-1, 0);
-    uint32_t y2 = y;
-    uint32_t y3 = min((int32_t)y+1, gHeight-1);
-    uint32_t y4 = min((int32_t)y+2, gHeight-1);
-
-    float4 p0 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y0)) * gCoeffs[0]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y0)) * gCoeffs[1]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y0)) * gCoeffs[2]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y0)) * gCoeffs[3]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y0)) * gCoeffs[4];
-
-    float4 p1 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y1)) * gCoeffs[5]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y1)) * gCoeffs[6]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y1)) * gCoeffs[7]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y1)) * gCoeffs[8]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y1)) * gCoeffs[9];
-
-    float4 p2 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y2)) * gCoeffs[10]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y2)) * gCoeffs[11]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y2)) * gCoeffs[12]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y2)) * gCoeffs[13]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y2)) * gCoeffs[14];
-
-    float4 p3 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y3)) * gCoeffs[15]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y3)) * gCoeffs[16]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y3)) * gCoeffs[17]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y3)) * gCoeffs[18]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y3)) * gCoeffs[19];
-
-    float4 p4 = convert_float4(rsGetElementAt_uchar4(gIn, x0, y4)) * gCoeffs[20]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x1, y4)) * gCoeffs[21]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x2, y4)) * gCoeffs[22]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x3, y4)) * gCoeffs[23]
-              + convert_float4(rsGetElementAt_uchar4(gIn, x4, y4)) * gCoeffs[24];
-
-    p0 = clamp(p0 + p1 + p2 + p3 + p4, 0.f, 255.f);
-    return convert_uchar4(p0);
-}
-
-
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.rs
new file mode 100644
index 0000000..a9ddde2
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/convolve5x5.rs
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidth;
+int32_t gHeight;
+rs_allocation gIn;
+
+float gCoeffs[25];
+
+uchar4 RS_KERNEL root(uint32_t x, uint32_t y) {  // 5x5 weighted sum of gIn around (x, y), clamped to [0, 255] per channel.
+    uint32_t x0 = max((int32_t)x-2, 0);  // x0..x4 are columns x-2..x+2, edge-clamped
+    uint32_t x1 = max((int32_t)x-1, 0);
+    uint32_t x2 = x;
+    uint32_t x3 = min((int32_t)x+1, gWidth-1);
+    uint32_t x4 = min((int32_t)x+2, gWidth-1);
+
+    uint32_t y0 = max((int32_t)y-2, 0);  // y0..y4 are rows y-2..y+2, edge-clamped
+    uint32_t y1 = max((int32_t)y-1, 0);
+    uint32_t y2 = y;
+    uint32_t y3 = min((int32_t)y+1, gHeight-1);
+    uint32_t y4 = min((int32_t)y+2, gHeight-1);
+
+    float4 sum = convert_float4(rsGetElementAt_uchar4(gIn, x0, y0)) * gCoeffs[0]  // gCoeffs is indexed 5 * row + column, starting at (x-2, y-2)
+               + convert_float4(rsGetElementAt_uchar4(gIn, x1, y0)) * gCoeffs[1]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x2, y0)) * gCoeffs[2]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x3, y0)) * gCoeffs[3]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x4, y0)) * gCoeffs[4]
+
+               + convert_float4(rsGetElementAt_uchar4(gIn, x0, y1)) * gCoeffs[5]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x1, y1)) * gCoeffs[6]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x2, y1)) * gCoeffs[7]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x3, y1)) * gCoeffs[8]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x4, y1)) * gCoeffs[9]
+
+               + convert_float4(rsGetElementAt_uchar4(gIn, x0, y2)) * gCoeffs[10]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x1, y2)) * gCoeffs[11]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x2, y2)) * gCoeffs[12]  // centre tap
+               + convert_float4(rsGetElementAt_uchar4(gIn, x3, y2)) * gCoeffs[13]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x4, y2)) * gCoeffs[14]
+
+               + convert_float4(rsGetElementAt_uchar4(gIn, x0, y3)) * gCoeffs[15]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x1, y3)) * gCoeffs[16]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x2, y3)) * gCoeffs[17]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x3, y3)) * gCoeffs[18]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x4, y3)) * gCoeffs[19]
+
+               + convert_float4(rsGetElementAt_uchar4(gIn, x0, y4)) * gCoeffs[20]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x1, y4)) * gCoeffs[21]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x2, y4)) * gCoeffs[22]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x3, y4)) * gCoeffs[23]
+               + convert_float4(rsGetElementAt_uchar4(gIn, x4, y4)) * gCoeffs[24];
+
+    return convert_uchar4(clamp(sum, 0.f, 255.f));
+}
+
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/copy.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/copy.rs
index f36171e..04c86df 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/copy.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 uchar4 RS_KERNEL root(uchar4 v_in) {
     return v_in;
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/exposure.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/exposure.rs
index 0f05cb9..cd9e733 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/exposure.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/exposure.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 static float bright = 0.f;
 
@@ -22,10 +23,17 @@
     bright = 255.f / (255.f - v);
 }
 
-void exposure(const uchar4 *in, uchar4 *out)
+/*
+ * Per-pixel exposure kernel: multiplies r, g and b by the file-scope gain
+ * `bright` and clamps each to [0, 255]. Alpha is set fully opaque, as the
+ * contrast and colorcube kernels do, instead of being left at zero.
+ */
+uchar4 RS_KERNEL exposure(uchar4 in)
 {
-    out->r = rsClamp((int)(bright * in->r), 0, 255);
-    out->g = rsClamp((int)(bright * in->g), 0, 255);
-    out->b = rsClamp((int)(bright * in->b), 0, 255);
+    uchar4 out = {0, 0, 0, 0xff};
+    out.r = rsClamp((int)(bright * in.r), 0, 255);
+    out.g = rsClamp((int)(bright * in.g), 0, 255);
+    out.b = rsClamp((int)(bright * in.b), 0, 255);
+    return out;
 }
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.rs
index ed69ff4..2a18925 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_approx_relaxed.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 #include "fisheye_approx.rsh"
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.rs
index f986b5d..31646c4 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/fisheye_relaxed.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 #include "fisheye.rsh"
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/grain.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/grain.rs
similarity index 98%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/grain.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/grain.rs
index 639bf64..6a88378 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/grain.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/grain.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 uchar RS_KERNEL genRand() {
     return (uchar)rsRand(0xff);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.rs
similarity index 97%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.rs
index 65bc252..3ac601d 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/greyscale.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 const static float3 gMonoMult = {0.299f, 0.587f, 0.114f};
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/ip2_convolve3x3.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/ip2_convolve3x3.rs
deleted file mode 100644
index 3df62f0..0000000
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/ip2_convolve3x3.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ip.rsh"
-
-int32_t gWidth;
-int32_t gHeight;
-rs_allocation gIn;
-
-float gCoeffs[9];
-
-uchar4 RS_KERNEL root(uint32_t x, uint32_t y) {
-    uint32_t x1 = min((int32_t)x+1, gWidth-1);
-    uint32_t x2 = max((int32_t)x-1, 0);
-    uint32_t y1 = min((int32_t)y+1, gHeight-1);
-    uint32_t y2 = max((int32_t)y-1, 0);
-
-    float4 p00 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y1));
-    float4 p01 = convert_float4(rsGetElementAt_uchar4(gIn, x, y1));
-    float4 p02 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y1));
-    float4 p10 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y));
-    float4 p11 = convert_float4(rsGetElementAt_uchar4(gIn, x, y));
-    float4 p12 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y));
-    float4 p20 = convert_float4(rsGetElementAt_uchar4(gIn, x1, y2));
-    float4 p21 = convert_float4(rsGetElementAt_uchar4(gIn, x, y2));
-    float4 p22 = convert_float4(rsGetElementAt_uchar4(gIn, x2, y2));
-    p00 *= gCoeffs[0];
-    p01 *= gCoeffs[1];
-    p02 *= gCoeffs[2];
-    p10 *= gCoeffs[3];
-    p11 *= gCoeffs[4];
-    p12 *= gCoeffs[5];
-    p20 *= gCoeffs[6];
-    p21 *= gCoeffs[7];
-    p22 *= gCoeffs[8];
-
-    p00 += p01;
-    p02 += p10;
-    p11 += p12;
-    p20 += p21;
-
-    p22 += p00;
-    p02 += p11;
-
-    p20 += p22;
-    p20 += p02;
-
-    p20 = clamp(p20, 0.f, 255.f);
-    return convert_uchar4(p20);
-}
-
-
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/levels.rsh b/java/tests/ImageProcessing2/src/com/android/rs/image/levels.rsh
index e289906..b864493 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/levels.rsh
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/levels.rsh
@@ -21,7 +21,7 @@
 float overInWMinInB;
 rs_matrix3x3 colorMat;
 
-uchar4 __attribute__((kernel)) root(uchar4 in, uint32_t x, uint32_t y) {
+uchar4 __attribute__((kernel)) root(uchar4 in) {
     uchar4 out;
     float3 pixel = convert_float4(in).rgb;
     pixel = rsMatrixMultiply(&colorMat, pixel);
@@ -34,7 +34,7 @@
     return out;
 }
 
-uchar4 __attribute__((kernel)) root4(uchar4 in, uint32_t x, uint32_t y) {
+uchar4 __attribute__((kernel)) root4(uchar4 in) {
     float4 pixel = convert_float4(in);
     pixel.rgb = rsMatrixMultiply(&colorMat, pixel.rgb);
     pixel = clamp(pixel, 0.f, 255.f);
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.rs
index 28596ba..c0bc4b7 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/levels_relaxed.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 #include "levels.rsh"
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/mandelbrot.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/mandelbrot.rs
index bdbcfcd..99451f0 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/mandelbrot.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/mandelbrot.rs
@@ -53,3 +53,36 @@
                       (0xff * ((iter - (mi3 * 2)) / mi3)), 0xff};
   }
 }
+
+uchar4 RS_KERNEL rootD(uint32_t x, uint32_t y) {  // double2 variant: colours pixel (x, y) by its escape count
+  double2 p;
+  p.x = lowerBoundX + ((float)x / gDimX) * scaleFactor;  // pixel column -> plane coordinate (x is cast to float first)
+  p.y = lowerBoundY + ((float)y / gDimY) * scaleFactor;  // pixel row -> plane coordinate
+
+  double2 t = 0;
+  double2 t2 = t * t;  // component-wise squares of t, refreshed at the end of every iteration
+  int iter = 0;
+  while((t2.x + t2.y < 4.f) && (iter < gMaxIteration)) {  // stop once t.x^2 + t.y^2 >= 4 or the cap is reached
+    double xtemp = t2.x - t2.y + p.x;  // new x component: t.x^2 - t.y^2 + p.x
+    t.y = 2 * t.x * t.y + p.y;  // new y component; still reads the old t.x
+    t.x = xtemp;
+    iter++;
+    t2 = t * t;
+  }
+
+  if(iter >= gMaxIteration) {
+    // write a non-transparent black pixel
+    return (uchar4){0, 0, 0, 0xff};
+  } else {
+    double mi3 = gMaxIteration / 3.f;  // one third of the cap; each third of the range gets its own colour ramp
+    if (iter <= (gMaxIteration / 3))
+      return (uchar4){0xff * (iter / mi3), 0, 0, 0xff};
+    else if (iter <= (((gMaxIteration / 3) * 2)))
+      return (uchar4){0xff - (0xff * ((iter - mi3) / mi3)),
+                      (0xff * ((iter - mi3) / mi3)), 0, 0xff};
+    else
+      return (uchar4){0, 0xff - (0xff * ((iter - (mi3 * 2)) / mi3)),
+                      (0xff * ((iter - (mi3 * 2)) / mi3)), 0xff};
+  }
+}
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/posterize.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/posterize.rs
new file mode 100644
index 0000000..043ea5e
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/posterize.rs
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+rs_allocation inputImage;
+
+float intensityLow = 0.f;
+float intensityHigh;
+uchar4 color;
+const static float3 mono = {0.299f, 0.587f, 0.114f};
+
+void setParams(float intensHigh, float intensLow, uchar r, uchar g, uchar b) {
+    // Store the intensity band and the opaque replacement colour read by root().
+    color = (uchar4){r, g, b, 255};
+    intensityHigh = intensHigh;
+    intensityLow = intensLow;
+}
+
+uchar4 RS_KERNEL root(uchar4 v_in, uint32_t x, uint32_t y) {
+    // Weighted intensity (weights in `mono`) of the pixel at (x, y) in inputImage.
+    const float3 refRgb = rsUnpackColor8888(rsGetElementAt_uchar4(inputImage, x, y)).rgb;
+    const float weighted = dot(refRgb, mono);
+
+    // Pixels whose intensity lies in [intensityLow, intensityHigh] take the
+    // preset colour; every other pixel passes through unchanged.
+    return ((weighted >= intensityLow) && (weighted <= intensityHigh)) ? color : v_in;
+}
+
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/resize.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/resize.rs
new file mode 100644
index 0000000..ec283be
--- /dev/null
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/resize.rs
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidthIn;
+int32_t gHeightIn;
+rs_allocation gIn;
+float scale;
+
+
+uchar4 __attribute__((kernel)) nearest(uint32_t x, uint32_t y) {
+    // Scale the output coordinate into the source, clamp to the valid range,
+    // then truncate to pick the source element to copy.
+    const uint32_t srcX = (uint32_t)clamp(x * scale, 0.f, (float)gWidthIn - 1.f);
+    const uint32_t srcY = (uint32_t)clamp(y * scale, 0.f, (float)gHeightIn - 1.f);
+
+    uchar4 picked = rsGetElementAt_uchar4(gIn, srcX, srcY);
+    picked.a = 0xff;  // alpha is always written as 0xff
+    return picked;
+}
+
+
+static float4 cubicInterpolate (float4 p0,float4 p1,float4 p2,float4 p3 , float x) {  // cubic through p1 (x == 0) and p2 (x == 1)
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // p0 and p3 only shape the curve between p1 and p2
+}
+
+uchar4 __attribute__((kernel)) bicubic(uint32_t x, uint32_t y) {
+    float xf = x * scale;
+    float yf = y * scale;
+    // Sample a 4x4 footprint of gIn around (xf, yf). cubicInterpolate() blends
+    // between its 2nd and 3rd taps, so the footprint starts at floor() - 1.
+    int startx = (int) floor(xf - 1);
+    int starty = (int) floor(yf - 1);
+    xf = xf - floor(xf);
+    yf = yf - floor(yf);
+    int maxx = gWidthIn - 1;
+    int maxy = gHeightIn - 1;
+    // Clamp each tap on both sides so no read can fall outside the source.
+    uint32_t xs0 = (uint32_t) max(0, min(maxx, startx + 0));
+    uint32_t xs1 = (uint32_t) max(0, min(maxx, startx + 1));
+    uint32_t xs2 = (uint32_t) max(0, min(maxx, startx + 2));
+    uint32_t xs3 = (uint32_t) max(0, min(maxx, startx + 3));
+    uint32_t ys0 = (uint32_t) max(0, min(maxy, starty + 0));
+    uint32_t ys1 = (uint32_t) max(0, min(maxy, starty + 1));
+    uint32_t ys2 = (uint32_t) max(0, min(maxy, starty + 2));
+    uint32_t ys3 = (uint32_t) max(0, min(maxy, starty + 3));
+
+    float4 p00 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys0));
+    float4 p01 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys0));
+    float4 p02 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys0));
+    float4 p03 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys0));
+    float4 p0  = cubicInterpolate(p00, p01, p02, p03, xf);
+
+    float4 p10 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys1));
+    float4 p11 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys1));
+    float4 p12 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys1));
+    float4 p13 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys1));
+    float4 p1  = cubicInterpolate(p10, p11, p12, p13, xf);
+
+    float4 p20 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys2));
+    float4 p21 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys2));
+    float4 p22 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys2));
+    float4 p23 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys2));
+    float4 p2  = cubicInterpolate(p20, p21, p22, p23, xf);
+
+    float4 p30 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys3));
+    float4 p31 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys3));
+    float4 p32 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys3));
+    float4 p33 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys3));
+    float4 p3  = cubicInterpolate(p30, p31, p32, p33, xf);
+    // Blend the four row results vertically, then clamp to the 8-bit range.
+    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
+    p = clamp(p, 0.f, 255.f);
+    return convert_uchar4(p);
+}
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/shadows.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/shadows.rs
index f6c149d..f852aae 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/shadows.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/shadows.rs
@@ -15,20 +15,20 @@
  */
 
 #include "ip.rsh"
-//#pragma rs_fp_relaxed
+#pragma rs_fp_relaxed
 
-static double shadowFilterMap[] = {
-    -0.00591,  0.0001,
-     1.16488,  0.01668,
-    -0.18027, -0.06791,
-    -0.12625,  0.09001,
-     0.15065, -0.03897
+static float shadowFilterMap[] = {
+    -0.00591f,  0.0001f,
+     1.16488f,  0.01668f,
+    -0.18027f, -0.06791f,
+    -0.12625f,  0.09001f,
+     0.15065f, -0.03897f
 };
 
-static double poly[] = {
-    0., 0.,
-    0., 0.,
-    0.
+static float poly[] = {
+    0.f, 0.f,
+    0.f, 0.f,
+    0.f
 };
 
 static const int ABITS = 4;
@@ -36,10 +36,10 @@
 static const int k1=255 << ABITS;
 static const int k2=HSCALE << ABITS;
 
-static double fastevalPoly(double *poly,int n, double x){
+static float fastevalPoly(float *poly,int n, float x){
 
-    double f =x;
-    double sum = poly[0]+poly[1]*f;
+    float f =x;
+    float sum = poly[0]+poly[1]*f;
     int i;
     for (i = 2; i < n; i++) {
         f*=x;
@@ -177,16 +177,15 @@
 }
 
 void prepareShadows(float scale) {
-    double s = (scale>=0)?scale:scale/5;
+    float s = (scale>=0) ? scale : scale / 5.f;  // negative scales are reduced to a fifth before evaluation
     for (int i = 0; i < 5; i++) {
         poly[i] = fastevalPoly(shadowFilterMap+i*2,2 , s);
     }
 }
 
-void shadowsKernel(const uchar4 *in, uchar4 *out) {
-    ushort3 hsv = rgb2hsv(*in);
-    double v = (fastevalPoly(poly,5,hsv.x/4080.)*4080);
-    if (v>4080) v = 4080;
-    hsv.x = (unsigned short) ((v>0)?v:0);
-    *out = hsv2rgb(hsv);
+uchar4 RS_KERNEL shadowsKernel(uchar4 in) {  // remaps hsv.x of the pixel through the 5-entry poly[] table
+    ushort3 hsv = rgb2hsv(in);
+    float v = (fastevalPoly(poly, 5, hsv.x * (1.f / 4080.f)) * 4080.f);  // 4080 == 255 << ABITS (same value as k1)
+    hsv.x = (unsigned short) clamp(v, 0.f, 4080.f);  // keep the remapped value inside [0, 4080]
+    return hsv2rgb(hsv);
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/threshold.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/threshold.rs
similarity index 98%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/threshold.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/threshold.rs
index 96996f5..40d4fa2 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/threshold.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/threshold.rs
@@ -15,7 +15,7 @@
  */
 
 #include "ip.rsh"
-
+#pragma rs_fp_relaxed
 
 int height;
 int width;
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/util.rs
similarity index 84%
copy from java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs
copy to java/tests/ImageProcessing2/src/com/android/rs/image/util.rs
index f36171e..f11b06c 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/copy.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/util.rs
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012 The Android Open Source Project
+ * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,7 @@
 
 #include "ip.rsh"
 
-uchar4 RS_KERNEL root(uchar4 v_in) {
-    return v_in;
+void utilSendMessage(int op) {  // invokable helper: forwards `op` to the client
+    rsSendToClient(op);  // `op` is passed through unchanged as the message id
 }
 
-
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/vibrance.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/vibrance.rs
index ad4de58..865c77e 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/vibrance.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/vibrance.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 float vibrance = 0.f;
 
@@ -22,49 +23,36 @@
 static const float Gf = 0.587f;
 static const float Bf = 0.114f;
 
-static float S  = 0.f;
-static float MS = 0.f;
-static float Rt = 0.f;
-static float Gt = 0.f;
-static float Bt = 0.f;
 static float Vib = 0.f;
 
-void vibranceKernel(const uchar4 *in, uchar4 *out) {
+uchar4 RS_KERNEL vibranceKernel(uchar4 in) {
+    // S is a per-pixel scale built from Vib and from how far red exceeds max(g, b).
+    int r = in.r;
+    int g = in.g;
+    int b = in.b;
+    float red = (r-max(g, b)) * (1.f / 256.f);
+    float S = (float)(Vib/(1+native_exp(-red*3)))+1;
+    float MS = 1.0f - S;
+    float Rt = Rf * MS;
+    float Gt = Gf * MS;
+    float Bt = Bf * MS;
 
-    float R, G, B;
-
-    int r = in->r;
-    int g = in->g;
-    int b = in->b;
-    float red = (r-max(g, b))/256.f;
-    float sx = (float)(Vib/(1+native_exp(-red*3)));
-    S = sx+1;
-    MS = 1.0f - S;
-    Rt = Rf * MS;
-    Gt = Gf * MS;
-    Bt = Bf * MS;
-    int t = (r + g) / 2;
-    R = r;
-    G = g;
-    B = b;
+    float R = r;
+    float G = g;
+    float B = b;
 
     float Rc = R * (Rt + S) + G * Gt + B * Bt;
     float Gc = R * Rt + G * (Gt + S) + B * Bt;
     float Bc = R * Rt + G * Gt + B * (Bt + S);
 
-    out->r = rsClamp(Rc, 0, 255);
-    out->g = rsClamp(Gc, 0, 255);
-    out->b = rsClamp(Bc, 0, 255);
-
+    uchar4 o;
+    o.r = rsClamp(Rc, 0, 255);
+    o.g = rsClamp(Gc, 0, 255);
+    o.b = rsClamp(Bc, 0, 255);
+    o.a = 0xff;
+    return o;
 }
 
 void prepareVibrance() {
-
     Vib = vibrance/100.f;
-    S  = Vib + 1;
-    MS = 1.0f - S;
-    Rt = Rf * MS;
-    Gt = Gf * MS;
-    Bt = Bf * MS;
-
 }
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.rs
index 00cbbc4..347eb7a 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_approx_relaxed.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 #include "vignette_approx.rsh"
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.fs b/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.rs
similarity index 96%
rename from java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.fs
rename to java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.rs
index 8202c5c..262d516 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.fs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/vignette_relaxed.rs
@@ -15,6 +15,7 @@
  */
 
 #include "ip.rsh"
+#pragma rs_fp_relaxed
 
 #include "vignette.rsh"
 
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/wbalance.rs b/java/tests/ImageProcessing2/src/com/android/rs/image/wbalance.rs
index 6650671..58f64d1 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/wbalance.rs
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/wbalance.rs
@@ -15,7 +15,7 @@
  */
 
 #include "ip.rsh"
-//#pragma rs_fp_relaxed
+#pragma rs_fp_relaxed
 
 static int histR[256] = {0}, histG[256] = {0}, histB[256] = {0};
 
@@ -23,9 +23,7 @@
 uint32_t histogramHeight;
 uint32_t histogramWidth;
 
-static float scaleR;
-static float scaleG;
-static float scaleB;
+static float3 scale;
 
 static uchar4 estimateWhite() {
 
@@ -115,28 +113,19 @@
     int maximum = max(estimation.r, max(estimation.g, estimation.b));
     float avg = (minimum + maximum) / 2.f;
 
-    scaleR =  avg/estimation.r;
-    scaleG =  avg/estimation.g;
-    scaleB =  avg/estimation.b;
-
+    scale.r =  avg/estimation.r;
+    scale.g =  avg/estimation.g;
+    scale.b =  avg/estimation.b;
 }
 
-static unsigned char contrastClamp(int c)
-{
-    int N = 255;
-    c &= ~(c >> 31);
-    c -= N;
-    c &= (c >> 31);
-    c += N;
-    return  (unsigned char) c;
+uchar4 RS_KERNEL whiteBalanceKernel(uchar4 in) {
+    // Multiply each colour channel by its factor in `scale`, cap the result
+    // at 255 and emit it with alpha set to 255.
+    const float3 balanced = min(convert_float3(in.rgb) * scale, 255.f);
+
+    uchar4 result;
+    result.rgb = convert_uchar3(balanced);
+    result.a = 255;
+    return result;
 }
 
-void whiteBalanceKernel(const uchar4 *in, uchar4 *out) {
-    float Rc =  in->r*scaleR;
-    float Gc =  in->g*scaleG;
-    float Bc =  in->b*scaleB;
-
-    out->r = contrastClamp(Rc);
-    out->g = contrastClamp(Gc);
-    out->b = contrastClamp(Bc);
-}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
index 3de9809..589f009 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
@@ -44,7 +44,7 @@
     protected RenderScript mRS;
     protected Allocation mInPixelsAllocation;
     protected Allocation mInPixelsAllocation2;
-    protected Allocation mOutPixelsAllocation;
+    public Allocation mOutPixelsAllocation;
     protected ImageProcessingActivityJB act;
 
     private class MessageProcessor extends RenderScript.RSMessageHandler {
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index 3acfe98..3047a56 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -93,7 +93,7 @@
         /*unitTests.add(new UT_program_store(this, mRes, mCtx));
         unitTests.add(new UT_program_raster(this, mRes, mCtx));
         unitTests.add(new UT_mesh(this, mRes, mCtx));*/
-        //unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
+        unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
         unitTests.add(new UT_fp_mad(this, mRes, mCtx));
 
         /*
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
new file mode 100644
index 0000000..1a05f80
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+import android.util.Log;
+
+public class UT_foreach_multi extends UnitTest {  // drives the multi-input forEach kernels of foreach_multi.rs
+    private Resources mRes;
+    private Allocation Ain0;
+    private Allocation Ain1;
+    private Allocation Ain2;
+    private Allocation Ain3;  // the only 16-bit (U16) input
+
+    private Allocation Out0;
+    private Allocation Out1;
+    private Allocation Out2;
+    private Allocation Out3;  // RetStruct-typed output
+
+    protected UT_foreach_multi(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "Foreach Multi-input", ctx);
+        mRes = res;
+    }
+    // Creates every input/output allocation, binds each to its script global and runs the init kernels on the inputs.
+    private void initializeGlobals(RenderScript RS, ScriptC_foreach_multi s) {
+        Type.Builder type32Builder = new Type.Builder(RS, Element.U32(RS));
+        Type.Builder type16Builder = new Type.Builder(RS, Element.U16(RS));
+
+        int Xdim = 5;  // X size shared by all allocations and mirrored into the script's dimX
+        s.set_dimX(Xdim);
+        type32Builder.setX(Xdim);
+        type16Builder.setX(Xdim);
+
+        // 32-bit input allocations
+
+        Ain0 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain0(Ain0);
+        s.forEach_init_uint32_alloc(Ain0);
+
+        Ain1 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain1(Ain1);
+        s.forEach_init_uint32_alloc(Ain1);
+
+        Ain2 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_ain2(Ain2);
+        s.forEach_init_uint32_alloc(Ain2);
+
+        // 16-bit input allocation
+
+        Ain3 = Allocation.createTyped(RS, type16Builder.create());
+        s.set_ain3(Ain3);
+        s.forEach_init_uint16_alloc(Ain3);
+
+        // 32-bit output allocations
+
+        Out0 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout0(Out0);
+
+        Out1 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout1(Out1);
+
+        Out2 = Allocation.createTyped(RS, type32Builder.create());
+        s.set_aout2(Out2);
+
+        // RetStruct output allocations
+
+        ScriptField_RetStruct StructType = new ScriptField_RetStruct(RS, Xdim);
+        Out3 = StructType.getAllocation();
+        s.set_aout3(Out3);
+
+        return;
+    }
+    // Launches the four multi-input kernels, then the script-side checks, and waits for the script's message.
+    public void run() {
+        RenderScript pRS = RenderScript.create(mCtx);
+        ScriptC_foreach_multi s = new ScriptC_foreach_multi(pRS);
+
+        pRS.setMessageHandler(mRsMessage);
+
+        initializeGlobals(pRS, s);
+
+        s.forEach_sum2(Ain0, Ain1, Out0);
+        s.forEach_sum3(Ain0, Ain1, Ain2, Out1);
+        s.forEach_sum_mixed(Ain0, Ain3, Out2);  // mixes a 32-bit and a 16-bit input
+        s.forEach_sum2_struct(Ain0, Ain1, Out3);
+
+        s.invoke_test_outputs();  // script-side comparison of each output against its inputs
+        s.invoke_check_test_results();  // script sends RS_MSG_TEST_PASSED or RS_MSG_TEST_FAILED
+
+        pRS.finish();
+        waitForMessage();
+        pRS.destroy();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
new file mode 100644
index 0000000..0857e86
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
@@ -0,0 +1,178 @@
+#include "shared.rsh"
+
+struct RetStruct {  // eight 32-bit fields; the element type returned by sum2_struct()
+    uint32_t i0;
+    uint32_t i1;
+    uint32_t i2;
+    uint32_t i3;
+    uint32_t i4;
+    uint32_t i5;
+    uint32_t i6;
+    uint32_t i7;
+};
+
+rs_allocation ain0, ain1, ain2;
+rs_allocation ain3;
+
+rs_allocation aout0, aout1, aout2, aout3;
+
+uint32_t dimX;
+
+static bool failed = false;
+
+uint32_t RS_KERNEL init_uint32_alloc(uint32_t x) {
+    return x;  // each cell is initialised to its own x index
+}
+
+uint16_t RS_KERNEL init_uint16_alloc(uint32_t x) {
+    return x;  // each cell gets its own x index, narrowed to uint16_t on return
+}
+
+uint32_t RS_KERNEL sum2(uint32_t in0, uint32_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    // Two-input kernel: the output cell is the sum of both input cells.
+    return in0 + in1;
+}
+
+struct RetStruct RS_KERNEL
+sum2_struct(uint32_t in0, uint32_t in1, uint32_t x) {
+    // Struct-returning variant of sum2: every field receives in0 + in1.
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+
+    const uint32_t total = in0 + in1;
+    struct RetStruct filled;
+    filled.i0 = total;
+    filled.i1 = total;
+    filled.i2 = total;
+    filled.i3 = total;
+    filled.i4 = total;
+    filled.i5 = total;
+    filled.i6 = total;
+    filled.i7 = total;
+
+    return filled;
+}
+
+uint32_t RS_KERNEL sum3(uint32_t in0, uint32_t in1, uint32_t in2, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    _RS_ASSERT(in2 == x);
+    // Three-input kernel: the output cell is the sum of all three input cells.
+    return in0 + in1 + in2;
+}
+
+
+uint32_t RS_KERNEL sum_mixed(uint32_t in0, uint16_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    // Mixed-width inputs: the 16-bit in1 is widened for the add; the result is 32-bit.
+    return in0 + in1;
+}
+
+static bool test_sum2_output() {
+    // Verifies aout0[i] == ain0[i] + ain1[i] for every i below dimX, logs
+    // the verdict and returns true when any comparison failed.
+    //
+    // The flag has to be called `failed`, shadowing the file-level one:
+    // _RS_ASSERT (from shared.rsh) presumably reports through that name.
+    bool failed = false;
+
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout0, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i)));
+    }
+
+    const char *verdict = failed ? "test_sum2_output FAILED"
+                                 : "test_sum2_output PASSED";
+    rsDebug(verdict, 0);
+    return failed;
+}
+
+static bool test_sum3_output() {
+    // Verifies aout1[i] == ain0[i] + ain1[i] + ain2[i] for every i below
+    // dimX, logs the verdict and returns true when any comparison failed.
+    //
+    // The flag has to be called `failed`, shadowing the file-level one:
+    // _RS_ASSERT (from shared.rsh) presumably reports through that name.
+    bool failed = false;
+
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout1, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i) +
+                    rsGetElementAt_uint(ain2, i)));
+    }
+
+    const char *verdict = failed ? "test_sum3_output FAILED"
+                                 : "test_sum3_output PASSED";
+    rsDebug(verdict, 0);
+    return failed;
+}
+
+static bool test_sum_mixed_output() {
+    // Verifies aout2[i] == ain0[i] + ain3[i] (ain3 read as ushort) for every
+    // i below dimX, logs the verdict and returns true on any mismatch.
+    //
+    // The flag has to be called `failed`, shadowing the file-level one:
+    // _RS_ASSERT (from shared.rsh) presumably reports through that name.
+    bool failed = false;
+
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout2, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_ushort(ain3, i)));
+    }
+
+    const char *verdict = failed ? "test_sum_mixed_output FAILED"
+                                 : "test_sum_mixed_output PASSED";
+    rsDebug(verdict, 0);
+    return failed;
+}
+
+static bool test_sum2_struct_output() {
+    // Verifies that all eight fields of aout3[i] equal ain0[i] + ain1[i] for
+    // every i below dimX, logs the verdict and returns true on any mismatch.
+    //
+    // The flag has to be called `failed`, shadowing the file-level one:
+    // _RS_ASSERT (from shared.rsh) presumably reports through that name.
+    bool failed = false;
+
+    for (uint32_t i = 0; i < dimX; i++) {
+        struct RetStruct *result = (struct RetStruct*)rsGetElementAt(aout3, i);
+
+        uint32_t sum = rsGetElementAt_uint(ain0, i) +
+                       rsGetElementAt_uint(ain1, i);
+
+        _RS_ASSERT(result->i0 == sum);
+        _RS_ASSERT(result->i1 == sum);
+        _RS_ASSERT(result->i2 == sum);
+        _RS_ASSERT(result->i3 == sum);
+        _RS_ASSERT(result->i4 == sum);
+        _RS_ASSERT(result->i5 == sum);
+        _RS_ASSERT(result->i6 == sum);
+        _RS_ASSERT(result->i7 == sum);
+    }
+
+    const char *verdict = failed ? "test_sum2_struct_output FAILED"
+                                 : "test_sum2_struct_output PASSED";
+    rsDebug(verdict, 0);
+    return failed;
+}
+
+void test_outputs() {  // runs all four output checkers; each result is OR-ed into the file-level `failed`
+    failed |= test_sum2_output();
+    failed |= test_sum3_output();
+    failed |= test_sum_mixed_output();
+    failed |= test_sum2_struct_output();  // every checker runs even after an earlier failure
+}
+
+void check_test_results() {
+    // Report the accumulated verdict held in the file-level `failed` flag
+    // to the client through the blocking send call.
+    const int verdict = failed ? RS_MSG_TEST_FAILED : RS_MSG_TEST_PASSED;
+
+    rsSendToClientBlocking(verdict);
+}
diff --git a/rsAdapter.cpp b/rsAdapter.cpp
index 9fd39f8..52d8ec7 100644
--- a/rsAdapter.cpp
+++ b/rsAdapter.cpp
@@ -45,7 +45,7 @@
 }
 
 Adapter1D *Adapter1D::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;  // always null: nothing is read from `stream`
 }
 
 namespace android {
@@ -118,7 +118,7 @@
 }
 
 Adapter2D *Adapter2D::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;  // always null: nothing is read from `stream`
 }
 
 
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 7dcbdf8..46aff3c 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -55,7 +55,7 @@
 
     if (!allocMem) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Couldn't allocate memory for Allocation");
-        return NULL;
+        return nullptr;
     }
 
     Allocation *a = new (allocMem) Allocation(rsc, type, usages, mc, ptr);
@@ -63,7 +63,7 @@
     if (!rsc->mHal.funcs.allocation.init(rsc, a, type->getElement()->getHasReferences())) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Allocation::Allocation, alloc failure");
         delete a;
-        return NULL;
+        return nullptr;
     }
 
     return a;
@@ -82,7 +82,7 @@
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
     if (mGrallocConsumer.get()) {
         mGrallocConsumer->unlockBuffer();
-        mGrallocConsumer = NULL;
+        mGrallocConsumer = nullptr;
     }
 #endif
 
@@ -101,12 +101,12 @@
         (z && (z >= mHal.drvState.lod[lod].dimZ)) ||
         ((face != RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X) && !mHal.state.hasFaces) ||
         (array != 0)) {
-        return NULL;
+        return nullptr;
     }
 
     size_t s = 0;
     //void *ptr = mRSC->mHal.funcs.allocation.lock1D(rsc, this);
-    if ((stride != NULL) && mHal.drvState.lod[0].dimY) {
+    if ((stride != nullptr) && mHal.drvState.lod[0].dimY) {
         *stride = mHal.drvState.lod[lod].stride;
     }
     return mHal.drvState.lod[lod].mallocPtr;
@@ -223,34 +223,40 @@
     }
 
     if (y >= mHal.drvState.lod[0].dimY) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData X offset out of range.");
+        rsc->setError(RS_ERROR_BAD_VALUE,
+                      "subElementData X offset out of range.");
         return;
     }
 
     if (cIdx >= mHal.state.type->getElement()->getFieldCount()) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData component out of range.");
+        rsc->setError(RS_ERROR_BAD_VALUE,
+                      "subElementData component out of range.");
         return;
     }
 
     const Element * e = mHal.state.type->getElement()->getField(cIdx);
-    uint32_t elemArraySize = mHal.state.type->getElement()->getFieldArraySize(cIdx);
+    uint32_t elemArraySize =
+        mHal.state.type->getElement()->getFieldArraySize(cIdx);
     if (sizeBytes != e->getSizeBytes() * elemArraySize) {
         rsc->setError(RS_ERROR_BAD_VALUE, "subElementData bad size.");
         return;
     }
 
-    rsc->mHal.funcs.allocation.elementData2D(rsc, this, x, y, data, cIdx, sizeBytes);
+    rsc->mHal.funcs.allocation.elementData2D(rsc, this, x, y, data, cIdx,
+                                             sizeBytes);
     sendDirty(rsc);
 }
 
 void Allocation::addProgramToDirty(const Program *p) {
-    mToDirtyList.push(p);
+    mToDirtyList.push_back(p);  // appended unconditionally; duplicates are not filtered here
 }
 
 void Allocation::removeProgramToDirty(const Program *p) {
-    for (size_t ct=0; ct < mToDirtyList.size(); ct++) {
-        if (mToDirtyList[ct] == p) {
-            mToDirtyList.removeAt(ct);
+    for (auto entryIter = mToDirtyList.begin(), endIter = mToDirtyList.end();
+         entryIter != endIter; entryIter++) {
+
+        if (p == *entryIter) {
+            mToDirtyList.erase(entryIter);
             return;
         }
     }
@@ -268,7 +274,8 @@
         }
     }
     ALOGV("%s allocation ptr=%p  mUsageFlags=0x04%x, mMipmapControl=0x%04x",
-         prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags, mHal.state.mipmapControl);
+          prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags,
+          mHal.state.mipmapControl);
 }
 
 uint32_t Allocation::getPackedSize() const {
@@ -378,14 +385,14 @@
     if (classID != RS_A3D_CLASS_ID_ALLOCATION) {
         rsc->setError(RS_ERROR_FATAL_DRIVER,
                       "allocation loading failed due to corrupt file. (invalid id)\n");
-        return NULL;
+        return nullptr;
     }
 
     const char *name = stream->loadString();
 
     Type *type = Type::createFromStream(rsc, stream);
     if (!type) {
-        return NULL;
+        return nullptr;
     }
     type->compute();
 
@@ -402,7 +409,7 @@
                       "allocation loading failed due to corrupt file. (invalid size)\n");
         ObjectBase::checkDelete(alloc);
         ObjectBase::checkDelete(type);
-        return NULL;
+        return nullptr;
     }
 
     alloc->assignName(name);
@@ -439,7 +446,7 @@
 }
 
 void Allocation::callUpdateCacheObject(const Context *rsc, void *dstObj) const {
-    if (rsc->mHal.funcs.allocation.updateCachedObject != NULL) {
+    if (rsc->mHal.funcs.allocation.updateCachedObject != nullptr) {
         rsc->mHal.funcs.allocation.updateCachedObject(rsc, this, (rs_allocation *)dstObj);
     } else {
         *((const void **)dstObj) = this;
@@ -499,7 +506,7 @@
     sp<IGraphicBufferConsumer> bc;
     BufferQueue::createBufferQueue(&bp, &bc);
     mGrallocConsumer = new GrallocConsumer(this, bc);
-    bp->incStrong(NULL);
+    bp->incStrong(nullptr);
 
     mBufferListener = new NewBufferListener();
     mBufferListener->rsc = rsc;
@@ -508,7 +515,7 @@
     mGrallocConsumer->setFrameAvailableListener(mBufferListener);
     return bp.get();
 #else
-    return NULL;
+    return nullptr;
 #endif
     //return rsc->mHal.funcs.allocation.getSurface(rsc, this);
 }
@@ -523,7 +530,7 @@
 }
 
 void Allocation::ioReceive(const Context *rsc) {
-    void *ptr = NULL;
+    void *ptr = nullptr;
     size_t stride = 0;
 #ifndef RS_COMPATIBILITY_LIB
     if (mHal.state.usageFlags & RS_ALLOCATION_USAGE_SCRIPT) {
@@ -637,7 +644,7 @@
                                        uint32_t usages, uintptr_t ptr) {
     Allocation * alloc = Allocation::createAllocation(rsc, static_cast<Type *>(vtype), usages, mipmaps, (void*)ptr);
     if (!alloc) {
-        return NULL;
+        return nullptr;
     }
     alloc->incUserRef();
     return alloc;
@@ -650,9 +657,9 @@
 
     RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, vtype, mipmaps, usages, 0);
     Allocation *texAlloc = static_cast<Allocation *>(vTexAlloc);
-    if (texAlloc == NULL) {
+    if (texAlloc == nullptr) {
         ALOGE("Memory allocation failure");
-        return NULL;
+        return nullptr;
     }
 
     texAlloc->data(rsc, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
@@ -675,9 +682,9 @@
     // Error checking is done in the java layer
     RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, vtype, mipmaps, usages, 0);
     Allocation *texAlloc = static_cast<Allocation *>(vTexAlloc);
-    if (texAlloc == NULL) {
+    if (texAlloc == nullptr) {
         ALOGE("Memory allocation failure");
-        return NULL;
+        return nullptr;
     }
 
     uint32_t faceSize = t->getDimX();
diff --git a/rsAllocation.h b/rsAllocation.h
index f197efc..47344d8 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -170,7 +170,7 @@
     bool hasSameDims(const Allocation *Other) const;
 
 protected:
-    Vector<const Program *> mToDirtyList;
+    std::vector<const Program *> mToDirtyList;
     ObjectBaseRef<const Type> mType;
     void setType(const Type *t) {
         mType.set(t);
diff --git a/rsAnimation.cpp b/rsAnimation.cpp
index f6da138..964e7c1 100644
--- a/rsAnimation.cpp
+++ b/rsAnimation.cpp
@@ -25,7 +25,7 @@
 }
 
 Animation *Animation::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;  // stub: nothing is read from the stream; every call yields a null Animation
 }
 
 /*
@@ -34,8 +34,8 @@
     mAllocFile = __FILE__;
     mAllocLine = __LINE__;
 
-    mValuesInput = NULL;
-    mValuesOutput = NULL;
+    mValuesInput = nullptr;
+    mValuesOutput = nullptr;
     mValueCount = 0;
     mInterpolation = RS_ANIMATION_INTERPOLATION_STEP;
     mEdgePre = RS_ANIMATION_EDGE_UNDEFINED;
@@ -51,22 +51,22 @@
 {
     if (valueCount < 2) {
         rsc->setError(RS_ERROR_BAD_VALUE, "Animations require more than 2 values.");
-        return NULL;
+        return nullptr;
     }
     Animation *a = new Animation(rsc);
     if (!a) {
         rsc->setError(RS_ERROR_OUT_OF_MEMORY);
-        return NULL;
+        return nullptr;
     }
 
     float *vin = (float *)malloc(valueCount * sizeof(float));
     float *vout = (float *)malloc(valueCount * sizeof(float));
     a->mValuesInput = vin;
     a->mValuesOutput = vout;
-    if (a->mValuesInput == NULL || a->mValuesOutput == NULL) {
+    if (a->mValuesInput == nullptr || a->mValuesOutput == nullptr) {
         delete a;
         rsc->setError(RS_ERROR_OUT_OF_MEMORY);
-        return NULL;
+        return nullptr;
     }
 
     a->mEdgePre = pre;
@@ -127,8 +127,8 @@
                                 RsAnimationEdge pre,
                                 RsAnimationEdge post) {
     //ALOGE("rsi_ElementCreate %i %i %i %i", dt, dk, norm, vecSize);
-    Animation *a = NULL;//Animation::create(rsc, inValues, outValues, valueCount, interp, pre, post);
-    if (a != NULL) {
+    Animation *a = nullptr;//Animation::create(rsc, inValues, outValues, valueCount, interp, pre, post);
+    if (a != nullptr) {
         a->incUserRef();
     }
     return (RsAnimation)a;
diff --git a/rsContext.cpp b/rsContext.cpp
index e1a7c71..e7ff33d 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -231,7 +231,7 @@
     uint32_t bufferLen = strlen(buffer);
 
     ObjectBaseRef<Font> lastFont(getFont());
-    setFont(NULL);
+    setFont(nullptr);
     float shadowCol = 0.1f;
     mStateFont.setFontColor(shadowCol, shadowCol, shadowCol, 1.0f);
     mStateFont.renderText(buffer, bufferLen, 5, getHeight() - 6);
@@ -247,10 +247,10 @@
 bool Context::loadRuntime(const char* filename, Context* rsc) {
 
     // TODO: store the driverSO somewhere so we can dlclose later
-    void *driverSO = NULL;
+    void *driverSO = nullptr;
 
     driverSO = dlopen(filename, RTLD_LAZY);
-    if (driverSO == NULL) {
+    if (driverSO == nullptr) {
         ALOGE("Failed loading RS driver: %s", dlerror());
         return false;
     }
@@ -261,13 +261,13 @@
     HalSig halInit = (HalSig) dlsym(driverSO, "rsdHalInit");
 
     // If we can't find the C variant, we go looking for the C++ version.
-    if (halInit == NULL) {
+    if (halInit == nullptr) {
         ALOGW("Falling back to find C++ rsdHalInit: %s", dlerror());
         halInit = (HalSig) dlsym(driverSO,
                 "_Z10rsdHalInitPN7android12renderscript7ContextEjj");
     }
 
-    if (halInit == NULL) {
+    if (halInit == nullptr) {
         dlclose(driverSO);
         ALOGE("Failed to find rsdHalInit: %s", dlerror());
         return false;
@@ -353,12 +353,12 @@
         if (!loadRuntime("libRSDriver.so", rsc)) {
             ALOGE("Failed to load default runtime!");
             rsc->setError(RS_ERROR_FATAL_DRIVER, "Failed loading RS driver");
-            return NULL;
+            return nullptr;
         }
     }
 #else // RS_COMPATIBILITY_LIB
     if (rsdHalInit(rsc, 0, 0) != true) {
-        return NULL;
+        return nullptr;
     }
 #endif
 
@@ -369,19 +369,19 @@
     if (rsc->mIsGraphicsContext) {
         if (!rsc->initGLThread()) {
             rsc->setError(RS_ERROR_OUT_OF_MEMORY, "Failed initializing GL");
-            return NULL;
+            return nullptr;
         }
 
         rsc->mStateRaster.init(rsc);
-        rsc->setProgramRaster(NULL);
+        rsc->setProgramRaster(nullptr);
         rsc->mStateVertex.init(rsc);
-        rsc->setProgramVertex(NULL);
+        rsc->setProgramVertex(nullptr);
         rsc->mStateFragment.init(rsc);
-        rsc->setProgramFragment(NULL);
+        rsc->setProgramFragment(nullptr);
         rsc->mStateFragmentStore.init(rsc);
-        rsc->setProgramStore(NULL);
+        rsc->setProgramStore(nullptr);
         rsc->mStateFont.init(rsc);
-        rsc->setFont(NULL);
+        rsc->setFont(nullptr);
         rsc->mStateSampler.init(rsc);
         rsc->mFBOCache.init(rsc);
     }
@@ -390,7 +390,7 @@
     rsc->mRunning = true;
 
     if (rsc->isSynchronous()) {
-        return NULL;
+        return nullptr;
     }
 
     if (!rsc->mIsGraphicsContext) {
@@ -430,7 +430,7 @@
                 drawOnce |= rsc->mIO.playCoreCommands(rsc, -1);
             }
 
-            if ((rsc->mRootScript.get() != NULL) && rsc->mHasSurface &&
+            if ((rsc->mRootScript.get() != nullptr) && rsc->mHasSurface &&
                 (targetRate || drawOnce) && !rsc->mPaused) {
 
                 drawOnce = false;
@@ -462,7 +462,7 @@
 #endif
 
     //ALOGV("%p RS Thread exited", rsc);
-    return NULL;
+    return nullptr;
 }
 
 void Context::destroyWorkerThreadResources() {
@@ -512,11 +512,11 @@
 }
 
 Context::Context() {
-    mDev = NULL;
+    mDev = nullptr;
     mRunning = false;
     mExit = false;
     mPaused = false;
-    mObjHead = NULL;
+    mObjHead = nullptr;
     mError = RS_ERROR_NONE;
     mTargetSdkVersion = 14;
     mDPI = 96;
@@ -543,7 +543,7 @@
 
     if (!rsc->initContext(dev, sc)) {
         delete rsc;
-        return NULL;
+        return nullptr;
     }
     return rsc;
 }
@@ -568,7 +568,7 @@
         memset(&mUserSurfaceConfig, 0, sizeof(mUserSurfaceConfig));
     }
 
-    mIsGraphicsContext = sc != NULL;
+    mIsGraphicsContext = sc != nullptr;
 
     int status;
     pthread_attr_t threadAttr;
@@ -635,7 +635,7 @@
         pthread_mutex_lock(&gInitMutex);
         if (mDev) {
             mDev->removeContext(this);
-            mDev = NULL;
+            mDev = nullptr;
         }
         pthread_mutex_unlock(&gInitMutex);
     }
@@ -647,7 +647,7 @@
     rsAssert(mIsGraphicsContext);
     mHal.funcs.setSurface(this, w, h, sur);
 
-    mHasSurface = sur != NULL;
+    mHasSurface = sur != nullptr;
     mWidth = w;
     mHeight = h;
 
@@ -659,11 +659,11 @@
 
 uint32_t Context::getCurrentSurfaceWidth() const {
     for (uint32_t i = 0; i < mFBOCache.mHal.state.colorTargetsCount; i ++) {
-        if (mFBOCache.mHal.state.colorTargets[i] != NULL) {
+        if (mFBOCache.mHal.state.colorTargets[i] != nullptr) {
             return mFBOCache.mHal.state.colorTargets[i]->getType()->getDimX();
         }
     }
-    if (mFBOCache.mHal.state.depthTarget != NULL) {
+    if (mFBOCache.mHal.state.depthTarget != nullptr) {
         return mFBOCache.mHal.state.depthTarget->getType()->getDimX();
     }
     return mWidth;
@@ -671,11 +671,11 @@
 
 uint32_t Context::getCurrentSurfaceHeight() const {
     for (uint32_t i = 0; i < mFBOCache.mHal.state.colorTargetsCount; i ++) {
-        if (mFBOCache.mHal.state.colorTargets[i] != NULL) {
+        if (mFBOCache.mHal.state.colorTargets[i] != nullptr) {
             return mFBOCache.mHal.state.colorTargets[i]->getType()->getDimY();
         }
     }
-    if (mFBOCache.mHal.state.depthTarget != NULL) {
+    if (mFBOCache.mHal.state.depthTarget != nullptr) {
         return mFBOCache.mHal.state.depthTarget->getType()->getDimY();
     }
     return mHeight;
@@ -698,7 +698,7 @@
 
 void Context::setProgramStore(ProgramStore *pfs) {
     rsAssert(mIsGraphicsContext);
-    if (pfs == NULL) {
+    if (pfs == nullptr) {
         mFragmentStore.set(mStateFragmentStore.mDefault);
     } else {
         mFragmentStore.set(pfs);
@@ -707,7 +707,7 @@
 
 void Context::setProgramFragment(ProgramFragment *pf) {
     rsAssert(mIsGraphicsContext);
-    if (pf == NULL) {
+    if (pf == nullptr) {
         mFragment.set(mStateFragment.mDefault);
     } else {
         mFragment.set(pf);
@@ -716,7 +716,7 @@
 
 void Context::setProgramRaster(ProgramRaster *pr) {
     rsAssert(mIsGraphicsContext);
-    if (pr == NULL) {
+    if (pr == nullptr) {
         mRaster.set(mStateRaster.mDefault);
     } else {
         mRaster.set(pr);
@@ -725,7 +725,7 @@
 
 void Context::setProgramVertex(ProgramVertex *pv) {
     rsAssert(mIsGraphicsContext);
-    if (pv == NULL) {
+    if (pv == nullptr) {
         mVertex.set(mStateVertex.mDefault);
     } else {
         mVertex.set(pv);
@@ -734,7 +734,7 @@
 
 void Context::setFont(Font *f) {
     rsAssert(mIsGraphicsContext);
-    if (f == NULL) {
+    if (f == nullptr) {
         mFont.set(mStateFont.mDefault);
     } else {
         mFont.set(f);
@@ -751,13 +751,15 @@
 void Context::assignName(ObjectBase *obj, const char *name, uint32_t len) {
     rsAssert(!obj->getName());
     obj->setName(name, len);
-    mNames.add(obj);
+    mNames.push_back(obj);  // record the named object; removeName() scans mNames to drop it again
 }
 
 void Context::removeName(ObjectBase *obj) {
-    for (size_t ct=0; ct < mNames.size(); ct++) {
-        if (obj == mNames[ct]) {
-            mNames.removeAt(ct);
+    for (auto nameIter = mNames.begin(), endIter = mNames.end();
+         nameIter != endIter; nameIter++) {
+
+        if (obj == *nameIter) {
+            mNames.erase(nameIter);
             return;
         }
     }
@@ -963,7 +965,7 @@
                                      RsContextType ct, uint32_t flags) {
     //ALOGV("rsContextCreate dev=%p", vdev);
     Device * dev = static_cast<Device *>(vdev);
-    Context *rsc = Context::createContext(dev, NULL, ct, flags);
+    Context *rsc = Context::createContext(dev, nullptr, ct, flags);
     if (rsc) {
         rsc->setTargetSdkVersion(sdkVersion);
     }
@@ -992,4 +994,3 @@
     ObjectBase *ob = static_cast<ObjectBase *>(obj);
     (*name) = ob->getName();
 }
-
diff --git a/rsContext.h b/rsContext.h
index b382358..2e59cc7 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -234,7 +234,7 @@
     static void printWatchdogInfo(void *ctx);
 
     void dumpDebug() const;
-    void setError(RsError e, const char *msg = NULL) const;
+    void setError(RsError e, const char *msg = nullptr) const;
 
     mutable const ObjectBase * mObjHead;
 
@@ -297,7 +297,7 @@
     bool mHasSurface;
     bool mIsContextLite;
 
-    Vector<ObjectBase *> mNames;
+    std::vector<ObjectBase *> mNames;
 
     uint64_t mTimers[_RS_TIMER_TOTAL];
     Timers mTimerActive;
diff --git a/rsCppUtils.h b/rsCppUtils.h
index 71cf077..7f1d58c 100644
--- a/rsCppUtils.h
+++ b/rsCppUtils.h
@@ -19,8 +19,6 @@
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
 #include <utils/Log.h>
-#include <utils/String8.h>
-#include <utils/Vector.h>
 #include <cutils/atomic.h>
 #endif
 
@@ -54,96 +52,6 @@
 #define ALOGV(...) \
     __android_log_print(ANDROID_LOG_VERBOSE, LOG_TAG, __VA_ARGS__);
 
-namespace android {
-
-    // server has no Vector or String8 classes; implement on top of STL
-    class String8: public std::string {
-    public:
-    String8(const char *ptr) : std::string(ptr) {
-
-        }
-    String8(const char *ptr, size_t len) : std::string(ptr, len) {
-
-        }
-    String8() : std::string() {
-
-        }
-
-        const char* string() const {
-            return this->c_str();
-        }
-
-        void setTo(const char* str, ssize_t len) {
-            this->assign(str, len);
-        }
-        void setTo(const char* str) {
-            this->assign(str);
-        }
-        String8 getPathDir(void) const {
-            const char* cp;
-            const char*const str = this->c_str();
-
-            cp = strrchr(str, OS_PATH_SEPARATOR);
-            if (cp == NULL)
-                return String8("");
-            else
-                return String8(str, cp - str);
-        }
-    };
-
-    template <class T> class Vector: public std::vector<T> {
-    public:
-        void push(T obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const T& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        T* editArray() {
-            return (T*)(this->begin());
-        }
-
-        const T* array() {
-            return (const T*)(this->begin());
-        }
-
-    };
-
-    template<> class Vector<bool>: public std::vector<char> {
-    public:
-        void push(bool obj) {
-            this->push_back(obj);
-        }
-        void removeAt(uint32_t index) {
-            this->erase(this->begin() + index);
-        }
-        ssize_t add(const bool& obj) {
-            this->push_back(obj);
-            return this->size() - 1;
-        }
-        void setCapacity(ssize_t capacity) {
-            this->resize(capacity);
-        }
-
-        bool* editArray() {
-            return (bool*)(this->begin());
-        }
-
-        const bool* array() {
-            return (const bool*)(this->begin());
-        }
-    };
-
-}
-
 typedef int64_t nsecs_t;  // nano-seconds
 
 enum {
@@ -170,7 +78,7 @@
     // we don't support the clocks here.
     struct timeval t;
     t.tv_sec = t.tv_usec = 0;
-    gettimeofday(&t, NULL);
+    gettimeofday(&t, nullptr);
     return nsecs_t(t.tv_sec)*1000000000LL + nsecs_t(t.tv_usec)*1000LL;
 #endif
 }
@@ -286,5 +194,3 @@
 }
 
 #endif //ANDROID_RS_OBJECT_BASE_H
-
-
diff --git a/rsDefines.h b/rsDefines.h
index 532cde1..35b5b94 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -366,7 +366,9 @@
     RS_SCRIPT_INTRINSIC_ID_3DLUT = 8,
     RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
     // unused 10, 11
-    RS_SCRIPT_INTRINSIC_ID_RESIZE = 12
+    RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
+
+    RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
 };
 
 typedef struct {
diff --git a/rsDevice.cpp b/rsDevice.cpp
index 2688890..1ba005a 100644
--- a/rsDevice.cpp
+++ b/rsDevice.cpp
@@ -28,14 +28,16 @@
 }
 
 void Device::addContext(Context *rsc) {
-    mContexts.push(rsc);
+    mContexts.push_back(rsc);  // appended as-is; no check for an already-registered context
 }
 
 void Device::removeContext(Context *rsc) {
-    for (size_t idx=0; idx < mContexts.size(); idx++) {
-        if (mContexts[idx] == rsc) {
-            mContexts.removeAt(idx);
-            break;
+    for (auto ctxIter = mContexts.begin(), endIter = mContexts.end();
+         ctxIter != endIter; ctxIter++) {
+
+        if (rsc == *ctxIter) {  // only the first matching entry is removed
+            mContexts.erase(ctxIter);  // erase() invalidates ctxIter and the cached endIter
+            return;  // must leave now: continuing the loop would use invalidated iterators
         }
     }
 }
@@ -58,4 +60,3 @@
     }
     rsAssert(0);
 }
-
diff --git a/rsDevice.h b/rsDevice.h
index ffb514b..5961336 100644
--- a/rsDevice.h
+++ b/rsDevice.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_RS_DEVICE_H
 #define ANDROID_RS_DEVICE_H
 
+#include <vector>
+
 #include "rsUtils.h"
 
 // ---------------------------------------------------------------------------
@@ -36,7 +38,7 @@
     bool mForceSW;
 
 protected:
-    Vector<Context *> mContexts;
+    std::vector<Context *> mContexts;
 };
 
 }
diff --git a/rsElement.cpp b/rsElement.cpp
index f7b064a..907e3d2 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -24,7 +24,7 @@
 Element::Element(Context *rsc) : ObjectBase(rsc) {
     mBits = 0;
     mBitsUnpadded = 0;
-    mFields = NULL;
+    mFields = nullptr;
     mFieldCount = 0;
     mHasReference = false;
     memset(&mHal, 0, sizeof(mHal));
@@ -42,10 +42,14 @@
 }
 
 void Element::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateElement.mElements.size(); ct++) {
-        if (mRSC->mStateElement.mElements[ct] == this) {
-            mRSC->mStateElement.mElements.removeAt(ct);
-            break;
+    auto &elements = mRSC->mStateElement.mElements;  // context-wide cache of live elements
+
+    for (auto elIter = elements.begin(), endIter = elements.end();
+         elIter != endIter; elIter++) {
+
+        if (this == *elIter) {  // drop this element's own cache entry (first match only)
+            elements.erase(elIter);  // erase() invalidates elIter and the cached endIter
+            return;  // must leave now: continuing the loop would use invalidated iterators
         }
     }
 }
@@ -57,7 +61,7 @@
         }
         delete [] mFields;
     }
-    mFields = NULL;
+    mFields = nullptr;
     mFieldCount = 0;
     mHasReference = false;
 
@@ -125,7 +129,7 @@
     RsA3DClassID classID = (RsA3DClassID)stream->loadU32();
     if (classID != RS_A3D_CLASS_ID_ELEMENT) {
         ALOGE("element loading skipped due to invalid class id\n");
-        return NULL;
+        return nullptr;
     }
 
     const char *name = stream->loadString();
@@ -249,7 +253,7 @@
     void* allocMem = rsc->mHal.funcs.allocRuntimeMem(sizeof(Element), 0);
     if (!allocMem) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Couldn't allocate memory for Element");
-        return NULL;
+        return nullptr;
     }
 
     Element *e = new (allocMem) Element(rsc);
@@ -264,7 +268,7 @@
 
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push(e);
+    rsc->mStateElement.mElements.push_back(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -313,7 +317,7 @@
     void* allocMem = rsc->mHal.funcs.allocRuntimeMem(sizeof(Element), 0);
     if (!allocMem) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Couldn't allocate memory for Element");
-        return NULL;
+        return nullptr;
     }
 
     Element *e = new (allocMem) Element(rsc);
@@ -339,7 +343,7 @@
     e->compute();
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push(e);
+    rsc->mStateElement.mElements.push_back(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -390,7 +394,7 @@
 }
 
 void Element::callUpdateCacheObject(const Context *rsc, void *dstObj) const {
-    if (rsc->mHal.funcs.element.updateCachedObject != NULL) {
+    if (rsc->mHal.funcs.element.updateCachedObject != nullptr) {
         rsc->mHal.funcs.element.updateCachedObject(rsc, this, (rs_element *)dstObj);
     } else {
         *((const void **)dstObj) = this;
diff --git a/rsElement.h b/rsElement.h
index 5a3bc13..5c44ba0 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -17,6 +17,8 @@
 #ifndef ANDROID_STRUCTURED_ELEMENT_H
 #define ANDROID_STRUCTURED_ELEMENT_H
 
+#include <vector>
+
 #include "rsComponent.h"
 #include "rsUtils.h"
 #include "rsDefines.h"
@@ -123,8 +125,8 @@
     static const Element* create(Context *rsc, size_t count,
                                  const Element **ein,
                                  const char **nin,
-                                 const size_t * lengths = NULL,
-                                 const uint32_t *asin = NULL) {
+                                 const size_t * lengths = nullptr,
+                                 const uint32_t *asin = nullptr) {
         ObjectBaseRef<const Element> elem = createRef(rsc, count, ein, nin, lengths, asin);
         elem->incUserRef();
         return elem.get();
@@ -170,7 +172,7 @@
     ~ElementState();
 
     // Cache of all existing elements.
-    Vector<Element *> mElements;
+    std::vector<Element *> mElements;
 };
 
 
diff --git a/rsFBOCache.cpp b/rsFBOCache.cpp
index 1da327f..2185235 100644
--- a/rsFBOCache.cpp
+++ b/rsFBOCache.cpp
@@ -28,7 +28,7 @@
     mHal.state.colorTargetsCount = 1;
     mHal.state.colorTargets = new Allocation*[mHal.state.colorTargetsCount];
     mColorTargets = new ObjectBaseRef<Allocation>[mHal.state.colorTargetsCount];
-    resetAll(NULL);
+    resetAll(nullptr);
 }
 
 FBOCache::~FBOCache() {
@@ -49,7 +49,7 @@
         ALOGE("Invalid render target index");
         return;
     }
-    if (a != NULL) {
+    if (a != nullptr) {
         if (!(a->getIsTexture() || (a->mHal.state.usageFlags & RS_ALLOCATION_USAGE_IO_OUTPUT))) {
             ALOGE("Invalid Color Target");
             return;
@@ -61,7 +61,7 @@
 }
 
 void FBOCache::bindDepthTarget(Context *rsc, Allocation *a) {
-    if (a != NULL) {
+    if (a != nullptr) {
         if (!a->getIsRenderTarget()) {
             ALOGE("Invalid Depth Target");
             return;
@@ -74,11 +74,11 @@
 
 void FBOCache::resetAll(Context *) {
     for (uint32_t i = 0; i < mHal.state.colorTargetsCount; i ++) {
-        mColorTargets[i].set(NULL);
-        mHal.state.colorTargets[i] = NULL;
+        mColorTargets[i].set(nullptr);
+        mHal.state.colorTargets[i] = nullptr;  // raw pointer in HAL state is cleared alongside the ref above
     }
-    mDepthTarget.set(NULL);
-    mHal.state.depthTarget = NULL;
+    mDepthTarget.set(nullptr);
+    mHal.state.depthTarget = nullptr;  // same pairing for the depth target
     mDirty = true;
 }
 
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index a589033..6f14637 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -32,11 +32,11 @@
 using namespace android::renderscript;
 
 FileA3D::FileA3D(Context *rsc) : ObjectBase(rsc) {
-    mAlloc = NULL;
-    mData = NULL;
-    mWriteStream = NULL;
-    mReadStream = NULL;
-    mAsset = NULL;
+    mAlloc = nullptr;
+    mData = nullptr;
+    mWriteStream = nullptr;
+    mReadStream = nullptr;
+    mAsset = nullptr;
 
     mMajorVersion = 0;
     mMinorVersion = 1;
@@ -86,8 +86,8 @@
             entry->mOffset = headerStream->loadU32();
             entry->mLength = headerStream->loadU32();
         }
-        entry->mRsObj = NULL;
-        mIndex.push(entry);
+        entry->mRsObj = nullptr;
+        mIndex.push_back(entry);
     }
 }
 
@@ -225,17 +225,17 @@
     if (index < mIndex.size()) {
         return mIndex[index];
     }
-    return NULL;
+    return nullptr;
 }
 
 ObjectBase *FileA3D::initializeFromEntry(size_t index) {
     if (index >= mIndex.size()) {
-        return NULL;
+        return nullptr;
     }
 
     FileA3D::A3DIndexEntry *entry = mIndex[index];
     if (!entry) {
-        return NULL;
+        return nullptr;
     }
 
     if (entry->mRsObj) {
@@ -247,7 +247,7 @@
     mReadStream->reset(entry->mOffset);
     switch (entry->mType) {
         case RS_A3D_CLASS_ID_UNKNOWN:
-            return NULL;
+            return nullptr;
         case RS_A3D_CLASS_ID_MESH:
             entry->mRsObj = Mesh::createFromStream(mRSC, mReadStream);
             break;
@@ -379,7 +379,7 @@
     indexEntry->mType = obj->getClassId();
     indexEntry->mOffset = mWriteStream->getPos();
     indexEntry->mRsObj = obj;
-    mWriteIndex.push(indexEntry);
+    mWriteIndex.push_back(indexEntry);
     obj->serialize(con, mWriteStream);
     indexEntry->mLength = mWriteStream->getPos() - indexEntry->mOffset;
     mWriteStream->align(4);
@@ -389,7 +389,7 @@
     FileA3D *fa3d = static_cast<FileA3D *>(file);
     if (!fa3d) {
         ALOGE("Can't load entry. No valid file");
-        return NULL;
+        return nullptr;
     }
 
     ObjectBase *obj = fa3d->initializeFromEntry(index);
@@ -418,7 +418,7 @@
     }
 
     uint32_t numFileEntries = fa3d->getNumIndexEntries();
-    if (numFileEntries != numEntries || numEntries == 0 || fileEntries == NULL) {
+    if (numFileEntries != numEntries || numEntries == 0 || fileEntries == nullptr) {
         ALOGE("Can't load index entries. Invalid number requested");
         return;
     }
@@ -431,9 +431,9 @@
 }
 
 RsFile rsaFileA3DCreateFromMemory(RsContext con, const void *data, uint32_t len) {
-    if (data == NULL) {
-        ALOGE("File load failed. Asset stream is NULL");
-        return NULL;
+    if (data == nullptr) {
+        ALOGE("File load failed. Asset stream is nullptr");
+        return nullptr;
     }
 
     Context *rsc = static_cast<Context *>(con);
@@ -454,18 +454,18 @@
     fa3d->load(asset);
     return fa3d;
 #else
-    return NULL;
+    return nullptr;
 #endif
 }
 
 RsFile rsaFileA3DCreateFromFile(RsContext con, const char *path) {
-    if (path == NULL) {
-        ALOGE("File load failed. Path is NULL");
-        return NULL;
+    if (path == nullptr) {
+        ALOGE("File load failed. Path is nullptr");
+        return nullptr;
     }
 
     Context *rsc = static_cast<Context *>(con);
-    FileA3D *fa3d = NULL;
+    FileA3D *fa3d = nullptr;
 
     FILE *f = fopen(path, "rb");
     if (f) {
diff --git a/rsFileA3D.h b/rsFileA3D.h
index 8bf36b9..0c8b3d6 100644
--- a/rsFileA3D.h
+++ b/rsFileA3D.h
@@ -88,15 +88,13 @@
     Asset *mAsset;
 
     OStream *mWriteStream;
-    Vector<A3DIndexEntry*> mWriteIndex;
+    std::vector<A3DIndexEntry*> mWriteIndex;
 
     IStream *mReadStream;
-    Vector<A3DIndexEntry*> mIndex;
+    std::vector<A3DIndexEntry*> mIndex;
 };
 
 
 }
 }
 #endif //ANDROID_RS_FILE_A3D_H
-
-
diff --git a/rsFont.cpp b/rsFont.cpp
index 8feef2d..2f09384 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -33,10 +33,10 @@
 using namespace android;
 using namespace android::renderscript;
 
-Font::Font(Context *rsc) : ObjectBase(rsc), mCachedGlyphs(NULL) {
+Font::Font(Context *rsc) : ObjectBase(rsc) {
     mInitialized = false;
     mHasKerning = false;
-    mFace = NULL;
+    mFace = nullptr;
 }
 
 bool Font::init(const char *name, float fontSize, uint32_t dpi, const void *data, uint32_t dataLen) {
@@ -47,7 +47,7 @@
     }
 
     FT_Error error = 0;
-    if (data != NULL && dataLen > 0) {
+    if (data != nullptr && dataLen > 0) {
         error = FT_New_Memory_Face(mRSC->mStateFont.getLib(), (const FT_Byte*)data, dataLen, 0, &mFace);
     } else {
         error = FT_New_Face(mRSC->mStateFont.getLib(), name, 0, &mFace);
@@ -76,17 +76,21 @@
 }
 
 void Font::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateFont.mActiveFonts.size(); ct++) {
-        if (mRSC->mStateFont.mActiveFonts[ct] == this) {
-            mRSC->mStateFont.mActiveFonts.removeAt(ct);
-            break;
+    auto &activeFonts = mRSC->mStateFont.mActiveFonts;  // list that Font::create() searches and appends to
+
+    for (auto font = activeFonts.begin(), end = activeFonts.end(); font != end;
+         font++) {
+
+        if (this == *font) {  // unregister this font (first match only)
+            activeFonts.erase(font);  // erase() invalidates `font` and the cached `end`
+            return;  // must leave now: continuing the loop would use invalidated iterators
         }
     }
 }
 
 void Font::invalidateTextureCache() {
-    for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        mCachedGlyphs.valueAt(i)->mIsValid = false;
+    for (auto &glyphEntry : mCachedGlyphs) {  // visit stored entries; operator[] would look up *keys* 0..size-1
+        glyphEntry.second->mIsValid = false;  // mark every cached glyph as no longer valid
     }
 }
 
@@ -162,12 +166,12 @@
                      uint32_t start, int32_t numGlyphs,
                      RenderMode mode, Rect *bounds,
                      uint8_t *bitmap, uint32_t bitmapW, uint32_t bitmapH) {
-    if (!mInitialized || numGlyphs == 0 || text == NULL || len == 0) {
+    if (!mInitialized || numGlyphs == 0 || text == nullptr || len == 0) {
         return;
     }
 
     if (mode == Font::MEASURE) {
-        if (bounds == NULL) {
+        if (bounds == nullptr) {
             ALOGE("No return rectangle provided to measure text");
             return;
         }
@@ -224,8 +228,8 @@
 
 Font::CachedGlyphInfo* Font::getCachedUTFChar(int32_t utfChar) {
 
-    CachedGlyphInfo *cachedGlyph = mCachedGlyphs.valueFor((uint32_t)utfChar);
-    if (cachedGlyph == NULL) {
+    CachedGlyphInfo *cachedGlyph = mCachedGlyphs[(uint32_t)utfChar];
+    if (cachedGlyph == nullptr) {
         cachedGlyph = cacheGlyph((uint32_t)utfChar);
     }
     // Is the glyph still in texture cache?
@@ -283,7 +287,7 @@
 
 Font::CachedGlyphInfo *Font::cacheGlyph(uint32_t glyph) {
     CachedGlyphInfo *newGlyph = new CachedGlyphInfo();
-    mCachedGlyphs.add(glyph, newGlyph);
+    mCachedGlyphs[glyph] = newGlyph;
 #ifndef ANDROID_RS_SERIALIZE
     newGlyph->mGlyphIndex = FT_Get_Char_Index(mFace, glyph);
     newGlyph->mIsValid = false;
@@ -296,11 +300,14 @@
 Font * Font::create(Context *rsc, const char *name, float fontSize, uint32_t dpi,
                     const void *data, uint32_t dataLen) {
     rsc->mStateFont.checkInit();
-    Vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
+    std::vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
 
     for (uint32_t i = 0; i < activeFonts.size(); i ++) {
         Font *ithFont = activeFonts[i];
-        if (ithFont->mFontName == name && ithFont->mFontSize == fontSize && ithFont->mDpi == dpi) {
+        if (ithFont->mFontName == name &&
+            ithFont->mFontSize == fontSize &&
+            ithFont->mDpi == dpi) {
+
             return ithFont;
         }
     }
@@ -308,13 +315,13 @@
     Font *newFont = new Font(rsc);
     bool isInitialized = newFont->init(name, fontSize, dpi, data, dataLen);
     if (isInitialized) {
-        activeFonts.push(newFont);
+        activeFonts.push_back(newFont);
         rsc->mStateFont.precacheLatin(newFont);
         return newFont;
     }
 
     ObjectBase::checkDelete(newFont);
-    return NULL;
+    return nullptr;
 }
 
 Font::~Font() {
@@ -325,7 +332,7 @@
 #endif
 
     for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        CachedGlyphInfo *glyph = mCachedGlyphs.valueAt(i);
+        CachedGlyphInfo *glyph = mCachedGlyphs[i];
         delete glyph;
     }
 }
@@ -334,9 +341,9 @@
     mInitialized = false;
     mMaxNumberOfQuads = 1024;
     mCurrentQuadIndex = 0;
-    mRSC = NULL;
+    mRSC = nullptr;
 #ifndef ANDROID_RS_SERIALIZE
-    mLibrary = NULL;
+    mLibrary = nullptr;
 #endif //ANDROID_RS_SERIALIZE
 
     float gamma = DEFAULT_TEXT_GAMMA;
@@ -348,17 +355,17 @@
     char property[PROPERTY_VALUE_MAX];
 
     // Get the gamma
-    if (property_get(PROPERTY_TEXT_GAMMA, property, NULL) > 0) {
+    if (property_get(PROPERTY_TEXT_GAMMA, property, nullptr) > 0) {
         gamma = atof(property);
     }
 
     // Get the black gamma threshold
-    if (property_get(PROPERTY_TEXT_BLACK_GAMMA_THRESHOLD, property, NULL) > 0) {
+    if (property_get(PROPERTY_TEXT_BLACK_GAMMA_THRESHOLD, property, nullptr) > 0) {
         blackThreshold = atoi(property);
     }
 
     // Get the white gamma threshold
-    if (property_get(PROPERTY_TEXT_WHITE_GAMMA_THRESHOLD, property, NULL) > 0) {
+    if (property_get(PROPERTY_TEXT_WHITE_GAMMA_THRESHOLD, property, nullptr) > 0) {
         whiteThreshold = atoi(property);
     }
 #endif
@@ -386,7 +393,7 @@
         FT_Error error = FT_Init_FreeType(&mLibrary);
         if (error) {
             ALOGE("Unable to initialize freetype");
-            return NULL;
+            return nullptr;
         }
     }
 
@@ -551,29 +558,39 @@
     mCacheHeight = 256;
     mCacheWidth = 1024;
     ObjectBaseRef<Type> texType = Type::getTypeRef(mRSC, alphaElem.get(),
-                                                   mCacheWidth, mCacheHeight, 0, false, false, 0);
+                                                   mCacheWidth, mCacheHeight,
+                                                   0, false, false, 0);
+
     mCacheBuffer = new uint8_t[mCacheWidth * mCacheHeight];
 
 
-    Allocation *cacheAlloc = Allocation::createAllocation(mRSC, texType.get(),
-                                RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
+    Allocation *cacheAlloc =
+        Allocation::createAllocation(mRSC, texType.get(),
+                                     RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
     mTextTexture.set(cacheAlloc);
 
     // Split up our cache texture into lines of certain widths
     int32_t nextLine = 0;
-    mCacheLines.push(new CacheTextureLine(16, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(40, texType->getDimX(), nextLine, 0));
-    nextLine += mCacheLines.top()->mMaxHeight;
-    mCacheLines.push(new CacheTextureLine(texType->getDimY() - nextLine, texType->getDimX(), nextLine, 0));
+    mCacheLines.push_back(new CacheTextureLine(16, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(40, texType->getDimX(),
+                          nextLine, 0));
+    nextLine += mCacheLines.back()->mMaxHeight;
+    mCacheLines.push_back(new CacheTextureLine(texType->getDimY() - nextLine,
+                          texType->getDimX(), nextLine, 0));
 }
 
 // Avoid having to reallocate memory and render quad by quad
@@ -835,7 +852,7 @@
 #ifndef ANDROID_RS_SERIALIZE
     if (mLibrary) {
         FT_Done_FreeType( mLibrary );
-        mLibrary = NULL;
+        mLibrary = nullptr;
     }
 #endif //ANDROID_RS_SERIALIZE
 }
diff --git a/rsFont.h b/rsFont.h
index 7bac508..bc343c1 100644
--- a/rsFont.h
+++ b/rsFont.h
@@ -17,9 +17,10 @@
 #ifndef ANDROID_RS_FONT_H
 #define ANDROID_RS_FONT_H
 
+#include <map>
+#include <vector>
+
 #include "rsStream.h"
-#include <utils/Vector.h>
-#include <utils/KeyedVector.h>
 
 struct FT_LibraryRec_;
 struct FT_FaceRec_;
@@ -73,7 +74,7 @@
     }
 
     static Font * create(Context *rsc, const char *name, float fontSize, uint32_t dpi,
-                         const void *data = NULL, uint32_t dataLen = 0);
+                         const void *data = nullptr, uint32_t dataLen = 0);
 
 protected:
 
@@ -84,8 +85,8 @@
     // Last two variables are the initial pen position
     void renderUTF(const char *text, uint32_t len, int32_t x, int32_t y,
                    uint32_t start, int32_t numGlyphs,
-                   RenderMode mode = FRAMEBUFFER, Rect *bounds = NULL,
-                   uint8_t *bitmap = NULL, uint32_t bitmapW = 0, uint32_t bitmapH = 0);
+                   RenderMode mode = FRAMEBUFFER, Rect *bounds = nullptr,
+                   uint8_t *bitmap = nullptr, uint32_t bitmapW = 0, uint32_t bitmapH = 0);
 
     void invalidateTextureCache();
     struct CachedGlyphInfo
@@ -117,14 +118,14 @@
     uint32_t mDpi;
 
     Font(Context *rsc);
-    bool init(const char *name, float fontSize, uint32_t dpi, const void *data = NULL, uint32_t dataLen = 0);
+    bool init(const char *name, float fontSize, uint32_t dpi, const void *data = nullptr, uint32_t dataLen = 0);
 
     virtual void preDestroy() const;
     FT_FaceRec_ *mFace;
     bool mInitialized;
     bool mHasKerning;
 
-    DefaultKeyedVector<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
+    std::map<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
     CachedGlyphInfo* getCachedUTFChar(int32_t utfChar);
 
     CachedGlyphInfo *cacheGlyph(uint32_t glyph);
@@ -148,8 +149,8 @@
     void renderText(const char *text, uint32_t len, int32_t x, int32_t y,
                     uint32_t startIndex = 0, int numGlyphs = -1,
                     Font::RenderMode mode = Font::FRAMEBUFFER,
-                    Font::Rect *bounds = NULL,
-                    uint8_t *bitmap = NULL, uint32_t bitmapW = 0, uint32_t bitmapH = 0);
+                    Font::Rect *bounds = nullptr,
+                    uint8_t *bitmap = nullptr, uint32_t bitmapW = 0, uint32_t bitmapH = 0);
 
     void measureText(const char *text, uint32_t len, Font::Rect *bounds);
 
@@ -178,7 +179,7 @@
         bool fitBitmap(FT_Bitmap_ *bitmap, uint32_t *retOriginX, uint32_t *retOriginY);
     };
 
-    Vector<CacheTextureLine*> mCacheLines;
+    std::vector<CacheTextureLine*> mCacheLines;
     uint32_t getRemainingCacheCapacity();
 
     void precacheLatin(Font *font);
@@ -203,7 +204,7 @@
     FT_LibraryRec_ *mLibrary;
     FT_LibraryRec_ *getLib();
 #endif //ANDROID_RS_SERIALIZE
-    Vector<Font*> mActiveFonts;
+    std::vector<Font*> mActiveFonts;
 
     // Render state for the font
     ObjectBaseRef<Allocation> mFontShaderFConstant;
diff --git a/rsGrallocConsumer.cpp b/rsGrallocConsumer.cpp
index c5d37b2..ea333cc 100644
--- a/rsGrallocConsumer.cpp
+++ b/rsGrallocConsumer.cpp
@@ -87,7 +87,7 @@
         }
     }
 
-    void *bufferPointer = NULL;
+    void *bufferPointer = nullptr;
     android_ycbcr ycbcr = android_ycbcr();
 
     if (mSlots[buf].mGraphicBuffer->getPixelFormat() ==
@@ -182,7 +182,7 @@
     }
 
     mAcquiredBuffer.mSlot = BufferQueue::INVALID_BUFFER_SLOT;
-    mAcquiredBuffer.mBufferPointer = NULL;
+    mAcquiredBuffer.mBufferPointer = nullptr;
     mAcquiredBuffer.mGraphicBuffer.clear();
     return OK;
 }
diff --git a/rsGrallocConsumer.h b/rsGrallocConsumer.h
index 9e4fc58..1f4daef 100644
--- a/rsGrallocConsumer.h
+++ b/rsGrallocConsumer.h
@@ -17,12 +17,12 @@
 #ifndef ANDROID_RS_GRALLOC_CONSUMER_H
 #define ANDROID_RS_GRALLOC_CONSUMER_H
 
+#include <vector>
+
 #include <gui/ConsumerBase.h>
 
 #include <ui/GraphicBuffer.h>
 
-#include <utils/String8.h>
-#include <utils/Vector.h>
 #include <utils/threads.h>
 
 
@@ -65,7 +65,7 @@
 
         AcquiredBuffer() :
                 mSlot(BufferQueue::INVALID_BUFFER_SLOT),
-                mBufferPointer(NULL) {
+                mBufferPointer(nullptr) {
         }
     };
     AcquiredBuffer mAcquiredBuffer;
@@ -75,4 +75,3 @@
 } // namespace android
 
 #endif // ANDROID_RS_GRALLOC_CONSUMER_H
-
diff --git a/rsMesh.cpp b/rsMesh.cpp
index 8decfc5..08d85fb 100644
--- a/rsMesh.cpp
+++ b/rsMesh.cpp
@@ -22,23 +22,23 @@
 using namespace android::renderscript;
 
 Mesh::Mesh(Context *rsc) : ObjectBase(rsc) {
-    mHal.drv = NULL;
-    mHal.state.primitives = NULL;
+    mHal.drv = nullptr;
+    mHal.state.primitives = nullptr;
     mHal.state.primitivesCount = 0;
-    mHal.state.indexBuffers = NULL;
+    mHal.state.indexBuffers = nullptr;
     mHal.state.indexBuffersCount = 0;
-    mHal.state.vertexBuffers = NULL;
+    mHal.state.vertexBuffers = nullptr;
     mHal.state.vertexBuffersCount = 0;
     mInitialized = false;
 
-    mVertexBuffers = NULL;
-    mIndexBuffers = NULL;
+    mVertexBuffers = nullptr;
+    mIndexBuffers = nullptr;
 }
 
 Mesh::Mesh(Context *rsc,
            uint32_t vertexBuffersCount,
            uint32_t primitivesCount) : ObjectBase(rsc) {
-    mHal.drv = NULL;
+    mHal.drv = nullptr;
     mHal.state.primitivesCount = primitivesCount;
     mHal.state.indexBuffersCount = primitivesCount;
     mHal.state.primitives = new RsPrimitive[mHal.state.primitivesCount];
@@ -47,12 +47,12 @@
         mHal.state.primitives[i] = RS_PRIMITIVE_POINT;
     }
     for (uint32_t i = 0; i < mHal.state.indexBuffersCount; i ++) {
-        mHal.state.indexBuffers[i] = NULL;
+        mHal.state.indexBuffers[i] = nullptr;
     }
     mHal.state.vertexBuffersCount = vertexBuffersCount;
     mHal.state.vertexBuffers = new Allocation *[mHal.state.vertexBuffersCount];
     for (uint32_t i = 0; i < mHal.state.vertexBuffersCount; i ++) {
-        mHal.state.vertexBuffers[i] = NULL;
+        mHal.state.vertexBuffers[i] = nullptr;
     }
 
     mVertexBuffers = new ObjectBaseRef<Allocation>[mHal.state.vertexBuffersCount];
@@ -108,13 +108,13 @@
     RsA3DClassID classID = (RsA3DClassID)stream->loadU32();
     if (classID != RS_A3D_CLASS_ID_MESH) {
         ALOGE("mesh loading skipped due to invalid class id");
-        return NULL;
+        return nullptr;
     }
 
     const char *name = stream->loadString();
 
     uint32_t vertexBuffersCount = stream->loadU32();
-    ObjectBaseRef<Allocation> *vertexBuffers = NULL;
+    ObjectBaseRef<Allocation> *vertexBuffers = nullptr;
     if (vertexBuffersCount) {
         vertexBuffers = new ObjectBaseRef<Allocation>[vertexBuffersCount];
 
@@ -125,8 +125,8 @@
     }
 
     uint32_t primitivesCount = stream->loadU32();
-    ObjectBaseRef<Allocation> *indexBuffers = NULL;
-    RsPrimitive *primitives = NULL;
+    ObjectBaseRef<Allocation> *indexBuffers = nullptr;
+    RsPrimitive *primitives = nullptr;
     if (primitivesCount) {
         indexBuffers = new ObjectBaseRef<Allocation>[primitivesCount];
         primitives = new RsPrimitive[primitivesCount];
@@ -213,11 +213,11 @@
 }
 
 void Mesh::computeBBox(Context *rsc) {
-    float *posPtr = NULL;
+    float *posPtr = nullptr;
     uint32_t vectorSize = 0;
     uint32_t stride = 0;
     uint32_t numVerts = 0;
-    Allocation *posAlloc = NULL;
+    Allocation *posAlloc = nullptr;
     // First we need to find the position ptr and stride
     for (uint32_t ct=0; ct < mHal.state.vertexBuffersCount; ct++) {
         const Type *bufferType = mHal.state.vertexBuffers[ct]->getType();
diff --git a/rsMesh.h b/rsMesh.h
index 9b61ebe..c7ee088 100644
--- a/rsMesh.h
+++ b/rsMesh.h
@@ -45,7 +45,7 @@
             Allocation **vertexBuffers;
             uint32_t vertexBuffersCount;
 
-            // indexBuffers[i] could be NULL, in which case only primitives[i] is used
+            // indexBuffers[i] could be nullptr, in which case only primitives[i] is used
             Allocation **indexBuffers;
             uint32_t indexBuffersCount;
             RsPrimitive *primitives;
diff --git a/rsMutex.cpp b/rsMutex.cpp
index 6512372..557c588 100644
--- a/rsMutex.cpp
+++ b/rsMutex.cpp
@@ -28,7 +28,7 @@
 }
 
 bool Mutex::init() {
-    int status = pthread_mutex_init(&mMutex, NULL);
+    int status = pthread_mutex_init(&mMutex, nullptr);
     if (status) {
         ALOGE("Mutex::Mutex init failure");
         return false;
diff --git a/rsObjectBase.cpp b/rsObjectBase.cpp
index 0a0961f..dc94bcc 100644
--- a/rsObjectBase.cpp
+++ b/rsObjectBase.cpp
@@ -26,10 +26,10 @@
     mUserRefCount = 0;
     mSysRefCount = 0;
     mRSC = rsc;
-    mNext = NULL;
-    mPrev = NULL;
-    mDH = NULL;
-    mName = NULL;
+    mNext = nullptr;
+    mPrev = nullptr;
+    mDH = nullptr;
+    mName = nullptr;
 
 #if RS_OBJECT_DEBUG
     mDH = new DebugHelper();
@@ -45,7 +45,7 @@
 #if RS_OBJECT_DEBUG
     mDH->dump();
     delete mDH;
-    mDH = NULL;
+    mDH = nullptr;
 #endif
 
     free(const_cast<char *>(mName));
@@ -203,8 +203,8 @@
     if (mNext) {
         mNext->mPrev = mPrev;
     }
-    mPrev = NULL;
-    mNext = NULL;
+    mPrev = nullptr;
+    mNext = nullptr;
 }
 
 void ObjectBase::zeroAllUserRef(Context *rsc) {
diff --git a/rsObjectBase.h b/rsObjectBase.h
index 66977a9..cd1b16e 100644
--- a/rsObjectBase.h
+++ b/rsObjectBase.h
@@ -96,7 +96,7 @@
 class ObjectBaseRef {
 public:
     ObjectBaseRef() {
-        mRef = NULL;
+        mRef = nullptr;
     }
 
     ObjectBaseRef(const ObjectBaseRef &ref) {
@@ -142,7 +142,7 @@
         if (mRef) {
             mRef->decSysRef();
         }
-        mRef = NULL;
+        mRef = nullptr;
     }
 
     inline T * get() const {
diff --git a/rsProgram.cpp b/rsProgram.cpp
index d5cb344..5b24c0d 100644
--- a/rsProgram.cpp
+++ b/rsProgram.cpp
@@ -103,21 +103,21 @@
     mHal.state.constantsCount = 0;
     mHal.state.texturesCount = 0;
 
-    if (mUserShader != NULL) {
+    if (mUserShader != nullptr) {
         delete[] mUserShader;
-        mUserShader = NULL;
+        mUserShader = nullptr;
     }
     mUserShaderLen = 0;
 }
 
 bool Program::freeChildren() {
     for (uint32_t ct=0; ct < mHal.state.constantsCount; ct++) {
-        bindAllocation(NULL, NULL, ct);
+        bindAllocation(nullptr, nullptr, ct);
     }
 
     for (uint32_t ct=0; ct < mHal.state.texturesCount; ct++) {
-        bindTexture(NULL, ct, NULL);
-        bindSampler(NULL, ct, NULL);
+        bindTexture(nullptr, ct, nullptr);
+        bindSampler(nullptr, ct, nullptr);
     }
     return false;
 }
@@ -125,32 +125,32 @@
 void Program::initMemberVars() {
     mDirty = true;
 
-    mHal.drv = NULL;
-    mHal.state.textures = NULL;
-    mHal.state.samplers = NULL;
-    mHal.state.textureTargets = NULL;
-    mHal.state.inputElements = NULL;
-    mHal.state.constantTypes = NULL;
-    mHal.state.constants = NULL;
+    mHal.drv = nullptr;
+    mHal.state.textures = nullptr;
+    mHal.state.samplers = nullptr;
+    mHal.state.textureTargets = nullptr;
+    mHal.state.inputElements = nullptr;
+    mHal.state.constantTypes = nullptr;
+    mHal.state.constants = nullptr;
 
     mHal.state.inputElementsCount = 0;
     mHal.state.constantsCount = 0;
     mHal.state.texturesCount = 0;
 
-    mTextures = NULL;
-    mSamplers = NULL;
-    mInputElements = NULL;
-    mConstantTypes = NULL;
-    mConstants = NULL;
+    mTextures = nullptr;
+    mSamplers = nullptr;
+    mInputElements = nullptr;
+    mConstantTypes = nullptr;
+    mConstants = nullptr;
 
     mIsInternal = false;
 
-    mUserShader = NULL;
+    mUserShader = nullptr;
     mUserShaderLen = 0;
 }
 
 void Program::bindAllocation(Context *rsc, Allocation *alloc, uint32_t slot) {
-    if (alloc != NULL) {
+    if (alloc != nullptr) {
         if (slot >= mHal.state.constantsCount) {
             ALOGE("Attempt to bind alloc at slot %u, on shader id %" PRIuPTR ", but const count is %u",
                  slot, (uintptr_t)this, mHal.state.constantsCount);
diff --git a/rsProgramFragment.cpp b/rsProgramFragment.cpp
index 360d3ab..7af6738 100644
--- a/rsProgramFragment.cpp
+++ b/rsProgramFragment.cpp
@@ -45,7 +45,7 @@
         rsc->setError(RS_ERROR_BAD_SHADER, "Cannot  set fixed function emulation color on user program");
         return;
     }
-    if (mHal.state.constants[0] == NULL) {
+    if (mHal.state.constants[0] == nullptr) {
         ALOGE("Unable to set fixed function emulation color because allocation is missing");
         rsc->setError(RS_ERROR_BAD_SHADER, "Unable to set fixed function emulation color because allocation is missing");
         return;
@@ -81,16 +81,16 @@
 }
 
 ProgramFragment *ProgramFragment::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;
 }
 
 ProgramFragmentState::ProgramFragmentState() {
-    mPF = NULL;
+    mPF = nullptr;
 }
 
 ProgramFragmentState::~ProgramFragmentState() {
     ObjectBase::checkDelete(mPF);
-    mPF = NULL;
+    mPF = nullptr;
 }
 
 void ProgramFragmentState::init(Context *rsc) {
@@ -118,7 +118,7 @@
     Allocation *constAlloc = Allocation::createAllocation(rsc, inputType.get(),
                               RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS);
     ProgramFragment *pf = new ProgramFragment(rsc, shaderString, strlen(shaderString),
-                                              NULL, 0, NULL, tmp, 2);
+                                              nullptr, 0, nullptr, tmp, 2);
     pf->bindAllocation(rsc, constAlloc, 0);
     pf->setConstantColor(rsc, 1.0f, 1.0f, 1.0f, 1.0f);
 
diff --git a/rsProgramFragment.h b/rsProgramFragment.h
index e7456b9..1357bfc 100644
--- a/rsProgramFragment.h
+++ b/rsProgramFragment.h
@@ -55,7 +55,7 @@
     void deinit(Context *rsc);
 
     ObjectBaseRef<ProgramFragment> mDefault;
-    Vector<ProgramFragment *> mPrograms;
+    std::vector<ProgramFragment *> mPrograms;
 
     ObjectBaseRef<ProgramFragment> mLast;
 };
@@ -63,7 +63,3 @@
 }
 }
 #endif
-
-
-
-
diff --git a/rsProgramRaster.cpp b/rsProgramRaster.cpp
index 4f27f2e..d47e588 100644
--- a/rsProgramRaster.cpp
+++ b/rsProgramRaster.cpp
@@ -31,10 +31,14 @@
 }
 
 void ProgramRaster::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateRaster.mRasterPrograms.size(); ct++) {
-        if (mRSC->mStateRaster.mRasterPrograms[ct] == this) {
-            mRSC->mStateRaster.mRasterPrograms.removeAt(ct);
-            break;
+    auto &rasters = mRSC->mStateRaster.mRasterPrograms;
+
+    for (auto prIter = rasters.begin(), endIter = rasters.end();
+         prIter != endIter; prIter++) {
+
+        if (this == *prIter) {
+            rasters.erase(prIter);
+            return;
         }
     }
 }
@@ -57,7 +61,7 @@
 }
 
 ProgramRaster *ProgramRaster::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;
 }
 
 ProgramRasterState::ProgramRasterState() {
@@ -94,7 +98,7 @@
     returnRef.set(pr);
 
     ObjectBase::asyncLock();
-    rsc->mStateRaster.mRasterPrograms.push(pr);
+    rsc->mStateRaster.mRasterPrograms.push_back(pr);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -111,4 +115,3 @@
 
 }
 }
-
diff --git a/rsProgramRaster.h b/rsProgramRaster.h
index e9a524b..207d74c 100644
--- a/rsProgramRaster.h
+++ b/rsProgramRaster.h
@@ -75,14 +75,10 @@
     ObjectBaseRef<ProgramRaster> mLast;
 
     // Cache of all existing raster programs.
-    Vector<ProgramRaster *> mRasterPrograms;
+    std::vector<ProgramRaster *> mRasterPrograms;
 };
 
 
 }
 }
 #endif
-
-
-
-
diff --git a/rsProgramStore.cpp b/rsProgramStore.cpp
index 83c1f2c..b07f820 100644
--- a/rsProgramStore.cpp
+++ b/rsProgramStore.cpp
@@ -42,10 +42,14 @@
 }
 
 void ProgramStore::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateFragmentStore.mStorePrograms.size(); ct++) {
-        if (mRSC->mStateFragmentStore.mStorePrograms[ct] == this) {
-            mRSC->mStateFragmentStore.mStorePrograms.removeAt(ct);
-            break;
+    auto &stores = mRSC->mStateFragmentStore.mStorePrograms;
+
+    for (auto psIter = stores.begin(), endIter = stores.end();
+         psIter != endIter; psIter++) {
+
+        if (this == *psIter) {
+            stores.erase(psIter);
+            return;
         }
     }
 }
@@ -67,7 +71,7 @@
 }
 
 ProgramStore *ProgramStore::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;
 }
 
 void ProgramStore::init() {
@@ -118,7 +122,7 @@
     pfs->init();
 
     ObjectBase::asyncLock();
-    rsc->mStateFragmentStore.mStorePrograms.push(pfs);
+    rsc->mStateFragmentStore.mStorePrograms.push_back(pfs);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsProgramStore.h b/rsProgramStore.h
index 9a7f7f1..06824fe 100644
--- a/rsProgramStore.h
+++ b/rsProgramStore.h
@@ -92,12 +92,9 @@
     ObjectBaseRef<ProgramStore> mLast;
 
     // Cache of all existing store programs.
-    Vector<ProgramStore *> mStorePrograms;
+    std::vector<ProgramStore *> mStorePrograms;
 };
 
 }
 }
 #endif
-
-
-
diff --git a/rsProgramVertex.cpp b/rsProgramVertex.cpp
index 9c0f7fb..baf1fff 100644
--- a/rsProgramVertex.cpp
+++ b/rsProgramVertex.cpp
@@ -41,7 +41,7 @@
     }
 
     if (!isUserProgram()) {
-        if (mHal.state.constants[0] == NULL) {
+        if (mHal.state.constants[0] == nullptr) {
             rsc->setError(RS_ERROR_FATAL_UNKNOWN,
                           "Unable to set fixed function emulation matrices because allocation is missing");
             return;
@@ -70,7 +70,7 @@
                       "Attempting to set fixed function emulation matrix projection on user program");
         return;
     }
-    if (mHal.state.constants[0] == NULL) {
+    if (mHal.state.constants[0] == nullptr) {
         rsc->setError(RS_ERROR_FATAL_UNKNOWN,
                       "Unable to set fixed function emulation matrix projection because allocation is missing");
         return;
@@ -88,7 +88,7 @@
                       "Attempting to set fixed function emulation matrix modelview on user program");
         return;
     }
-    if (mHal.state.constants[0] == NULL) {
+    if (mHal.state.constants[0] == nullptr) {
         rsc->setError(RS_ERROR_FATAL_UNKNOWN,
                       "Unable to set fixed function emulation matrix modelview because allocation is missing");
         return;
@@ -106,7 +106,7 @@
                       "Attempting to set fixed function emulation matrix texture on user program");
         return;
     }
-    if (mHal.state.constants[0] == NULL) {
+    if (mHal.state.constants[0] == nullptr) {
         rsc->setError(RS_ERROR_FATAL_UNKNOWN,
                       "Unable to set fixed function emulation matrix texture because allocation is missing");
         return;
@@ -124,7 +124,7 @@
                       "Attempting to get fixed function emulation matrix projection on user program");
         return;
     }
-    if (mHal.state.constants[0] == NULL) {
+    if (mHal.state.constants[0] == nullptr) {
         rsc->setError(RS_ERROR_FATAL_UNKNOWN,
                       "Unable to get fixed function emulation matrix projection because allocation is missing");
         return;
@@ -152,7 +152,7 @@
 }
 
 ProgramVertex *ProgramVertex::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;
 }
 
 
@@ -203,7 +203,7 @@
     tmp[3] = (uintptr_t)attrElem.get();
 
     ProgramVertex *pv = new ProgramVertex(rsc, shaderString, strlen(shaderString),
-                                          NULL, 0, NULL, tmp, 4);
+                                          nullptr, 0, nullptr, tmp, 4);
     Allocation *alloc = Allocation::createAllocation(rsc, inputType.get(),
                               RS_ALLOCATION_USAGE_SCRIPT | RS_ALLOCATION_USAGE_GRAPHICS_CONSTANTS);
     pv->bindAllocation(rsc, alloc, 0);
diff --git a/rsRuntime.h b/rsRuntime.h
index eb93e25..5a05883 100644
--- a/rsRuntime.h
+++ b/rsRuntime.h
@@ -158,7 +158,7 @@
                 Allocation *in,
                 Allocation *out,
                 const void *usr,
-                 uint32_t usrBytes,
+                uint32_t usrBytes,
                 const RsScriptCall *call);
 
 
diff --git a/rsSampler.cpp b/rsSampler.cpp
index 0cf0b55..924ba86 100644
--- a/rsSampler.cpp
+++ b/rsSampler.cpp
@@ -49,10 +49,14 @@
 }
 
 void Sampler::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateSampler.mAllSamplers.size(); ct++) {
-        if (mRSC->mStateSampler.mAllSamplers[ct] == this) {
-            mRSC->mStateSampler.mAllSamplers.removeAt(ct);
-            break;
+    auto &samplers = mRSC->mStateSampler.mAllSamplers;
+
+    for (auto sampleIter = samplers.begin(), endIter = samplers.end();
+         sampleIter != endIter; sampleIter++) {
+
+        if (this == *sampleIter) {
+            samplers.erase(sampleIter);
+            return;
         }
     }
 }
@@ -72,7 +76,7 @@
 }
 
 Sampler *Sampler::createFromStream(Context *rsc, IStream *stream) {
-    return NULL;
+    return nullptr;
 }
 
 ObjectBaseRef<Sampler> Sampler::getSampler(Context *rsc,
@@ -101,7 +105,7 @@
     void* allocMem = rsc->mHal.funcs.allocRuntimeMem(sizeof(Sampler), 0);
     if (!allocMem) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Couldn't allocate memory for Allocation");
-        return NULL;
+        return nullptr;
     }
 
     Sampler *s = new (allocMem) Sampler(rsc, magFilter, minFilter, wrapS, wrapT, wrapR, aniso);
@@ -113,7 +117,7 @@
 #endif
 
     ObjectBase::asyncLock();
-    rsc->mStateSampler.mAllSamplers.push(s);
+    rsc->mStateSampler.mAllSamplers.push_back(s);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsSampler.h b/rsSampler.h
index 2fdf707..3f5855f 100644
--- a/rsSampler.h
+++ b/rsSampler.h
@@ -96,12 +96,9 @@
         }
     }
     // Cache of all existing raster programs.
-    Vector<Sampler *> mAllSamplers;
+    std::vector<Sampler *> mAllSamplers;
 };
 
 }
 }
 #endif //ANDROID_RS_SAMPLER_H
-
-
-
diff --git a/rsScript.cpp b/rsScript.cpp
index dd962d1..3059833 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -24,8 +24,8 @@
     memset(&mEnviroment, 0, sizeof(mEnviroment));
     memset(&mHal, 0, sizeof(mHal));
 
-    mSlots = NULL;
-    mTypes = NULL;
+    mSlots = nullptr;
+    mTypes = nullptr;
     mInitialized = false;
     mHasObjectSlots = false;
 }
@@ -33,11 +33,11 @@
 Script::~Script() {
     if (mSlots) {
         delete [] mSlots;
-        mSlots = NULL;
+        mSlots = nullptr;
     }
     if (mTypes) {
         delete [] mTypes;
-        mTypes = NULL;
+        mTypes = nullptr;
     }
 }
 
@@ -95,7 +95,7 @@
 }
 
 void Script::callUpdateCacheObject(const Context *rsc, void *dstObj) const {
-    if (rsc->mHal.funcs.script.updateCachedObject != NULL) {
+    if (rsc->mHal.funcs.script.updateCachedObject != nullptr) {
         rsc->mHal.funcs.script.updateCachedObject(rsc, this, (rs_script *)dstObj);
     } else {
         *((const void **)dstObj) = this;
@@ -170,7 +170,7 @@
 
 void rsi_ScriptSetTimeZone(Context * rsc, RsScript vs, const char * timeZone, size_t length) {
     // We unfortunately need to make a new copy of the string, since it is
-    // not NULL-terminated. We then use setenv(), which properly handles
+    // not null-terminated. We then use setenv(), which properly handles
     // freeing/duplicating the actual string for the environment.
     char *tz = (char *) malloc(length + 1);
     if (!tz) {
@@ -187,38 +187,13 @@
     free(tz);
 }
 
-void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
-                       RsAllocation vain, RsAllocation vaout,
-                       const void *params, size_t paramLen,
-                       const RsScriptCall *sc, size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
-    s->runForEach(rsc, slot,
-                  static_cast<const Allocation *>(vain), static_cast<Allocation *>(vaout),
-                  params, paramLen, sc);
-
-}
-
 void rsi_ScriptForEachMulti(Context *rsc, RsScript vs, uint32_t slot,
                             RsAllocation *vains, size_t inLen,
                             RsAllocation vaout, const void *params,
                             size_t paramLen, const RsScriptCall *sc,
                             size_t scLen) {
-    Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
 
+    Script      *s    = static_cast<Script *>(vs);
     Allocation **ains = (Allocation**)(vains);
 
     s->runForEach(rsc, slot,
@@ -227,15 +202,32 @@
 
 }
 
+void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
+                       RsAllocation vain, RsAllocation vaout,
+                       const void *params, size_t paramLen,
+                       const RsScriptCall *sc, size_t scLen) {
+
+    if (vain == nullptr) {
+        rsi_ScriptForEachMulti(rsc, vs, slot, nullptr, 0, vaout, params, paramLen,
+                               sc, scLen);
+    } else {
+        RsAllocation ains[1] = {vain};
+
+        rsi_ScriptForEachMulti(rsc, vs, slot, ains,
+                               sizeof(ains) / sizeof(RsAllocation), vaout,
+                               params, paramLen, sc, scLen);
+    }
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
-    s->Invoke(rsc, slot, NULL, 0);
+    s->Invoke(rsc, slot, nullptr, 0);
 }
 
 
 void rsi_ScriptInvokeData(Context *rsc, RsScript vs, uint32_t slot, void *data) {
     Script *s = static_cast<Script *>(vs);
-    s->Invoke(rsc, slot, NULL, 0);
+    s->Invoke(rsc, slot, nullptr, 0);
 }
 
 void rsi_ScriptInvokeV(Context *rsc, RsScript vs, uint32_t slot, const void *data, size_t len) {
diff --git a/rsScript.h b/rsScript.h
index 1ad013f..1172dea 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -108,22 +108,14 @@
 
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL) = 0;
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
                             size_t usrBytes,
-                            const RsScriptCall *sc = NULL) = 0;
+                            const RsScriptCall *sc = nullptr) = 0;
 
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) = 0;
     virtual void setupScript(Context *rsc) = 0;
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index e7ff8c7..6255b44 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <string>
+
 #include "rsContext.h"
 #include "rsScriptC.h"
 
@@ -29,6 +31,19 @@
 
 #include <sys/stat.h>
 
+#ifdef USE_MINGW
+/* Define the default path separator for the platform. */
+#define OS_PATH_SEPARATOR     '\\'
+#define OS_PATH_SEPARATOR_STR "\\"
+
+#else /* not USE_MINGW */
+
+/* Define the default path separator for the platform. */
+#define OS_PATH_SEPARATOR     '/'
+#define OS_PATH_SEPARATOR_STR "/"
+
+#endif
+
 using namespace android;
 using namespace android::renderscript;
 
@@ -39,7 +54,7 @@
 
 ScriptC::ScriptC(Context *rsc) : Script(rsc) {
 #if !defined(RS_COMPATIBILITY_LIB) && !defined(ANDROID_RS_SERIALIZE)
-    BT = NULL;
+    BT = nullptr;
 #endif
 }
 
@@ -47,7 +62,7 @@
 #if !defined(RS_COMPATIBILITY_LIB) && !defined(ANDROID_RS_SERIALIZE)
     if (BT) {
         delete BT;
-        BT = NULL;
+        BT = nullptr;
     }
 #endif
     if (mInitialized) {
@@ -58,29 +73,45 @@
 
 #ifndef RS_COMPATIBILITY_LIB
 bool ScriptC::createCacheDir(const char *cacheDir) {
-    String8 cacheDirString, currentDir;
+    std::string currentDir;
+    const std::string cacheDirString(cacheDir);
+
     struct stat statBuf;
     int statReturn = stat(cacheDir, &statBuf);
     if (!statReturn) {
         return true;
     }
 
-    // String8 path functions strip leading /'s
-    // insert if necessary
-    if (cacheDir[0] == '/') {
-        currentDir += "/";
-    }
+    // Start from the beginning of the cacheDirString.
+    int currPos = 0;
 
-    cacheDirString.setPathName(cacheDir);
+    // Reserve space in currentDir for the entire cacheDir path.
+    currentDir.reserve(cacheDirString.length());
 
-    while (cacheDirString.length()) {
-        currentDir += (cacheDirString.walkPath(&cacheDirString));
-        statReturn = stat(currentDir.string(), &statBuf);
+    while (currPos >= 0) {
+        /*
+         * The character at currPos should be a path separator.  We need to look
+         * for the next one.
+         */
+        int nextPos = cacheDirString.find(OS_PATH_SEPARATOR_STR, currPos + 1);
+
+        if (nextPos > 0) {
+            // A new path separator has been found.
+            currentDir += cacheDirString.substr(currPos, nextPos - currPos);
+        } else {
+            // There are no more path separators.
+            currentDir += cacheDirString.substr(currPos);
+        }
+
+        currPos = nextPos;
+
+        statReturn = stat(currentDir.c_str(), &statBuf);
+
         if (statReturn) {
             if (errno == ENOENT) {
-                if (mkdir(currentDir.string(), S_IRUSR | S_IWUSR | S_IXUSR)) {
+                if (mkdir(currentDir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR)) {
                     ALOGE("Couldn't create cache directory: %s",
-                          currentDir.string());
+                          currentDir.c_str());
                     ALOGE("Error: %s", strerror(errno));
                     return false;
                 }
@@ -89,7 +120,6 @@
                 return false;
             }
         }
-        currentDir += "/";
     }
     return true;
 }
@@ -130,7 +160,7 @@
 }
 
 uint32_t ScriptC::run(Context *rsc) {
-    if (mHal.info.root == NULL) {
+    if (mHal.info.root == nullptr) {
         rsc->setError(RS_ERROR_BAD_SCRIPT, "Attempted to run bad script");
         return 0;
     }
@@ -156,36 +186,6 @@
 
 void ScriptC::runForEach(Context *rsc,
                          uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-    // Trace this function call.
-    // To avoid overhead, we only build the string, if tracing is actually
-    // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
-    if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
-    }
-    ATRACE_NAME(String);
-    (void)String;
-
-    Context::PushState ps(rsc);
-
-    setupGLState(rsc);
-    setupScript(rsc);
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-
-    if (AString)
-        delete AString;
-}
-
-void ScriptC::runForEach(Context *rsc,
-                         uint32_t slot,
                          const Allocation ** ains,
                          size_t inLen,
                          Allocation * aout,
@@ -195,25 +195,36 @@
     // Trace this function call.
     // To avoid overhead we only build the string if tracing is actually
     // enabled.
-    String8 *AString = NULL;
-    const char *String = "";
+    std::string *traceString = nullptr;
+    const char  *stringData  = "";
     if (ATRACE_ENABLED()) {
-        AString = new String8("runForEach_");
-        AString->append(mHal.info.exportedForeachFuncList[slot].first);
-        String = AString->string();
+        traceString = new std::string("runForEach_");
+        traceString->append(mHal.info.exportedForeachFuncList[slot].first);
+        stringData = traceString->c_str();
     }
-    ATRACE_NAME(String);
-    (void)String;
+    ATRACE_NAME(stringData);
 
     Context::PushState ps(rsc);
 
     setupGLState(rsc);
     setupScript(rsc);
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != nullptr) {
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
 
-    if (AString)
-        delete AString;
+    } else if (inLen <= 1) {  // no multi-input hook: zero or one input uses invokeForEach
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot,
+                (inLen == 1) ? ains[0] : nullptr, aout, usr, usrBytes, sc);
+
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
+
+    if (traceString) {
+        delete traceString;
+    }
 }
 
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
@@ -265,7 +276,7 @@
     if (!BT->translate()) {
         ALOGE("Failed to translate bitcode from version: %u", sdkVersion);
         delete BT;
-        BT = NULL;
+        BT = nullptr;
         return false;
     }
     bitcode = (const uint8_t *) BT->getTranslatedBitcode();
@@ -381,7 +392,7 @@
     if (!s->runCompiler(rsc, resName, cacheDir, (uint8_t *)text, text_length)) {
         // Error during compile, destroy s and return null.
         ObjectBase::checkDelete(s);
-        return NULL;
+        return nullptr;
     }
 
     s->incUserRef();
diff --git a/rsScriptC.h b/rsScriptC.h
index d3d9d51..86de5b2 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -44,24 +44,16 @@
 
     virtual void runForEach(Context *rsc,
                             uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
                             const Allocation ** ains,
                             size_t inLen,
                             Allocation * aout,
                             const void * usr,
                             size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
+                            const RsScriptCall *sc = nullptr);
 
     virtual void serialize(Context *rsc, OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
-    static Type *createFromStream(Context *rsc, IStream *stream) { return NULL; }
+    static Type *createFromStream(Context *rsc, IStream *stream) { return nullptr; }
 
     bool runCompiler(Context *rsc, const char *resName, const char *cacheDir,
                      const uint8_t *bitcode, size_t bitcodeLen);
diff --git a/rsScriptC_Lib.cpp b/rsScriptC_Lib.cpp
index a41f4a7..4bb0b1c 100644
--- a/rsScriptC_Lib.cpp
+++ b/rsScriptC_Lib.cpp
@@ -87,7 +87,7 @@
 
 tm* rsrLocalTime(Context *rsc, tm *local, time_t *timer) {
     if (!local) {
-      return NULL;
+      return nullptr;
     }
 
     // The native localtime function is not thread-safe, so we
@@ -151,7 +151,7 @@
     ObjectBase **odst = (ObjectBase **)dst;
     //ALOGE("rsrSetObject (base) %p,%p  %p", dst, *odst, src);
     SetObjectRef(rsc, odst[0], src);
-    if (src != NULL) {
+    if (src != nullptr) {
         src->callUpdateCacheObject(rsc, dst);
     }
 }
@@ -160,7 +160,7 @@
     ObjectBase **odst = (ObjectBase **)dst;
     //ALOGE("rsrSetObject (base) %p,%p  %p", dst, *odst, src);
     SetObjectRef(rsc, odst[0], src);
-    if (src != NULL) {
+    if (src != nullptr) {
         src->callUpdateCacheObject(rsc, dst);
     }
 }
@@ -173,7 +173,7 @@
         CHECK_OBJ(odst[0]);
         odst[0]->decSysRef();
     }
-    *odst = NULL;
+    *odst = nullptr;
 }
 
 void rsrClearObject(const Context *rsc, rs_object_base *dst) {
@@ -182,17 +182,17 @@
         CHECK_OBJ(dst->p);
         dst->p->decSysRef();
     }
-    dst->p = NULL;
+    dst->p = nullptr;
 }
 
 // Legacy, remove when drivers are updated
 bool rsrIsObject(const Context *, ObjectBase* src) {
     ObjectBase **osrc = (ObjectBase **)src;
-    return osrc != NULL;
+    return osrc != nullptr;
 }
 
 bool rsrIsObject(const Context *rsc, rs_object_base o) {
-    return o.p != NULL;
+    return o.p != nullptr;
 }
 
 
@@ -230,7 +230,17 @@
                 Allocation *in, Allocation *out,
                 const void *usr, uint32_t usrBytes,
                 const RsScriptCall *call) {
-    target->runForEach(rsc, /* root slot */ 0, in, out, usr, usrBytes, call);
+
+    if (in == nullptr) {
+        target->runForEach(rsc, /* root slot */ 0, nullptr, 0, out, usr,
+                           usrBytes, call);
+
+    } else {
+        const Allocation *ins[1] = {in};
+        target->runForEach(rsc, /* root slot */ 0, ins,
+                           sizeof(ins) / sizeof(RsAllocation), out, usr,
+                           usrBytes, call);
+    }
 }
 
 void rsrAllocationSyncAll(Context *rsc, Allocation *a, RsAllocationUsageType usage) {
diff --git a/rsScriptC_LibGL.cpp b/rsScriptC_LibGL.cpp
index dbf2336..5c55c6b 100644
--- a/rsScriptC_LibGL.cpp
+++ b/rsScriptC_LibGL.cpp
@@ -103,12 +103,12 @@
 }
 
 void rsrClearFrameBufferObjectColorTarget(Context *rsc, uint32_t slot) {
-    rsc->mFBOCache.bindColorTarget(rsc, NULL, slot);
+    rsc->mFBOCache.bindColorTarget(rsc, nullptr, slot);
     rsc->mStateVertex.updateSize(rsc);
 }
 
 void rsrClearFrameBufferObjectDepthTarget(Context *rsc) {
-    rsc->mFBOCache.bindDepthTarget(rsc, NULL);
+    rsc->mFBOCache.bindDepthTarget(rsc, nullptr);
     rsc->mStateVertex.updateSize(rsc);
 }
 
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
index d1dd9d8..618c28c 100644
--- a/rsScriptGroup.cpp
+++ b/rsScriptGroup.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <algorithm>
+
 #include "rsContext.h"
 #include <time.h>
 
@@ -28,8 +30,8 @@
         mRSC->mHal.funcs.scriptgroup.destroy(mRSC, this);
     }
 
-    for (size_t ct=0; ct < mLinks.size(); ct++) {
-        delete mLinks[ct];
+    for (auto link : mLinks) {
+        delete link;
     }
 }
 
@@ -44,148 +46,116 @@
 }
 
 ScriptGroup::Node * ScriptGroup::findNode(Script *s) const {
-    //ALOGE("find %p   %i", s, (int)mNodes.size());
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
-            if (n->mKernels[ct2]->mScript == s) {
-                return n;
+    for (auto node : mNodes) {
+        for (auto kernelRef : node->mKernels) {
+            if (kernelRef->mScript == s) {
+                return node;
             }
         }
     }
-    return NULL;
+
+    return nullptr;
 }
 
-bool ScriptGroup::calcOrderRecurse(Node *n, int depth) {
-    n->mSeen = true;
-    if (n->mOrder < depth) {
-        n->mOrder = depth;
+bool ScriptGroup::calcOrderRecurse(Node *node0, int depth) {
+    node0->mSeen = true;
+    if (node0->mOrder < depth) {
+        node0->mOrder = depth;
     }
     bool ret = true;
-    for (size_t ct=0; ct < n->mOutputs.size(); ct++) {
-        const Link *l = n->mOutputs[ct];
-        Node *nt = NULL;
-        if (l->mDstField.get()) {
-            nt = findNode(l->mDstField->mScript);
+
+    for (auto link : node0->mOutputs) {
+        Node *node1 = nullptr;
+        if (link->mDstField.get()) {
+            node1 = findNode(link->mDstField->mScript);
         } else {
-            nt = findNode(l->mDstKernel->mScript);
+            node1 = findNode(link->mDstKernel->mScript);
         }
-        if (nt->mSeen) {
+        if (node1->mSeen) {
             return false;
         }
-        ret &= calcOrderRecurse(nt, n->mOrder + 1);
+        ret &= calcOrderRecurse(node1, node0->mOrder + 1);
     }
+
     return ret;
 }
 
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-static int CompareNodeForSort(ScriptGroup::Node *const* lhs,
-                              ScriptGroup::Node *const* rhs) {
-    if (lhs[0]->mOrder > rhs[0]->mOrder) {
-        return 1;
-    }
-    return 0;
-}
-#else
-class NodeCompare {
-public:
-    bool operator() (const ScriptGroup::Node* lhs,
-                     const ScriptGroup::Node* rhs) {
-        if (lhs->mOrder > rhs->mOrder) {
-            return true;
-        }
-        return false;
-    }
-};
-#endif
-
 bool ScriptGroup::calcOrder() {
     // Make nodes
-    for (size_t ct=0; ct < mKernels.size(); ct++) {
-        const ScriptKernelID *k = mKernels[ct].get();
-        //ALOGE(" kernel %i, %p  s=%p", (int)ct, k, mKernels[ct]->mScript);
-        Node *n = findNode(k->mScript);
-        //ALOGE("    n = %p", n);
-        if (n == NULL) {
-            n = new Node(k->mScript);
-            mNodes.add(n);
+
+    for (auto kernelRef : mKernels) {
+        const ScriptKernelID *kernel = kernelRef.get();
+        Node *node = findNode(kernel->mScript);
+        if (node == nullptr) {
+            node = new Node(kernel->mScript);
+            mNodes.push_back(node);
         }
-        n->mKernels.add(k);
+        node->mKernels.push_back(kernel);
     }
 
     // add links
-    //ALOGE("link count %i", (int)mLinks.size());
-    for (size_t ct=0; ct < mLinks.size(); ct++) {
-        Link *l = mLinks[ct];
-        //ALOGE("link  %i %p", (int)ct, l);
-        Node *n = findNode(l->mSource->mScript);
-        //ALOGE("link n %p", n);
-        n->mOutputs.add(l);
+    for (auto link : mLinks) {
+        Node *node = findNode(link->mSource->mScript);
+        node->mOutputs.push_back(link);
 
-        if (l->mDstKernel.get()) {
-            //ALOGE("l->mDstKernel.get() %p", l->mDstKernel.get());
-            n = findNode(l->mDstKernel->mScript);
-            //ALOGE("  n1 %p", n);
-            n->mInputs.add(l);
+        if (link->mDstKernel.get()) {
+            node = findNode(link->mDstKernel->mScript);
+            node->mInputs.push_back(link);
         } else {
-            n = findNode(l->mDstField->mScript);
-            //ALOGE("  n2 %p", n);
-            n->mInputs.add(l);
+            node = findNode(link->mDstField->mScript);
+            node->mInputs.push_back(link);
         }
     }
 
-    //ALOGE("node count %i", (int)mNodes.size());
     // Order nodes
     bool ret = true;
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        if (n->mInputs.size() == 0) {
-            for (size_t ct2=0; ct2 < mNodes.size(); ct2++) {
-                mNodes[ct2]->mSeen = false;
+    for (auto n0 : mNodes) {
+        if (n0->mInputs.size() == 0) {
+            for (auto n1 : mNodes) {
+                n1->mSeen = false;
             }
-            ret &= calcOrderRecurse(n, 0);
+            ret &= calcOrderRecurse(n0, 1);
         }
     }
 
-    for (size_t ct=0; ct < mKernels.size(); ct++) {
-        const ScriptKernelID *k = mKernels[ct].get();
-        const Node *n = findNode(k->mScript);
+    for (auto kernelRef : mKernels) {
+        const ScriptKernelID *kernel = kernelRef.get();
+        const Node *node = findNode(kernel->mScript);
 
-        if (k->mHasKernelOutput) {
+        if (kernel->mHasKernelOutput) {
             bool found = false;
-            for (size_t ct2=0; ct2 < n->mOutputs.size(); ct2++) {
-                if (n->mOutputs[ct2]->mSource.get() == k) {
+            for (auto output : node->mOutputs) {
+                if (output->mSource.get() == kernel) {
                     found = true;
                     break;
                 }
             }
+
             if (!found) {
-                //ALOGE("add io out %p", k);
-                mOutputs.add(new IO(k));
+                mOutputs.push_back(new IO(kernel));
             }
         }
 
-        if (k->mHasKernelInput) {
+        if (kernel->mHasKernelInput) {
             bool found = false;
-            for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
-                if (n->mInputs[ct2]->mDstKernel.get() == k) {
+            for (auto input : node->mInputs) {
+                if (input->mDstKernel.get() == kernel) {
                     found = true;
                     break;
                 }
             }
             if (!found) {
-                //ALOGE("add io in %p", k);
-                mInputs.add(new IO(k));
+                mInputs.push_back(new IO(kernel));
             }
         }
     }
 
     // sort
-#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
-    mNodes.sort(&CompareNodeForSort);
-#else
-    std::sort(mNodes.begin(), mNodes.end(), NodeCompare());
-#endif
+    std::stable_sort(mNodes.begin(), mNodes.end(),
+                     [](const ScriptGroup::Node* lhs,
+                        const ScriptGroup::Node* rhs) {
+        return lhs->mOrder < rhs->mOrder;
+    });
 
     return ret;
 }
@@ -209,7 +179,7 @@
 
     sg->mKernels.reserve(kernelCount);
     for (size_t ct=0; ct < kernelCount; ct++) {
-        sg->mKernels.add(kernels[ct]);
+        sg->mKernels.push_back(kernels[ct]);
     }
 
     sg->mLinks.reserve(linkCount);
@@ -219,7 +189,7 @@
         l->mSource = src[ct];
         l->mDstField = dstF[ct];
         l->mDstKernel = dstK[ct];
-        sg->mLinks.add(l);
+        sg->mLinks.push_back(l);
     }
 
     sg->calcOrder();
@@ -254,9 +224,9 @@
 }
 
 void ScriptGroup::setInput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (size_t ct=0; ct < mInputs.size(); ct++) {
-        if (mInputs[ct]->mKernel == kid) {
-            mInputs[ct]->mAlloc = a;
+    for (auto input : mInputs) {
+        if (input->mKernel == kid) {
+            input->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setInput) {
                 rsc->mHal.funcs.scriptgroup.setInput(rsc, this, kid, a);
@@ -268,9 +238,9 @@
 }
 
 void ScriptGroup::setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (size_t ct=0; ct < mOutputs.size(); ct++) {
-        if (mOutputs[ct]->mKernel == kid) {
-            mOutputs[ct]->mAlloc = a;
+    for (auto output : mOutputs) {
+        if (output->mKernel == kid) {
+            output->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setOutput) {
                 rsc->mHal.funcs.scriptgroup.setOutput(rsc, this, kid, a);
@@ -283,14 +253,14 @@
 
 bool ScriptGroup::validateInputAndOutput(Context *rsc) {
     for(size_t i = 0; i < mInputs.size(); i++) {
-        if (mInputs[i]->mAlloc.get() == NULL) {
+        if (mInputs[i]->mAlloc.get() == nullptr) {
             rsc->setError(RS_ERROR_BAD_VALUE, "ScriptGroup missing input.");
             return false;
         }
     }
 
     for(size_t i = 0; i < mOutputs.size(); i++) {
-        if (mOutputs[i]->mAlloc.get() == NULL) {
+        if (mOutputs[i]->mAlloc.get() == nullptr) {
             rsc->setError(RS_ERROR_BAD_VALUE, "ScriptGroup missing output.");
             return false;
         }
@@ -311,44 +281,45 @@
         return;
     }
 
-    for (size_t ct=0; ct < mNodes.size(); ct++) {
-        Node *n = mNodes[ct];
-        //ALOGE("node %i, order %i, in %i out %i", (int)ct, n->mOrder, (int)n->mInputs.size(), (int)n->mOutputs.size());
+    for (auto node : mNodes) {
+        for (auto kernel : node->mKernels) {
+            Allocation *ain  = nullptr;
+            Allocation *aout = nullptr;
 
-        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
-            const ScriptKernelID *k = n->mKernels[ct2];
-            Allocation *ain = NULL;
-            Allocation *aout = NULL;
-
-            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
-                if (n->mInputs[ct3]->mDstKernel.get() == k) {
-                    ain = n->mInputs[ct3]->mAlloc.get();
-                    //ALOGE(" link in %p", ain);
-                }
-            }
-            for (size_t ct3=0; ct3 < mInputs.size(); ct3++) {
-                if (mInputs[ct3]->mKernel == k) {
-                    ain = mInputs[ct3]->mAlloc.get();
-                    //ALOGE(" io in %p", ain);
+            for (auto nodeInput : node->mInputs) {
+                if (nodeInput->mDstKernel.get() == kernel) {
+                    ain = nodeInput->mAlloc.get();
                 }
             }
 
-            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
-                if (n->mOutputs[ct3]->mSource.get() == k) {
-                    aout = n->mOutputs[ct3]->mAlloc.get();
-                    //ALOGE(" link out %p", aout);
-                }
-            }
-            for (size_t ct3=0; ct3 < mOutputs.size(); ct3++) {
-                if (mOutputs[ct3]->mKernel == k) {
-                    aout = mOutputs[ct3]->mAlloc.get();
-                    //ALOGE(" io out %p", aout);
+            for (auto sgInput : mInputs) {
+                if (sgInput->mKernel == kernel) {
+                    ain = sgInput->mAlloc.get();
                 }
             }
 
-            n->mScript->runForEach(rsc, k->mSlot, ain, aout, NULL, 0);
+            for (auto nodeOutput : node->mOutputs) {  // links whose source script is this node
+                if (nodeOutput->mSource.get() == kernel) {  // match the producing kernel, not the consumer
+                    aout = nodeOutput->mAlloc.get();
+                }
+            }
+
+            for (auto sgOutput : mOutputs) {
+                if (sgOutput->mKernel == kernel) {
+                    aout = sgOutput->mAlloc.get();
+                }
+            }
+
+            if (ain == nullptr) {
+                node->mScript->runForEach(rsc, kernel->mSlot, nullptr, 0, aout,
+                                          nullptr, 0);
+            } else {
+                const Allocation *ains[1] = {ain};
+                node->mScript->runForEach(rsc, kernel->mSlot, ains,
+                                          sizeof(ains) / sizeof(RsAllocation),
+                                          aout, nullptr, 0);
+            }
         }
-
     }
 
 }
@@ -389,24 +360,20 @@
 
 void rsi_ScriptGroupSetInput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
-    //ALOGE("rsi_ScriptGroupSetInput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setInput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
 
 void rsi_ScriptGroupSetOutput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
-    //ALOGE("rsi_ScriptGroupSetOutput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setOutput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
 
 void rsi_ScriptGroupExecute(Context *rsc, RsScriptGroup sg) {
-    //ALOGE("rsi_ScriptGroupExecute");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->execute(rsc);
 }
 
 }
 }
-
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
index af98b50..974e3ba 100644
--- a/rsScriptGroup.h
+++ b/rsScriptGroup.h
@@ -32,7 +32,7 @@
 
 class ScriptGroup : public ObjectBase {
 public:
-    Vector<ObjectBaseRef<ScriptKernelID> > mKernels;
+    std::vector<ObjectBaseRef<ScriptKernelID> > mKernels;
 
     class Link {
     public:
@@ -49,9 +49,9 @@
     public:
         Node(Script *);
 
-        Vector<const ScriptKernelID *> mKernels;
-        Vector<Link *> mOutputs;
-        Vector<Link *> mInputs;
+        std::vector<const ScriptKernelID *> mKernels;
+        std::vector<Link *> mOutputs;
+        std::vector<Link *> mInputs;
         bool mSeen;
         int mOrder;
         Script *mScript;
@@ -65,10 +65,10 @@
         ObjectBaseRef<Allocation> mAlloc;
     };
 
-    Vector<Link *> mLinks;
-    Vector<Node *> mNodes;
-    Vector<IO *> mInputs;
-    Vector<IO *> mOutputs;
+    std::vector<Link *> mLinks;
+    std::vector<Node *> mNodes;
+    std::vector<IO *> mInputs;
+    std::vector<IO *> mOutputs;
 
     struct Hal {
         void * drv;
@@ -115,4 +115,3 @@
 }
 }
 #endif
-
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index 86f1c50..ee9d73c 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -55,18 +55,6 @@
     return 0;
 }
 
-
-void ScriptIntrinsic::runForEach(Context *rsc,
-                         uint32_t slot,
-                         const Allocation * ain,
-                         Allocation * aout,
-                         const void * usr,
-                         size_t usrBytes,
-                         const RsScriptCall *sc) {
-
-    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
-}
-
 void ScriptIntrinsic::runForEach(Context* rsc,
                          uint32_t slot,
                          const Allocation** ains,
@@ -76,7 +64,18 @@
                          size_t usrBytes,
                          const RsScriptCall* sc) {
 
-    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen, aout, usr, usrBytes, sc);
+    if (rsc->mHal.funcs.script.invokeForEachMulti != nullptr) {
+        rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                                  aout, usr, usrBytes, sc);
+
+    } else if (inLen <= 1) {  // no multi-input hook: zero or one input uses invokeForEach
+        rsc->mHal.funcs.script.invokeForEach(rsc, this, slot,
+                (inLen == 1) ? ains[0] : nullptr, aout, usr, usrBytes, sc);
+
+    } else {
+        rsc->setError(RS_ERROR_FATAL_DRIVER,
+                      "Driver support for multi-input not present");
+    }
 }
 
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
@@ -99,7 +98,7 @@
     ScriptIntrinsic *si = new ScriptIntrinsic(rsc);
     if (!si->init(rsc, (RsScriptIntrinsicID)id, (Element *)ve)) {
         delete si;
-        return NULL;
+        return nullptr;
     }
     si->incUserRef();
     return si;
@@ -107,5 +106,3 @@
 
 }
 }
-
-
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 66b6031..ac49325 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -40,22 +40,14 @@
     virtual RsA3DClassID getClassId() const;
     virtual bool freeChildren();
 
-    virtual void runForEach(Context *rsc,
-                            uint32_t slot,
-                            const Allocation * ain,
-                            Allocation * aout,
-                            const void * usr,
-                            size_t usrBytes,
-                            const RsScriptCall *sc = NULL);
-
     virtual void runForEach(Context* rsc,
                             uint32_t slot,
-                            const Allocation** ains,
+                            const Allocation ** ains,
                             size_t inLen,
                             Allocation* aout,
                             const void* usr,
                             size_t usrBytes,
-                            const RsScriptCall* sc = NULL);
+                            const RsScriptCall* sc = nullptr);
 
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
     virtual void setupScript(Context *rsc);
@@ -69,5 +61,3 @@
 }
 }
 #endif
-
-
diff --git a/rsSignal.cpp b/rsSignal.cpp
index 658c0b7..bf25384 100644
--- a/rsSignal.cpp
+++ b/rsSignal.cpp
@@ -31,13 +31,13 @@
 }
 
 bool Signal::init() {
-    int status = pthread_mutex_init(&mMutex, NULL);
+    int status = pthread_mutex_init(&mMutex, nullptr);
     if (status) {
         ALOGE("LocklessFifo mutex init failure");
         return false;
     }
 
-    status = pthread_cond_init(&mCondition, NULL);
+    status = pthread_cond_init(&mCondition, nullptr);
     if (status) {
         ALOGE("LocklessFifo condition init failure");
         pthread_mutex_destroy(&mMutex);
diff --git a/rsThreadIO.cpp b/rsThreadIO.cpp
index 4f67dbb..f259591 100644
--- a/rsThreadIO.cpp
+++ b/rsThreadIO.cpp
@@ -79,7 +79,7 @@
 
 void ThreadIO::coreSetReturn(const void *data, size_t dataLen) {
     uint32_t buf;
-    if (data == NULL) {
+    if (data == nullptr) {
         data = &buf;
         dataLen = sizeof(buf);
     }
@@ -89,7 +89,7 @@
 
 void ThreadIO::coreGetReturn(void *data, size_t dataLen) {
     uint32_t buf;
-    if (data == NULL) {
+    if (data == nullptr) {
         data = &buf;
         dataLen = sizeof(buf);
     }
diff --git a/rsType.cpp b/rsType.cpp
index 31d6ce8..c0cda91 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -33,10 +33,14 @@
 }
 
 void Type::preDestroy() const {
-    for (uint32_t ct = 0; ct < mRSC->mStateType.mTypes.size(); ct++) {
-        if (mRSC->mStateType.mTypes[ct] == this) {
-            mRSC->mStateType.mTypes.removeAt(ct);
-            break;
+    auto &types = mRSC->mStateType.mTypes;
+
+    for (auto typeIter = types.begin(), endIter = types.end();
+         typeIter != endIter; typeIter++) {
+
+        if (this == *typeIter) {
+            types.erase(typeIter);
+            return;
         }
     }
 }
@@ -178,14 +182,14 @@
     RsA3DClassID classID = (RsA3DClassID)stream->loadU32();
     if (classID != RS_A3D_CLASS_ID_TYPE) {
         ALOGE("type loading skipped due to invalid class id\n");
-        return NULL;
+        return nullptr;
     }
 
     const char *name = stream->loadString();
 
     Element *elem = Element::createFromStream(rsc, stream);
     if (!elem) {
-        return NULL;
+        return nullptr;
     }
 
     uint32_t x = stream->loadU32();
@@ -244,7 +248,7 @@
     void* allocMem = rsc->mHal.funcs.allocRuntimeMem(sizeof(Type), 0);
     if (!allocMem) {
         rsc->setError(RS_ERROR_FATAL_DRIVER, "Couldn't allocate memory for Type");
-        return NULL;
+        return nullptr;
     }
 
     Type *nt = new (allocMem) Type(rsc);
@@ -265,7 +269,7 @@
     nt->compute();
 
     ObjectBase::asyncLock();
-    stc->mTypes.push(nt);
+    stc->mTypes.push_back(nt);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -315,7 +319,7 @@
 }
 
 void Type::callUpdateCacheObject(const Context *rsc, void *dstObj) const {
-    if (rsc->mHal.funcs.type.updateCachedObject != NULL) {
+    if (rsc->mHal.funcs.type.updateCachedObject != nullptr) {
         rsc->mHal.funcs.type.updateCachedObject(rsc, this, (rs_type *)dstObj);
     } else {
         *((const void **)dstObj) = this;
diff --git a/rsType.h b/rsType.h
index e44e270..86d6ece 100644
--- a/rsType.h
+++ b/rsType.h
@@ -146,7 +146,7 @@
     ~TypeState();
 
     // Cache of all existing types.
-    Vector<Type *> mTypes;
+    std::vector<Type *> mTypes;
 };
 
 
diff --git a/rsg_generator.c b/rsg_generator.c
index d0f0b7c..2558f67 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -294,7 +294,9 @@
                 const VarType *vt = &api->params[ct2];
                 needFlush += vt->ptrLevel;
                 if (vt->ptrLevel && hasInlineDataPointers(api)) {
-                    fprintf(f, "    if (dataSize < io->getMaxInlineSize()) {\n");
+                    fprintf(f, "    if (%s_length == 0) {\n", vt->name);
+                    fprintf(f, "        cmd->%s = NULL;\n", vt->name);
+                    fprintf(f, "    } else if (dataSize < io->getMaxInlineSize()) {\n");
                     fprintf(f, "        memcpy(payload, %s, %s_length);\n", vt->name, vt->name);
                     fprintf(f, "        cmd->%s = (", vt->name);
                     printVarType(f, vt);
@@ -489,7 +491,8 @@
             needFlush += vt->ptrLevel;
 
             if (hasInlineDataPointers(api) && vt->ptrLevel) {
-                fprintf(f, ",\n           (const %s *)&baseData[(intptr_t)cmd->%s]", vt->typeName, vt->name);
+                fprintf(f, ",\n           cmd->%s_length == 0 ? NULL : (const %s *)&baseData[(intptr_t)cmd->%s]",
+                        vt->name, vt->typeName, vt->name);
             } else {
                 fprintf(f, ",\n           cmd->%s", vt->name);
             }
diff --git a/server/RefBase.h b/server/RefBase.h
index e1e5007..81744a1 100644
--- a/server/RefBase.h
+++ b/server/RefBase.h
@@ -199,7 +199,7 @@
 public:
     typedef typename RefBase::weakref_type weakref_type;
 
-    inline wp() : m_ptr(0) { }
+    inline wp() : m_ptr(nullptr) { }
 
     wp(T* other);
     wp(const wp<T>& other);
@@ -350,7 +350,7 @@
 wp<T>& wp<T>::operator = (T* other)
 {
     weakref_type* newRefs =
-        other ? other->createWeak(this) : 0;
+        other ? other->createWeak(this) : nullptr;
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = other;
     m_refs = newRefs;
@@ -373,7 +373,7 @@
 wp<T>& wp<T>::operator = (const sp<T>& other)
 {
     weakref_type* newRefs =
-        other != NULL ? other->createWeak(this) : 0;
+        other != nullptr ? other->createWeak(this) : nullptr;
     T* otherPtr(other.m_ptr);
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = otherPtr;
@@ -385,7 +385,7 @@
 wp<T>& wp<T>::operator = (U* other)
 {
     weakref_type* newRefs =
-        other ? other->createWeak(this) : 0;
+        other ? other->createWeak(this) : nullptr;
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = other;
     m_refs = newRefs;
@@ -408,7 +408,7 @@
 wp<T>& wp<T>::operator = (const sp<U>& other)
 {
     weakref_type* newRefs =
-        other != NULL ? other->createWeak(this) : 0;
+        other != nullptr ? other->createWeak(this) : nullptr;
     U* otherPtr(other.m_ptr);
     if (m_ptr) m_refs->decWeak(this);
     m_ptr = otherPtr;
@@ -440,7 +440,7 @@
 {
     if (m_ptr) {
         m_refs->decWeak(this);
-        m_ptr = 0;
+        m_ptr = nullptr;
     }
 }
 
diff --git a/tests/cppallocation/Android.mk b/tests/cppallocation/Android.mk
index 2e46c9e..36ea7cb 100644
--- a/tests/cppallocation/Android.mk
+++ b/tests/cppallocation/Android.mk
@@ -11,6 +11,7 @@
 LOCAL_STATIC_LIBRARIES := \
 	libRScpp_static
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS += -llog -ldl
 
 LOCAL_MODULE:= rstest-cppallocation
diff --git a/tests/cppbasic-getpointer/Android.mk b/tests/cppbasic-getpointer/Android.mk
index eb4ac34..a57bab5 100644
--- a/tests/cppbasic-getpointer/Android.mk
+++ b/tests/cppbasic-getpointer/Android.mk
@@ -1,5 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
 LOCAL_SRC_FILES:= \
 	mono.rs \
@@ -7,7 +8,6 @@
 
 LOCAL_SHARED_LIBRARIES := \
 	libRScpp \
-	libstlport
 
 LOCAL_MODULE:= rstest-compute-getpointer
 
@@ -15,12 +15,14 @@
 
 intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
 
-LOCAL_C_INCLUDES += external/stlport/stlport bionic/ bionic/libstdc++/include
+LOCAL_CFLAGS := -std=c++11
+
 LOCAL_C_INCLUDES += frameworks/rs/cpp
 LOCAL_C_INCLUDES += frameworks/rs
 LOCAL_C_INCLUDES += $(intermediates)
 
 LOCAL_CLANG := true
 
+include external/stlport/libstlport.mk
 include $(BUILD_EXECUTABLE)
 
diff --git a/tests/cppbasic-shared/Android.mk b/tests/cppbasic-shared/Android.mk
index 63cd715..191b499 100644
--- a/tests/cppbasic-shared/Android.mk
+++ b/tests/cppbasic-shared/Android.mk
@@ -1,5 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 include $(CLEAR_VARS)
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
 
 LOCAL_SRC_FILES:= \
 	mono.rs \
@@ -7,7 +8,6 @@
 
 LOCAL_SHARED_LIBRARIES := \
 	libRScpp \
-	libstlport
 
 LOCAL_MODULE:= rstest-compute-shared
 
@@ -15,12 +15,13 @@
 
 intermediates := $(call intermediates-dir-for,STATIC_LIBRARIES,libRS,TARGET,)
 
-LOCAL_C_INCLUDES += external/stlport/stlport bionic/ bionic/libstdc++/include
+LOCAL_CFLAGS := -std=c++11
+
 LOCAL_C_INCLUDES += frameworks/rs/cpp
 LOCAL_C_INCLUDES += frameworks/rs
 LOCAL_C_INCLUDES += $(intermediates)
 
 LOCAL_CLANG := true
 
+include external/stlport/libstlport.mk
 include $(BUILD_EXECUTABLE)
-
diff --git a/tests/cppbasic-shared/compute.cpp b/tests/cppbasic-shared/compute.cpp
index d93b453..6741d7a 100644
--- a/tests/cppbasic-shared/compute.cpp
+++ b/tests/cppbasic-shared/compute.cpp
@@ -42,7 +42,7 @@
         sc->set_elem(e);
         sc->set_type(t);
         sc->set_script(sc);
-        sc->set_script(NULL);
+        sc->set_script(nullptr);
         sp<const Sampler> samp = Sampler::CLAMP_NEAREST(rs);
         sc->set_sampler(samp);
 
diff --git a/tests/cppbasic/Android.mk b/tests/cppbasic/Android.mk
index d4f7441..9e24dca 100644
--- a/tests/cppbasic/Android.mk
+++ b/tests/cppbasic/Android.mk
@@ -11,6 +11,7 @@
 LOCAL_STATIC_LIBRARIES := \
 	libRScpp_static
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS += -llog -ldl
 
 LOCAL_MODULE:= rstest-compute
diff --git a/tests/cppbasic/compute.cpp b/tests/cppbasic/compute.cpp
index d93b453..6741d7a 100644
--- a/tests/cppbasic/compute.cpp
+++ b/tests/cppbasic/compute.cpp
@@ -42,7 +42,7 @@
         sc->set_elem(e);
         sc->set_type(t);
         sc->set_script(sc);
-        sc->set_script(NULL);
+        sc->set_script(nullptr);
         sp<const Sampler> samp = Sampler::CLAMP_NEAREST(rs);
         sc->set_sampler(samp);
 
diff --git a/tests/cppstrided/Android.mk b/tests/cppstrided/Android.mk
index 9fb98a6..253a104 100644
--- a/tests/cppstrided/Android.mk
+++ b/tests/cppstrided/Android.mk
@@ -11,6 +11,7 @@
 LOCAL_STATIC_LIBRARIES := \
 	libRScpp_static
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS += -llog -ldl
 
 LOCAL_MODULE:= rstest-cppstrided
diff --git a/tests/latency/Android.mk b/tests/latency/Android.mk
index fdfa208..5200e5d 100644
--- a/tests/latency/Android.mk
+++ b/tests/latency/Android.mk
@@ -11,6 +11,7 @@
 LOCAL_STATIC_LIBRARIES := \
 	libRScpp_static
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS += -llog -ldl
 
 LOCAL_MODULE:= rstest-latency
diff --git a/tests/latency/latency.cpp b/tests/latency/latency.cpp
index 05337be..274a518 100644
--- a/tests/latency/latency.cpp
+++ b/tests/latency/latency.cpp
@@ -74,7 +74,7 @@
 
     struct timeval start, stop;
 
-    gettimeofday(&start, NULL);
+    gettimeofday(&start, nullptr);
 
     for (int i = 0; i < iters; i++) {
         sc->forEach_root(ain, aout);
@@ -82,13 +82,13 @@
 
     rs->finish();
 
-    gettimeofday(&stop, NULL);
+    gettimeofday(&stop, nullptr);
 
     long long elapsed = (stop.tv_sec * 1000000) - (start.tv_sec * 1000000) + (stop.tv_usec - start.tv_usec);
     printf("elapsed time : %lld microseconds\n", elapsed);
     printf("time per iter: %f microseconds\n", (double)elapsed / iters);
 
-    gettimeofday(&start, NULL);
+    gettimeofday(&start, nullptr);
 
     for (int i = 0; i < iters; i++) {
         ain->copy1DFrom(buf);
@@ -98,7 +98,7 @@
 
     rs->finish();
 
-    gettimeofday(&stop, NULL);
+    gettimeofday(&stop, nullptr);
     elapsed = (stop.tv_sec * 1000000) - (start.tv_sec * 1000000) + (stop.tv_usec - start.tv_usec);
     printf("elapsed time with copy : %lld microseconds\n", elapsed);
     printf("time per iter with copy: %f microseconds\n", (double)elapsed / iters);
diff --git a/tests/typecheck/Android.mk b/tests/typecheck/Android.mk
index c34e3e2..f63684e 100644
--- a/tests/typecheck/Android.mk
+++ b/tests/typecheck/Android.mk
@@ -11,6 +11,7 @@
 LOCAL_STATIC_LIBRARIES := \
 	libRScpp_static
 
+LOCAL_CFLAGS := -std=c++11
 LOCAL_LDFLAGS += -llog -ldl
 
 LOCAL_MODULE:= rstest-typecheck