Merge "Fix error check for surface type in setSurface"
diff --git a/cpp/rsDispatch.cpp b/cpp/rsDispatch.cpp
index e0d6788..6f379d0 100644
--- a/cpp/rsDispatch.cpp
+++ b/cpp/rsDispatch.cpp
@@ -389,6 +389,21 @@
             LOG_API("Couldn't initialize dispatchTab.ScriptGroup2Create");
             return false;
         }
+        dispatchTab.AllocationElementData = (AllocationElementDataFnPtr)dlsym(handle, "rsAllocationElementData");
+        if (dispatchTab.AllocationElementData == NULL) {
+            LOG_API("Couldn't initialize dispatchTab.AllocationElementData");
+            return false;
+        }
+        dispatchTab.AllocationElementRead = (AllocationElementReadFnPtr)dlsym(handle, "rsAllocationElementRead");
+        if (dispatchTab.AllocationElementRead == NULL) {
+            LOG_API("Couldn't initialize dispatchTab.AllocationElementRead");
+            return false;
+        }
+        dispatchTab.Allocation3DRead = (Allocation3DReadFnPtr)dlsym(handle, "rsAllocation3DRead");
+        if (dispatchTab.Allocation3DRead == NULL) {
+            LOG_API("Couldn't initialize dispatchTab.Allocation3DRead");
+            return false;
+        }
     }
 
     return true;
diff --git a/cpp/rsDispatch.h b/cpp/rsDispatch.h
index c1c8d77..5bda7c0 100644
--- a/cpp/rsDispatch.h
+++ b/cpp/rsDispatch.h
@@ -55,12 +55,15 @@
 typedef void (*AllocationCopyToBitmapFnPtr) (RsContext, RsAllocation, void*, size_t);
 typedef void (*Allocation1DDataFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, const void*, size_t);
 typedef void (*Allocation1DElementDataFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, const void*, size_t, size_t);
+typedef void (*AllocationElementDataFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t, const void*, size_t, size_t);
 typedef void (*Allocation2DDataFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, RsAllocationCubemapFace, uint32_t, uint32_t, const void*, size_t, size_t);
 typedef void (*Allocation3DDataFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, const void*, size_t, size_t);
 typedef void (*AllocationGenerateMipmapsFnPtr) (RsContext, RsAllocation);
 typedef void (*AllocationReadFnPtr) (RsContext, RsAllocation, void*, size_t);
 typedef void (*Allocation1DReadFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, void*, size_t);
+typedef void (*AllocationElementReadFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t, void*, size_t, size_t);
 typedef void (*Allocation2DReadFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, RsAllocationCubemapFace, uint32_t, uint32_t, void*, size_t, size_t);
+typedef void (*Allocation3DReadFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, void*, size_t, size_t);
 typedef void (*AllocationSyncAllFnPtr) (RsContext, RsAllocation, RsAllocationUsageType);
 typedef void (*AllocationResize1DFnPtr) (RsContext, RsAllocation, uint32_t);
 typedef void (*AllocationCopy2DRangeFnPtr) (RsContext, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, RsAllocation, uint32_t, uint32_t, uint32_t, uint32_t);
@@ -134,12 +137,15 @@
     AllocationCopyToBitmapFnPtr AllocationCopyToBitmap;
     Allocation1DDataFnPtr Allocation1DData;
     Allocation1DElementDataFnPtr Allocation1DElementData;
+    AllocationElementDataFnPtr AllocationElementData;
     Allocation2DDataFnPtr Allocation2DData;
     Allocation3DDataFnPtr Allocation3DData;
     AllocationGenerateMipmapsFnPtr AllocationGenerateMipmaps;
     AllocationReadFnPtr AllocationRead;
     Allocation1DReadFnPtr Allocation1DRead;
+    AllocationElementReadFnPtr AllocationElementRead;
     Allocation2DReadFnPtr Allocation2DRead;
+    Allocation3DReadFnPtr Allocation3DRead;
     AllocationSyncAllFnPtr AllocationSyncAll;
     AllocationResize1DFnPtr AllocationResize1D;
     AllocationCopy2DRangeFnPtr AllocationCopy2DRange;
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 6599932..a44db9c 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -20,6 +20,7 @@
 
 LOCAL_SRC_FILES:= \
 	rsCpuCore.cpp \
+	rsCpuExecutable.cpp \
 	rsCpuScript.cpp \
 	rsCpuRuntimeMath.cpp \
 	rsCpuRuntimeMathFuncs.cpp \
@@ -28,6 +29,7 @@
 	rsCpuScriptGroup2.cpp \
 	rsCpuIntrinsic.cpp \
 	rsCpuIntrinsic3DLUT.cpp \
+	rsCpuIntrinsicBLAS.cpp \
 	rsCpuIntrinsicBlend.cpp \
 	rsCpuIntrinsicBlur.cpp \
 	rsCpuIntrinsicColorMatrix.cpp \
@@ -82,12 +84,12 @@
 
 LOCAL_SHARED_LIBRARIES += libRS libcutils libutils liblog libsync libc++ libdl
 
-# these are not supported in 64-bit yet
-LOCAL_SHARED_LIBRARIES += libbcc libbcinfo
+LOCAL_SHARED_LIBRARIES += libbcc libbcinfo libblas
 
 
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
 LOCAL_C_INCLUDES += frameworks/rs
+LOCAL_C_INCLUDES += external/cblas/include
 
 ifneq ($(HOST_OS),windows)
 include external/libcxx/libcxx.mk
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 0ec7b28..2492c22 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -642,6 +642,8 @@
                                                  const Script *s, const Element *e);
 extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
                                               const Script *s, const Element *e);
+extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
+                                              const Script *s, const Element *e);
 
 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                     RsScriptIntrinsicID iid, Element *e) {
@@ -678,6 +680,11 @@
     case RS_SCRIPT_INTRINSIC_ID_RESIZE:
         i = rsdIntrinsic_Resize(this, s, e);
         break;
+#if !defined(RS_COMPATIBILITY_LIB)
+    case RS_SCRIPT_INTRINSIC_ID_BLAS:
+        i = rsdIntrinsic_BLAS(this, s, e);
+        break;
+#endif
 
     default:
         rsAssert(0);
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index ff087b0..d9b4f83 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -20,10 +20,10 @@
 #include "rsd_cpu.h"
 #include "rsSignal.h"
 #include "rsContext.h"
+#include "rsCppUtils.h"
 #include "rsElement.h"
 #include "rsScriptC.h"
 
-#include <string>
 
 #define RS_KERNEL_INPUT_LIMIT 8
 
@@ -198,10 +198,10 @@
     }
 
     virtual void setBccPluginName(const char *name) {
-        mBccPluginName.assign(name);
+        mBccPluginName.setTo(name);
     }
     virtual const char *getBccPluginName() const {
-        return mBccPluginName.c_str();
+        return mBccPluginName.string();
     }
     virtual bool getInForEach() { return mInForEach; }
 
@@ -233,7 +233,7 @@
     bcc::RSLinkRuntimeCallback mLinkRuntimeCallback;
     RSSelectRTCallback mSelectRTCallback;
     RSSetupCompilerCallback mSetupCompilerCallback;
-    std::string mBccPluginName;
+    String8 mBccPluginName;
 };
 
 
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
new file mode 100644
index 0000000..ae75b07
--- /dev/null
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -0,0 +1,578 @@
+#include "rsCpuExecutable.h"
+#include "rsCppUtils.h"
+
+#include <fstream>
+#include <set>
+#include <memory>
+
+#ifdef RS_COMPATIBILITY_LIB
+#include <stdio.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#else
+#include "bcc/Config/Config.h"
+#include <bcc/Renderscript/RSInfo.h>
+#include <sys/wait.h>
+#endif
+
+#include <dlfcn.h>
+
+namespace android {
+namespace renderscript {
+
+namespace {
+
+// Return a string of exactly len characters, each picked uniformly at random
+// from [a-zA-Z0-9]. loadSOHelper() uses it to name library copies.
+static std::string getRandomString(size_t len) {
+    // 26 lowercase + 26 uppercase + 10 digits = 62 candidate characters.
+    static const char kAlphabet[] =
+        "abcdefghijklmnopqrstuvwxyz"
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        "0123456789";
+    const uint32_t kAlphabetSize = sizeof(kAlphabet) - 1;  // Skip the NUL.
+
+    // Build the result directly in a std::string instead of a variable-length
+    // stack array, which is a compiler extension and unbounded in size.
+    std::string result(len, '\0');
+    for (size_t i = 0; i < len; i++) {
+        // arc4random_uniform() avoids the modulo bias of arc4random() % 62.
+        result[i] = kAlphabet[arc4random_uniform(kAlphabetSize)];
+    }
+
+    return result;
+}
+
+// Make sure \p path is usable as a cache directory: succeed right away when
+// it is already readable, writable and searchable, otherwise try to create
+// it with owner-only (0700) permissions. Returns false if neither works.
+static bool ensureCacheDirExists(const char *path) {
+    const bool alreadyUsable = (access(path, R_OK | W_OK | X_OK) == 0);
+    if (alreadyUsable) {
+        return true;
+    }
+    const int mkdirResult = mkdir(path, 0700);
+    return mkdirResult == 0;
+}
+
+// Copy the file named \p srcFile to \p dstFile (both opened in binary mode).
+// Return 0 on success and -1 if anything wasn't copied.
+static int copyFile(const char *dstFile, const char *srcFile) {
+    std::ifstream srcStream(srcFile, std::ios::in | std::ios::binary);
+    if (!srcStream) {
+        ALOGE("Could not verify or read source file: %s", srcFile);
+        return -1;
+    }
+    std::ofstream dstStream(dstFile, std::ios::out | std::ios::binary);
+    if (!dstStream) {
+        ALOGE("Could not verify or write destination file: %s", dstFile);
+        return -1;
+    }
+    dstStream << srcStream.rdbuf();
+    srcStream.close();
+    // Part of the data may still sit in the stream buffer; close() flushes it
+    // and sets failbit on error, so check the state only after closing.
+    dstStream.close();
+    if (!dstStream) {
+        ALOGE("Could not write destination file: %s", dstFile);
+        return -1;
+    }
+    return 0;
+}
+// Map a script resource name to its .so path; the layout is build-dependent.
+static std::string findSharedObjectName(const char *cacheDir,
+                                        const char *resName) {
+#ifndef RS_SERVER
+    std::string scriptSOName(cacheDir);
+#if defined(RS_COMPATIBILITY_LIB) && !defined(__LP64__)
+    size_t cutPos = scriptSOName.rfind("cache");  // last "cache" occurrence
+    if (cutPos != std::string::npos) {
+        scriptSOName.erase(cutPos);  // drop "cache" and everything after it
+    } else {
+        ALOGE("Found peculiar cacheDir (missing \"cache\"): %s", cacheDir);
+    }
+    scriptSOName.append("/lib/librs.");  // <prefix>/lib/librs.<resName>.so
+#else
+    scriptSOName.append("/librs.");  // <cacheDir>/librs.<resName>.so
+#endif // RS_COMPATIBILITY_LIB && !__LP64__
+
+#else
+    std::string scriptSOName("lib");  // bare name, no directory: lib<resName>.so
+#endif // RS_SERVER
+    scriptSOName.append(resName);
+    scriptSOName.append(".so");
+
+    return scriptSOName;
+}
+
+}  // anonymous namespace
+// Linker run by createSharedLibrary(); directory name used for .so copies.
+const char* SharedLibraryUtils::LD_EXE_PATH = "/system/bin/ld.mc";
+const char* SharedLibraryUtils::RS_CACHE_DIR = "com.android.renderscript.cache";
+
+#ifndef RS_COMPATIBILITY_LIB
+
+// Link <cacheDir>/<resName>.o into the script's shared object by running the
+// ld.mc linker in a child process. Returns true only if the linker exits 0.
+bool SharedLibraryUtils::createSharedLibrary(const char *cacheDir, const char *resName) {
+    const std::string outputLib = findSharedObjectName(cacheDir, resName);
+    std::string inputObj(cacheDir);
+    inputObj += "/";
+    inputObj += resName;
+    inputObj += ".o";
+
+    const char *compilerRt = SYSLIBPATH"/libcompiler_rt.so";
+    std::vector<const char *> linkerArgv = {
+        LD_EXE_PATH,
+        "-shared",
+        "-nostdlib",
+        compilerRt,
+        "-mtriple", DEFAULT_TARGET_TRIPLE_STRING,
+        "-L", SYSLIBPATH,
+        "-lRSDriver", "-lm", "-lc",
+        inputObj.c_str(),
+        "-o", outputLib.c_str(),
+        nullptr  // argv terminator expected by execv()
+    };
+
+    // Human-readable command line (terminator excluded), for logging only.
+    std::unique_ptr<const char> joinedArgs(
+        rsuJoinStrings(linkerArgv.size()-1, linkerArgv.data()));
+    std::string commandLine(joinedArgs.get());
+
+    const pid_t childPid = fork();
+    if (childPid == -1) {
+        // Error occurred (we attempt no recovery)
+        ALOGE("Couldn't fork for linker (%s) execution", LD_EXE_PATH);
+        return false;
+    }
+    if (childPid == 0) {
+        // Child process: become the linker. execv() returns only on failure.
+        ALOGV("Invoking ld.mc with args '%s'", commandLine.c_str());
+        execv(LD_EXE_PATH, (char* const*) linkerArgv.data());
+
+        ALOGE("execv() failed: %s", strerror(errno));
+        abort();
+        return false;
+    }
+
+    // Parent process (actual driver): block until the linker child finishes.
+    int exitStatus = 0;
+    const pid_t waited = waitpid(childPid, &exitStatus, 0);
+    if (waited == -1) {
+        ALOGE("Could not wait for linker (%s)", LD_EXE_PATH);
+        return false;
+    }
+
+    const bool linkedOk = WIFEXITED(exitStatus) && WEXITSTATUS(exitStatus) == 0;
+    if (!linkedOk) {
+        ALOGE("Linker (%s) terminated unexpectedly", LD_EXE_PATH);
+    }
+    return linkedOk;
+}
+
+#endif  // RS_COMPATIBILITY_LIB
+// Path of the bcc executable; not referenced anywhere else in this file.
+const char* RsdCpuScriptImpl::BCC_EXE_PATH = "/system/bin/bcc";
+
+void* SharedLibraryUtils::loadSharedLibrary(const char *cacheDir,
+                                            const char *resName,
+                                            const char *nativeLibDir) {
+    // Pick the directory the script library is expected in: 64-bit compat-lib
+    // builds use nativeLibDir, every other configuration derives the location
+    // from cacheDir.
+#if defined(RS_COMPATIBILITY_LIB) && defined(__LP64__)
+    const std::string primaryPath = findSharedObjectName(nativeLibDir, resName);
+#else
+    const std::string primaryPath = findSharedObjectName(cacheDir, resName);
+#endif
+
+    // First choice: the standard per-app location for shared libraries.
+    void *handle = loadSOHelper(primaryPath.c_str(), cacheDir, resName);
+    if (handle != nullptr) {
+        return handle;
+    }
+
+    ALOGE("Unable to open shared library (%s): %s",
+          primaryPath.c_str(), dlerror());
+
+#ifdef RS_COMPATIBILITY_LIB
+    // Last resort: look in "/system/lib". Bundled applications have no
+    // private library path and must install there directly, so this lets
+    // them use the compatibility library fallback path. Note that this is
+    // really just a testing path.
+    std::string systemPath("/system/lib/librs.");
+    systemPath += resName;
+    systemPath += ".so";
+
+    handle = loadSOHelper(systemPath.c_str(), cacheDir, resName);
+    if (handle == nullptr) {
+        ALOGE("Unable to open system shared library (%s): %s",
+              systemPath.c_str(), dlerror());
+    }
+#endif
+
+    return handle;
+}
+
+void* SharedLibraryUtils::loadSOHelper(const char *origName, const char *cacheDir,
+                                       const char *resName) {
+    // Process-wide record of every .so path already handed to dlopen(). A
+    // path found here must not be opened again directly: the Script
+    // instances are supposed to be completely independent, and loading the
+    // same file twice would alias their global data. Such a request is
+    // served from a randomly named copy of the original shared object.
+    static std::set<std::string> LoadedLibraries;
+
+    // Nothing to do when the original library is not even present.
+    const bool origPresent = (access(origName, F_OK) == 0);
+    if (!origPresent) {
+        return nullptr;
+    }
+
+    // Common case: first load of this Script/library, open it in place.
+    if (LoadedLibraries.count(origName) == 0) {
+        void *firstHandle = dlopen(origName, RTLD_NOW | RTLD_LOCAL);
+        if (firstHandle != nullptr) {
+            LoadedLibraries.insert(origName);
+        }
+        return firstHandle;
+    }
+
+    // The copy lives under RS_CACHE_DIR. Only add that component when
+    // cacheDir lacks it; in driver mode it is already part of cacheDir.
+    std::string copyPath(cacheDir);
+    const bool hasCacheComponent =
+        copyPath.find(RS_CACHE_DIR) != std::string::npos;
+    if (!hasCacheComponent) {
+        copyPath += "/";
+        copyPath += RS_CACHE_DIR;
+        copyPath += "/";
+    }
+
+    if (!ensureCacheDirExists(copyPath.c_str())) {
+        ALOGE("Could not verify or create cache dir: %s", cacheDir);
+        return nullptr;
+    }
+
+    // Extend the directory into a randomized file name for the copy.
+    copyPath += "librs.";
+    copyPath += resName;
+    copyPath += "#";
+    copyPath += getRandomString(6);  // 62^6 potential filename variants.
+    copyPath += ".so";
+
+    if (copyFile(copyPath.c_str(), origName) != 0) {
+        ALOGE("Could not create copy %s -> %s", origName, copyPath.c_str());
+        return nullptr;
+    }
+
+    // Open the copy, then remove it from the file system right away; the
+    // unlink() is attempted whether or not dlopen() succeeded.
+    void *copyHandle = dlopen(copyPath.c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (unlink(copyPath.c_str()) != 0) {
+        ALOGE("Could not unlink copy %s", copyPath.c_str());
+    }
+    if (copyHandle != nullptr) {
+        LoadedLibraries.insert(copyPath.c_str());
+    }
+
+    return copyHandle;
+}
+
+#define MAXLINE 500  // capacity of the buffers used to parse .rs.info lines
+#define MAKE_STR_HELPER(S) #S  // stringifies S exactly as written
+#define MAKE_STR(S) MAKE_STR_HELPER(S)  // expands S first: MAKE_STR(MAXLINE) is "500"
+#define EXPORT_VAR_STR "exportVarCount: "  // this and the prefixes below are matched with sscanf()
+#define EXPORT_FUNC_STR "exportFuncCount: "
+#define EXPORT_FOREACH_STR "exportForEachCount: "
+#define OBJECT_SLOT_STR "objectSlotCount: "
+#define PRAGMA_STR "pragmaCount: "
+#define THREADABLE_STR "isThreadable: "  // followed by "yes" or "no"
+
+// Copy one line from *ppstr into s: up to and including a newline, or at most
+// size - 1 characters, always NUL-terminating s. *ppstr is advanced past the
+// characters consumed. Returns s, or nullptr if there is no input left.
+static char* strgets(char *s, int size, const char **ppstr) {
+    if (!ppstr || !*ppstr || **ppstr == '\0' || size < 1) {
+        return nullptr;
+    }
+
+    int i;
+    for (i = 0; i < (size - 1); i++) {
+        s[i] = **ppstr;
+        if (s[i] == '\0') {
+            // Do not step past the terminator; the next call must see it.
+            return s;
+        }
+        (*ppstr)++;
+        if (s[i] == '\n') {
+            s[i+1] = '\0';
+            return s;
+        }
+    }
+    s[i] = '\0';  // size has been exceeded.
+    return s;
+}
+
+// Parse the ".rs.info" text exported by sharedObj and resolve every listed
+// variable, function and forEach kernel with dlsym(). Returns a new
+// ScriptExecutable that owns the resulting tables, or nullptr on failure.
+ScriptExecutable* ScriptExecutable::createFromSharedObject(
+    Context* RSContext, void* sharedObj) {
+    char line[MAXLINE];
+
+    size_t varCount = 0;
+    size_t funcCount = 0;
+    size_t forEachCount = 0;
+    size_t objectSlotCount = 0;
+    size_t pragmaCount = 0;
+    bool isThreadable = true;
+
+    void** fieldAddress = nullptr;
+    bool* fieldIsObject = nullptr;
+    InvokeFunc_t* invokeFunctions = nullptr;
+    ForEachFunc_t* forEachFunctions = nullptr;
+    uint32_t* forEachSignatures = nullptr;
+    const char ** pragmaKeys = nullptr;
+    const char ** pragmaValues = nullptr;
+
+    const char *rsInfo = (const char *) dlsym(sharedObj, ".rs.info");
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        return nullptr;
+    }
+    if (sscanf(line, EXPORT_VAR_STR "%zu", &varCount) != 1) {
+        ALOGE("Invalid export var count!: %s", line);
+        return nullptr;
+    }
+
+    fieldAddress = new void*[varCount];
+    if (fieldAddress == nullptr) {
+        return nullptr;
+    }
+
+    fieldIsObject = new bool[varCount];
+    if (fieldIsObject == nullptr) {
+        goto error;
+    }
+
+    for (size_t i = 0; i < varCount; ++i) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+        char *c = strrchr(line, '\n');
+        if (c) {
+            *c = '\0';
+        }
+        void* addr = dlsym(sharedObj, line);
+        if (addr == nullptr) {
+            ALOGE("Failed to find variable address for %s: %s",
+                  line, dlerror());
+            // Not a critical error if we don't find a global variable.
+        }
+        fieldAddress[i] = addr;
+        fieldIsObject[i] = false;
+    }
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        goto error;
+    }
+    if (sscanf(line, EXPORT_FUNC_STR "%zu", &funcCount) != 1) {
+        ALOGE("Invalid export func count!: %s", line);
+        goto error;
+    }
+
+    invokeFunctions = new InvokeFunc_t[funcCount];
+    if (invokeFunctions == nullptr) {
+        goto error;
+    }
+
+    for (size_t i = 0; i < funcCount; ++i) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+        char *c = strrchr(line, '\n');
+        if (c) {
+            *c = '\0';
+        }
+
+        invokeFunctions[i] = (InvokeFunc_t) dlsym(sharedObj, line);
+        if (invokeFunctions[i] == nullptr) {
+            ALOGE("Failed to get function address for %s(): %s",
+                  line, dlerror());
+            goto error;
+        }
+    }
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        goto error;
+    }
+    if (sscanf(line, EXPORT_FOREACH_STR "%zu", &forEachCount) != 1) {
+        ALOGE("Invalid export forEach count!: %s", line);
+        goto error;
+    }
+
+    forEachFunctions = new ForEachFunc_t[forEachCount];
+    if (forEachFunctions == nullptr) {
+        goto error;
+    }
+
+    forEachSignatures = new uint32_t[forEachCount];
+    if (forEachSignatures == nullptr) {
+        goto error;
+    }
+
+    for (size_t i = 0; i < forEachCount; ++i) {
+        unsigned int tmpSig = 0;
+        char tmpName[MAXLINE];
+
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+        if (sscanf(line, "%u - %" MAKE_STR(MAXLINE) "s",
+                   &tmpSig, tmpName) != 2) {
+          ALOGE("Invalid export forEach!: %s", line);
+          goto error;
+        }
+
+        // Lookup the expanded ForEach kernel.
+        strncat(tmpName, ".expand", MAXLINE-1-strlen(tmpName));
+        forEachSignatures[i] = tmpSig;
+        forEachFunctions[i] =
+            (ForEachFunc_t) dlsym(sharedObj, tmpName);
+        if (i != 0 && forEachFunctions[i] == nullptr) {
+            // Ignore missing root.expand functions.
+            // root() is always specified at location 0.
+            ALOGE("Failed to find forEach function address for %s: %s",
+                  tmpName, dlerror());
+            goto error;
+        }
+    }
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        goto error;
+    }
+    if (sscanf(line, OBJECT_SLOT_STR "%zu", &objectSlotCount) != 1) {
+        ALOGE("Invalid object slot count!: %s", line);
+        goto error;
+    }
+
+    for (size_t i = 0; i < objectSlotCount; ++i) {
+        uint32_t varNum = 0;
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            goto error;
+        }
+        if (sscanf(line, "%u", &varNum) != 1) {
+            ALOGE("Invalid object slot!: %s", line);
+            goto error;
+        }
+
+        if (varNum < varCount) {
+            fieldIsObject[varNum] = true;
+        }
+    }
+
+#ifndef RS_COMPATIBILITY_LIB
+    // Do not attempt to read pragmas or isThreadable flag in compat lib path.
+    // Neither is applicable for compat lib
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        goto error;
+    }
+
+    if (sscanf(line, PRAGMA_STR "%zu", &pragmaCount) != 1) {
+        ALOGE("Invalid pragma count!: %s", line);
+        goto error;
+    }
+
+    pragmaKeys = new const char*[pragmaCount];
+    if (pragmaKeys == nullptr) {
+        goto error;
+    }
+
+    pragmaValues = new const char*[pragmaCount];
+    if (pragmaValues == nullptr) {
+        goto error;
+    }
+
+    bzero(pragmaKeys, sizeof(char*) * pragmaCount);
+    bzero(pragmaValues, sizeof(char*) * pragmaCount);
+
+    for (size_t i = 0; i < pragmaCount; ++i) {
+        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+            ALOGE("Unable to read pragma at index %zu!", i);
+            goto error;
+        }
+
+        char key[MAXLINE];
+        char value[MAXLINE] = ""; // initialize in case value is empty
+
+        // Pragmas may have a key and no value, so only the key is required.
+        // A blank line makes sscanf() return EOF (-1), hence "< 1", not "== 0".
+        if (sscanf(line, "%" MAKE_STR(MAXLINE) "s - %" MAKE_STR(MAXLINE) "s",
+                   key, value) < 1 ||
+            strlen(key) == 0) {
+            ALOGE("Invalid pragma value!: %s", line);
+            goto error;
+        }
+
+        char *pKey = new char[strlen(key)+1];
+        strcpy(pKey, key);
+        pragmaKeys[i] = pKey;
+
+        char *pValue = new char[strlen(value)+1];
+        strcpy(pValue, value);
+        pragmaValues[i] = pValue;
+    }
+
+    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
+        goto error;
+    }
+
+    char tmpFlag[5];  // "%4s" stores up to 4 characters plus the NUL.
+    if (sscanf(line, THREADABLE_STR "%4s", tmpFlag) != 1) {
+        ALOGE("Invalid threadable flag!: %s", line);
+        goto error;
+    }
+    if (strcmp(tmpFlag, "yes") == 0) {
+        isThreadable = true;
+    } else if (strcmp(tmpFlag, "no") == 0) {
+        isThreadable = false;
+    } else {
+        ALOGE("Invalid threadable flag!: %s", tmpFlag);
+        goto error;
+    }
+
+#endif  // RS_COMPATIBILITY_LIB
+
+    return new ScriptExecutable(
+        RSContext, fieldAddress, fieldIsObject, varCount,
+        invokeFunctions, funcCount,
+        forEachFunctions, forEachSignatures, forEachCount,
+        pragmaKeys, pragmaValues, pragmaCount,
+        isThreadable);
+
+error:
+
+#ifndef RS_COMPATIBILITY_LIB
+    for (size_t idx = 0; idx < pragmaCount; ++idx) {
+        delete [] pragmaKeys[idx];
+        delete [] pragmaValues[idx];
+    }
+
+    delete[] pragmaValues;
+    delete[] pragmaKeys;
+#endif  // RS_COMPATIBILITY_LIB
+
+    delete[] forEachSignatures;
+    delete[] forEachFunctions;
+    delete[] invokeFunctions;
+    delete[] fieldIsObject;
+    delete[] fieldAddress;
+
+    return nullptr;
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
new file mode 100644
index 0000000..acf092c
--- /dev/null
+++ b/cpu_ref/rsCpuExecutable.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RENDERSCRIPT_EXECUTABLE_H
+#define ANDROID_RENDERSCRIPT_EXECUTABLE_H
+
+#include <stdlib.h>
+
+#include "rsCpuScript.h"
+
+namespace android {
+namespace renderscript {
+
+class Context;
+
+class SharedLibraryUtils {
+public:
+#ifndef RS_COMPATIBILITY_LIB
+    // Link <cacheDir>/<resName>.o into a shared library; true on success.
+    static bool createSharedLibrary(const char* cacheDir, const char* resName);
+#endif
+
+    // Load the shared library referred to by cacheDir and resName. If we have
+    // already loaded this library, we instead create a new copy (in the
+    // cache dir) and then load that. We then immediately destroy the copy.
+    // This is required behavior to implement script instancing for the support
+    // library, since shared objects are loaded and de-duped by name only.
+    // For 64bit RS Support Lib, the shared lib path cannot be constructed from
+    // cacheDir, so nativeLibDir is needed to load shared libs.
+    static void* loadSharedLibrary(const char *cacheDir, const char *resName,
+                                   const char *nativeLibDir = nullptr);
+
+private:
+    // Attempt to load the shared library from origName, but then fall back to
+    // creating a copy of the shared library if necessary (to ensure instancing).
+    // This function returns the dlopen()-ed handle if successful.
+    static void *loadSOHelper(const char *origName, const char *cacheDir,
+                              const char *resName);
+
+    static const char* LD_EXE_PATH;   // Path of the ld.mc linker binary.
+    static const char* RS_CACHE_DIR;  // Directory name used for library copies.
+};
+
+// Holds the lookup tables (exported variables, invokable functions, forEach
+// kernels, pragmas) of one script; adopts the arrays and frees them itself.
+class ScriptExecutable {
+public:
+    ScriptExecutable(Context* RSContext,
+                     void** fieldAddress, bool* fieldIsObject, size_t varCount,
+                     InvokeFunc_t* invokeFunctions, size_t funcCount,
+                     ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
+                     size_t forEachCount,
+                     const char ** pragmaKeys, const char ** pragmaValues,
+                     size_t pragmaCount,
+                     bool isThreadable) :
+        mFieldAddress(fieldAddress), mFieldIsObject(fieldIsObject),
+            mExportedVarCount(varCount),
+            mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
+            mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
+            mForEachCount(forEachCount),
+            mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
+            mPragmaCount(pragmaCount), mIsThreadable(isThreadable), mRS(RSContext) {}
+
+    // The destructor frees the raw arrays, so a copy would free them twice.
+    ScriptExecutable(const ScriptExecutable&) = delete;
+    ScriptExecutable& operator=(const ScriptExecutable&) = delete;
+
+    ~ScriptExecutable() {
+        // Call rsrClearObject() on every object-typed variable with an address.
+        for (size_t i = 0; i < mExportedVarCount; ++i) {
+            if (mFieldIsObject[i] && mFieldAddress[i] != nullptr) {
+                rs_object_base *obj_addr =
+                        reinterpret_cast<rs_object_base *>(mFieldAddress[i]);
+                rsrClearObject(mRS, obj_addr);
+            }
+        }
+        for (size_t i = 0; i < mPragmaCount; ++i) {
+            delete [] mPragmaKeys[i];
+            delete [] mPragmaValues[i];
+        }
+
+        delete[] mPragmaValues;
+        delete[] mPragmaKeys;
+        delete[] mForEachSignatures;
+        delete[] mForEachFunctions;
+        delete[] mInvokeFunctions;
+        delete[] mFieldIsObject;
+        delete[] mFieldAddress;
+    }
+
+    // Build an instance from the ".rs.info" of a dlopen()-ed script library.
+    static ScriptExecutable*
+            createFromSharedObject(Context* RSContext, void* sharedObj);
+
+    size_t getExportedVariableCount() const { return mExportedVarCount; }
+    size_t getExportedFunctionCount() const { return mFuncCount; }
+    size_t getExportedForEachCount() const { return mForEachCount; }
+    size_t getPragmaCount() const { return mPragmaCount; }
+
+    // Slot accessors; slot is not range-checked against the matching count.
+    void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
+    bool getFieldIsObject(int slot) const { return mFieldIsObject[slot]; }
+    InvokeFunc_t getInvokeFunction(int slot) const { return mInvokeFunctions[slot]; }
+    ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; }
+    uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; }
+
+    const char ** getPragmaKeys() const { return mPragmaKeys; }
+    const char ** getPragmaValues() const { return mPragmaValues; }
+    bool getThreadable() const { return mIsThreadable; }
+
+private:
+    void** mFieldAddress;
+    bool* mFieldIsObject;
+    size_t mExportedVarCount;
+    InvokeFunc_t* mInvokeFunctions;
+    size_t mFuncCount;
+    ForEachFunc_t* mForEachFunctions;
+    uint32_t* mForEachSignatures;
+    size_t mForEachCount;
+
+    const char ** mPragmaKeys;
+    const char ** mPragmaValues;
+    size_t mPragmaCount;
+
+    bool mIsThreadable;
+    Context* mRS;
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_EXECUTABLE_H
diff --git a/cpu_ref/rsCpuIntrinsicBLAS.cpp b/cpu_ref/rsCpuIntrinsicBLAS.cpp
new file mode 100644
index 0000000..486eed8
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsicBLAS.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsCpuIntrinsic.h"
+#include "rsCpuIntrinsicInlines.h"
+#include "cblas.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+namespace android {
+namespace renderscript {
+
+
+// CPU reference-driver script intrinsic that hands BLAS requests to the cblas_* routines.
+class RsdCpuScriptIntrinsicBLAS : public RsdCpuScriptIntrinsic {
+public:
+    RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx, const Script *s);
+    virtual ~RsdCpuScriptIntrinsicBLAS();
+
+    // Fills in the Script's HAL info; this intrinsic reports zero exported variables.
+    virtual void populateScript(Script *);
+
+    // Runs the BLAS routine selected by the RsBlasCall that usr points at.
+    virtual void invokeForEach(uint32_t slot,
+                               const Allocation ** ain,
+                               uint32_t inLen,
+                               Allocation * aout,
+                               const void * usr,
+                               uint32_t usrLen,
+                               const RsScriptCall *sc);
+};
+
+}
+}
+
+void RsdCpuScriptIntrinsicBLAS::populateScript(Script *s) {
+    s->mHal.info.exportedVariableCount = 0;  // this intrinsic reports no exported variables
+}
+
+// Resolves up to three input Allocations into raw buffer pointers and leading dimensions.
+// For each non-null ain[i], the matching pointer receives lod[0].mallocPtr and the matching
+// ld* receives lod[0].stride divided by size (the element size in bytes). Outputs belonging
+// to a null ain[i] are left untouched, so callers pre-initialise them. Callers may pass
+// nullptr for an operand they do not use; that slot is then skipped rather than written.
+static void initABC(const Allocation ** ain, size_t size,
+                    void** A, void** B, void** C,
+                    int* lda, int* ldb, int* ldc)
+{
+    void** const ptrOut[3] = {A, B, C};
+    int* const ldOut[3] = {lda, ldb, ldc};
+
+    for (int i = 0; i < 3; i++) {
+        if (!ain[i]) {
+            continue;
+        }
+        if (ptrOut[i]) {
+            *ptrOut[i] = ain[i]->mHal.drvState.lod[0].mallocPtr;
+        }
+        if (ldOut[i]) {
+            *ldOut[i] = (int)(ain[i]->mHal.drvState.lod[0].stride/size);
+        }
+    }
+}
+
+// Dispatches one BLAS request. usr points at an RsBlasCall that selects the routine
+// (call->func) and carries its scalar arguments; the operand buffers and their leading
+// dimensions are taken from ain[0..2] via initABC, in the order passed to each call below.
+// All routines are invoked with CblasRowMajor. slot, inLen, aout, usrLen and sc are unused.
+// Unknown function IDs are logged and otherwise ignored.
+void RsdCpuScriptIntrinsicBLAS::invokeForEach(uint32_t slot,
+                                              const Allocation ** ain,
+                                              uint32_t inLen,
+                                              Allocation * aout,
+                                              const void * usr,
+                                              uint32_t usrLen,
+                                              const RsScriptCall *sc) {
+    RsBlasCall* call = (RsBlasCall*) usr;
+    // setup BLAS enum args
+    enum CBLAS_TRANSPOSE TransA = (enum CBLAS_TRANSPOSE)call->transA;
+    enum CBLAS_TRANSPOSE TransB = (enum CBLAS_TRANSPOSE)call->transB;
+    enum CBLAS_UPLO Uplo = (enum CBLAS_UPLO)call->uplo;
+    enum CBLAS_DIAG Diag = (enum CBLAS_DIAG)call->diag;
+    enum CBLAS_SIDE Side = (enum CBLAS_SIDE)call->side;
+
+    void *A = nullptr;
+    void *B = nullptr;
+    void *C = nullptr;
+    void *X = nullptr;
+    void *Y = nullptr;
+
+    int lda = 0, ldb = 0, ldc = 0;
+
+    switch (call->func) {
+
+    // Level 1 BLAS (returns into a 1D Allocation): no routines are dispatched here
+
+    // Level 2 BLAS
+    case (RsBlas_sgemv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.f, (float*)A,
+                    lda, (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_sgbmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    call->alpha.f, (float*)A, lda, (float*)X, call->incX,
+                    call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_strmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_strmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    case (RsBlas_stbmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    // stpmv takes a packed 1D Allocation only
+    case (RsBlas_stpmv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_strsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_strsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A, lda,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_stbsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (float*)A,
+                    lda, (float*)X, call->incX);
+        break;
+    case (RsBlas_stpsv):
+        initABC(ain, sizeof(float), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_stpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (float*)A,
+                    (float*)X, call->incX);
+        break;
+    case (RsBlas_dgemv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dgemv(CblasRowMajor, TransA, call->M, call->N, call->alpha.d, (double*)A,
+                    lda, (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dgbmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    call->alpha.d, (double*)A, lda, (double*)X, call->incX,
+                    call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dtrmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    case (RsBlas_dtbmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    // dtpmv takes a packed 1D Allocation only
+    case (RsBlas_dtpmv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_dtrsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A, lda,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_dtbsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (double*)A,
+                    lda, (double*)X, call->incX);
+        break;
+    case (RsBlas_dtpsv):
+        initABC(ain, sizeof(double), &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_dtpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (double*)A,
+                    (double*)X, call->incX);
+        break;
+    case (RsBlas_cgemv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_cgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.c, (void*)A,
+                    lda, (void*)X, call->incX, (void*)&call->beta.c, (void*)Y, call->incY);
+        break;
+    case (RsBlas_cgbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_cgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    (void*)&call->alpha.c, (void*)A, lda, (void*)X, call->incX,
+                    (void*)&call->beta.c, (void*)Y, call->incY);
+        break;
+    case (RsBlas_ctrmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ctbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    // ctpmv takes a packed 1D Allocation only
+    case (RsBlas_ctpmv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ctrsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ctbsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ctpsv):
+        initABC(ain, sizeof(float)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ctpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_zgemv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zgemv(CblasRowMajor, TransA, call->M, call->N, (void*)&call->alpha.z, (void*)A,
+                    lda, (void*)X, call->incX, (void*)&call->beta.z, (void*)Y, call->incY);
+        break;
+    case (RsBlas_zgbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zgbmv(CblasRowMajor, TransA, call->M, call->N, call->KL, call->KU,
+                    (void*)&call->alpha.z, (void*)A, lda, (void*)X, call->incX,
+                    (void*)&call->beta.z, (void*)Y, call->incY);
+        break;
+    case (RsBlas_ztrmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ztbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztbmv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    // ztpmv takes a packed 1D Allocation only
+    case (RsBlas_ztpmv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztpmv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ztrsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A, lda,
+                    (void*)X, call->incX);
+        break;
+    case (RsBlas_ztbsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztbsv(CblasRowMajor, Uplo, TransA, Diag, call->N, call->K, (void*)A,
+                    lda, (void*)X, call->incX);
+        break;
+    case (RsBlas_ztpsv):
+        initABC(ain, sizeof(double)*2, &A, &X, nullptr, &lda, &ldb, nullptr);
+        cblas_ztpsv(CblasRowMajor, Uplo, TransA, Diag, call->N, (void*)A,
+                    (void*)X, call->incX);
+        break;
+
+    // S and D only
+    case (RsBlas_ssymv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_ssymv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A, lda,
+                    (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    case (RsBlas_ssbmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_ssbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.f,
+                    (float*)A, lda, (float*)X, call->incX, call->beta.f,
+                    (float*)Y, call->incY);
+        break;
+    //sspmv requires a packed 1D Allocation
+    case (RsBlas_sspmv):
+        initABC(ain, sizeof(float), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_sspmv(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)A,
+                    (float*)X, call->incX, call->beta.f, (float*)Y, call->incY);
+        break;
+    // following calls have init reordered because A is output matrix
+    case (RsBlas_sger):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_sger(CblasRowMajor, call->M, call->N, call->alpha.f, (float*)X,
+                   call->incX, (float*)Y, call->incY, (float*)A, lda);
+        break;
+    case (RsBlas_ssyr):
+        initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_ssyr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                   (float*)A, lda);
+        break;
+    // sspr is packed 1D Allocation A only
+    case (RsBlas_sspr):
+        initABC(ain, sizeof(float), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_sspr(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                   (float*)A);
+        break;
+    case (RsBlas_ssyr2):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_ssyr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                    (float*)Y, call->incY, (float*)A, lda);
+        break;
+    // sspr2 is packed 1D Allocation A only
+    case (RsBlas_sspr2):
+        initABC(ain, sizeof(float), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_sspr2(CblasRowMajor, Uplo, call->N, call->alpha.f, (float*)X, call->incX,
+                    (float*)Y, call->incY, (float*)A);
+        break;
+    case (RsBlas_dsymv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dsymv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A, lda,
+                    (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    case (RsBlas_dsbmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dsbmv(CblasRowMajor, Uplo, call->N, call->K, call->alpha.d,
+                    (double*)A, lda, (double*)X, call->incX, call->beta.d,
+                    (double*)Y, call->incY);
+        break;
+    // dspmv requires a packed 1D Allocation
+    case (RsBlas_dspmv):
+        initABC(ain, sizeof(double), &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_dspmv(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)A,
+                    (double*)X, call->incX, call->beta.d, (double*)Y, call->incY);
+        break;
+    // following calls have init reordered because A is output matrix
+    case (RsBlas_dger):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dger(CblasRowMajor, call->M, call->N, call->alpha.d, (double*)X,
+                   call->incX, (double*)Y, call->incY, (double*)A, lda);
+        break;
+    case (RsBlas_dsyr):
+        initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_dsyr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                   (double*)A, lda);
+        break;
+    // dspr is packed 1D Allocation A only
+    case (RsBlas_dspr):
+        initABC(ain, sizeof(double), &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_dspr(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                   (double*)A);
+        break;
+    case (RsBlas_dsyr2):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dsyr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                    (double*)Y, call->incY, (double*)A, lda);
+        break;
+    // dspr2 is packed 1D Allocation A only
+    case (RsBlas_dspr2):
+        initABC(ain, sizeof(double), &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_dspr2(CblasRowMajor, Uplo, call->N, call->alpha.d, (double*)X, call->incX,
+                    (double*)Y, call->incY, (double*)A);
+        break;
+
+    // C and Z only
+    case (RsBlas_chemv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A, lda,
+                    X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_chbmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.c,
+                    A, lda, X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_chpmv):
+        initABC(ain, sizeof(float)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_chpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, A,
+                    X, call->incX, (void*)&call->beta.c, Y, call->incY);
+        break;
+    case (RsBlas_cgeru):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_cgerc):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.c,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_cher):
+        initABC(ain, sizeof(float)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_cher(CblasRowMajor, Uplo, call->N, call->alpha.f,
+                   X, call->incX, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_chpr):
+        initABC(ain, sizeof(float)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_chpr(CblasRowMajor, Uplo, call->N, call->alpha.f, X,
+                   call->incX, A);
+        break;
+    case (RsBlas_cher2):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_cher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c,
+                   X, call->incX, Y, call->incY, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_chpr2):
+        initABC(ain, sizeof(float)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_chpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.c, X,
+                   call->incX, Y, call->incY, A);
+        break;
+    case (RsBlas_zhemv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhemv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A, lda,
+                    X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zhbmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhbmv(CblasRowMajor, Uplo, call->N, call->K, (void*)&call->alpha.z,
+                    A, lda, X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zhpmv):
+        initABC(ain, sizeof(double)*2, &A, &X, &Y, &lda, &ldb, &ldc);
+        cblas_zhpmv(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, A,
+                    X, call->incX, (void*)&call->beta.z, Y, call->incY);
+        break;
+    case (RsBlas_zgeru):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zgeru(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_zgerc):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zgerc(CblasRowMajor, call->M, call->N, (void*)&call->alpha.z,
+                    X, call->incX, Y, call->incY, A, lda);
+        break;
+    case (RsBlas_zher):
+        initABC(ain, sizeof(double)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_zher(CblasRowMajor, Uplo, call->N, call->alpha.d,
+                   X, call->incX, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_zhpr):
+        initABC(ain, sizeof(double)*2, &X, &A, nullptr, &ldb, &lda, nullptr);
+        cblas_zhpr(CblasRowMajor, Uplo, call->N, call->alpha.d, X,
+                   call->incX, A);
+        break;
+    case (RsBlas_zher2):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zher2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z,
+                   X, call->incX, Y, call->incY, A, lda);
+        break;
+    // packed 1D Allocations only
+    case (RsBlas_zhpr2):
+        initABC(ain, sizeof(double)*2, &X, &Y, &A, &ldb, &ldc, &lda);
+        cblas_zhpr2(CblasRowMajor, Uplo, call->N, (void*)&call->alpha.z, X,
+                   call->incX, Y, call->incY, A);
+        break;
+
+    // Level 3 BLAS
+    case (RsBlas_sgemm):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_sgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssymm):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_ssymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.f, (float*)A,
+                    lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssyrk):
+        initABC(ain, sizeof(float), &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_ssyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
+                    lda, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_ssyr2k):
+        initABC(ain, sizeof(float), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_ssyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, (float*)A,
+                     lda, (float*)B, ldb, call->beta.f, (float*)C, ldc);
+        break;
+    case (RsBlas_strmm):
+        initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_strmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb);
+        break;
+    case (RsBlas_strsm):
+        initABC(ain, sizeof(float), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_strsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.f,
+                    (float*)A, lda, (float*)B, ldb);
+        break;
+
+    case (RsBlas_dgemm):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsymm):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dsymm(CblasRowMajor, Side, Uplo, call->M, call->N, call->alpha.d, (double*)A,
+                    lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsyrk):
+        initABC(ain, sizeof(double), &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_dsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
+                    lda, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dsyr2k):
+        initABC(ain, sizeof(double), &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_dsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, (double*)A,
+                     lda, (double*)B, ldb, call->beta.d, (double*)C, ldc);
+        break;
+    case (RsBlas_dtrmm):
+        initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb);
+        break;
+    case (RsBlas_dtrsm):
+        initABC(ain, sizeof(double), &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_dtrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, call->alpha.d,
+                    (double*)A, lda, (double*)B, ldb);
+        break;
+
+    case (RsBlas_cgemm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_cgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.c,
+                    A, lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csymm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_csymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A,
+                    lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csyrk):
+        initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_csyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
+                    lda, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_csyr2k):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_csyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A,
+                     lda, B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_ctrmm):
+        initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
+                    A, lda, B, ldb);
+        break;
+    case (RsBlas_ctrsm):
+        initABC(ain, sizeof(float)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ctrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.c,
+                    A, lda, B, ldb);
+        break;
+
+    case (RsBlas_zgemm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zgemm(CblasRowMajor, TransA, TransB, call->M, call->N, call->K, (void*)&call->alpha.z,
+                    A, lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsymm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zsymm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A,
+                    lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsyrk):
+        initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_zsyrk(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
+                    lda, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zsyr2k):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zsyr2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A,
+                     lda, B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_ztrmm):
+        initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrmm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
+                    A, lda, B, ldb);
+        break;
+    case (RsBlas_ztrsm):
+        initABC(ain, sizeof(double)*2, &A, &B, nullptr, &lda, &ldb, nullptr);
+        cblas_ztrsm(CblasRowMajor, Side, Uplo, TransA, Diag, call->M, call->N, (void*)&call->alpha.z,
+                    A, lda, B, ldb);
+        break;
+
+    // Level 3 C and Z only
+    case (RsBlas_chemm):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_chemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.c, A, lda,
+                    B, ldb, (void*)&call->beta.c, C, ldc);
+        break;
+    case (RsBlas_cherk):
+        initABC(ain, sizeof(float)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_cherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.f, A, lda,
+                    call->beta.f, C, ldc);
+        break;
+    case (RsBlas_cher2k):
+        initABC(ain, sizeof(float)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_cher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.c, A, lda,
+                     B, ldb, call->beta.f, C, ldc);
+        break;
+
+    case (RsBlas_zhemm):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zhemm(CblasRowMajor, Side, Uplo, call->M, call->N, (void*)&call->alpha.z, A, lda,
+                    B, ldb, (void*)&call->beta.z, C, ldc);
+        break;
+    case (RsBlas_zherk):
+        initABC(ain, sizeof(double)*2, &A, nullptr, &C, &lda, nullptr, &ldc);
+        cblas_zherk(CblasRowMajor, Uplo, TransA, call->N, call->K, call->alpha.d, A, lda,
+                    call->beta.d, C, ldc);
+        break;
+    case (RsBlas_zher2k):
+        initABC(ain, sizeof(double)*2, &A, &B, &C, &lda, &ldb, &ldc);
+        cblas_zher2k(CblasRowMajor, Uplo, TransA, call->N, call->K, (void*)&call->alpha.z, A, lda,
+                     B, ldb, call->beta.d, C, ldc);
+        break;
+
+    default:
+        ALOGE("unimplemented\n");
+    }
+
+}
+
+
+RsdCpuScriptIntrinsicBLAS::RsdCpuScriptIntrinsicBLAS(RsdCpuReferenceImpl *ctx,
+                                                   const Script *s)
+            : RsdCpuScriptIntrinsic(ctx, s, nullptr, RS_SCRIPT_INTRINSIC_ID_BLAS) {
+    // Nothing to set up here: all construction work is the base-class call above, which
+    // receives the BLAS intrinsic ID and nullptr as its third argument.
+}
+
+RsdCpuScriptIntrinsicBLAS::~RsdCpuScriptIntrinsicBLAS() {  // intentionally empty
+}
+
+
+
+
+
+RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
+                                    const Script *s, const Element *e) {
+    // Factory entry point: heap-allocates the BLAS intrinsic; e is accepted but not used here.
+    return new RsdCpuScriptIntrinsicBLAS(ctx, s);
+}
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a618a17..af8b640 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -16,6 +16,7 @@
 
 #include "rsCpuCore.h"
 #include "rsCpuScript.h"
+#include "rsCpuExecutable.h"
 
 #ifdef RS_COMPATIBILITY_LIB
     #include <stdio.h>
@@ -25,7 +26,6 @@
     #include <bcc/BCCContext.h>
     #include <bcc/Config/Config.h>
     #include <bcc/Renderscript/RSCompilerDriver.h>
-    #include <bcc/Renderscript/RSInfo.h>
     #include <bcinfo/MetadataExtractor.h>
     #include <cutils/properties.h>
 
@@ -42,7 +42,6 @@
 #include <dlfcn.h>
 #include <stdlib.h>
 #include <string.h>
-#include <fstream>
 #include <iostream>
 
 #ifdef __LP64__
@@ -52,90 +51,6 @@
 #endif
 
 namespace {
-
-// Create a len length string containing random characters from [A-Za-z0-9].
-static std::string getRandomString(size_t len) {
-    char buf[len + 1];
-    for (size_t i = 0; i < len; i++) {
-        uint32_t r = arc4random() & 0xffff;
-        r %= 62;
-        if (r < 26) {
-            // lowercase
-            buf[i] = 'a' + r;
-        } else if (r < 52) {
-            // uppercase
-            buf[i] = 'A' + (r - 26);
-        } else {
-            // Use a number
-            buf[i] = '0' + (r - 52);
-        }
-    }
-    buf[len] = '\0';
-    return std::string(buf);
-}
-
-// Check if a path exists and attempt to create it if it doesn't.
-static bool ensureCacheDirExists(const char *path) {
-    if (access(path, R_OK | W_OK | X_OK) == 0) {
-        // Done if we can rwx the directory
-        return true;
-    }
-    if (mkdir(path, 0700) == 0) {
-        return true;
-    }
-    return false;
-}
-
-// Copy the file named \p srcFile to \p dstFile.
-// Return 0 on success and -1 if anything wasn't copied.
-static int copyFile(const char *dstFile, const char *srcFile) {
-    std::ifstream srcStream(srcFile);
-    if (!srcStream) {
-        ALOGE("Could not verify or read source file: %s", srcFile);
-        return -1;
-    }
-    std::ofstream dstStream(dstFile);
-    if (!dstStream) {
-        ALOGE("Could not verify or write destination file: %s", dstFile);
-        return -1;
-    }
-    dstStream << srcStream.rdbuf();
-    if (!dstStream) {
-        ALOGE("Could not write destination file: %s", dstFile);
-        return -1;
-    }
-
-    srcStream.close();
-    dstStream.close();
-
-    return 0;
-}
-
-static std::string findSharedObjectName(const char *cacheDir,
-                                        const char *resName) {
-#ifndef RS_SERVER
-    std::string scriptSOName(cacheDir);
-#if defined(RS_COMPATIBILITY_LIB) && !defined(__LP64__)
-    size_t cutPos = scriptSOName.rfind("cache");
-    if (cutPos != std::string::npos) {
-        scriptSOName.erase(cutPos);
-    } else {
-        ALOGE("Found peculiar cacheDir (missing \"cache\"): %s", cacheDir);
-    }
-    scriptSOName.append("/lib/librs.");
-#else
-    scriptSOName.append("/librs.");
-#endif // RS_COMPATIBILITY_LIB
-
-#else
-    std::string scriptSOName("lib");
-#endif // RS_SERVER
-    scriptSOName.append(resName);
-    scriptSOName.append(".so");
-
-    return scriptSOName;
-}
-
 #ifndef RS_COMPATIBILITY_LIB
 
 static bool is_force_recompile() {
@@ -263,211 +178,6 @@
 namespace android {
 namespace renderscript {
 
-const char* SharedLibraryUtils::LD_EXE_PATH = "/system/bin/ld.mc";
-const char* SharedLibraryUtils::RS_CACHE_DIR = "com.android.renderscript.cache";
-
-#ifndef RS_COMPATIBILITY_LIB
-
-bool SharedLibraryUtils::createSharedLibrary(const char *cacheDir, const char *resName) {
-    std::string sharedLibName = findSharedObjectName(cacheDir, resName);
-    std::string objFileName = cacheDir;
-    objFileName.append("/");
-    objFileName.append(resName);
-    objFileName.append(".o");
-
-    const char *compiler_rt = SYSLIBPATH"/libcompiler_rt.so";
-    std::vector<const char *> args = {
-        LD_EXE_PATH,
-        "-shared",
-        "-nostdlib",
-        compiler_rt,
-        "-mtriple", DEFAULT_TARGET_TRIPLE_STRING,
-        "-L", SYSLIBPATH,
-        "-lRSDriver", "-lm", "-lc",
-        objFileName.c_str(),
-        "-o", sharedLibName.c_str(),
-        nullptr
-    };
-
-    std::string cmdLineStr = bcc::getCommandLine(args.size()-1, args.data());
-
-    pid_t pid = fork();
-
-    switch (pid) {
-    case -1: {  // Error occurred (we attempt no recovery)
-        ALOGE("Couldn't fork for linker (%s) execution", LD_EXE_PATH);
-        return false;
-    }
-    case 0: {  // Child process
-        ALOGV("Invoking ld.mc with args '%s'", cmdLineStr.c_str());
-        execv(LD_EXE_PATH, (char* const*) args.data());
-
-        ALOGE("execv() failed: %s", strerror(errno));
-        abort();
-        return false;
-    }
-    default: {  // Parent process (actual driver)
-        // Wait on child process to finish compiling the source.
-        int status = 0;
-        pid_t w = waitpid(pid, &status, 0);
-        if (w == -1) {
-            ALOGE("Could not wait for linker (%s)", LD_EXE_PATH);
-            return false;
-        }
-
-        if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
-            return true;
-        }
-
-        ALOGE("Linker (%s) terminated unexpectedly", LD_EXE_PATH);
-        return false;
-    }
-    }
-}
-
-#endif  // RS_COMPATIBILITY_LIB
-
-
-void* SharedLibraryUtils::loadSharedLibrary(const char *cacheDir, const char *resName, const char *nativeLibDir) {
-    void *loaded = nullptr;
-
-#if defined(RS_COMPATIBILITY_LIB) && defined(__LP64__)
-    std::string scriptSOName = findSharedObjectName(nativeLibDir, resName);
-#else
-    std::string scriptSOName = findSharedObjectName(cacheDir, resName);
-#endif
-
-    // We should check if we can load the library from the standard app
-    // location for shared libraries first.
-    loaded = loadSOHelper(scriptSOName.c_str(), cacheDir, resName);
-
-    if (loaded == nullptr) {
-        ALOGE("Unable to open shared library (%s): %s",
-              scriptSOName.c_str(), dlerror());
-
-#ifdef RS_COMPATIBILITY_LIB
-        // One final attempt to find the library in "/system/lib".
-        // We do this to allow bundled applications to use the compatibility
-        // library fallback path. Those applications don't have a private
-        // library path, so they need to install to the system directly.
-        // Note that this is really just a testing path.
-        std::string scriptSONameSystem("/system/lib/librs.");
-        scriptSONameSystem.append(resName);
-        scriptSONameSystem.append(".so");
-        loaded = loadSOHelper(scriptSONameSystem.c_str(), cacheDir,
-                              resName);
-        if (loaded == nullptr) {
-            ALOGE("Unable to open system shared library (%s): %s",
-                  scriptSONameSystem.c_str(), dlerror());
-        }
-#endif
-    }
-
-    return loaded;
-}
-
-void* SharedLibraryUtils::loadSOHelper(const char *origName, const char *cacheDir,
-                                       const char *resName) {
-    // Keep track of which .so libraries have been loaded. Once a library is
-    // in the set (per-process granularity), we must instead make a copy of
-    // the original shared object (randomly named .so file) and load that one
-    // instead. If we don't do this, we end up aliasing global data between
-    // the various Script instances (which are supposed to be completely
-    // independent).
-    static std::set<std::string> LoadedLibraries;
-
-    void *loaded = nullptr;
-
-    // Skip everything if we don't even have the original library available.
-    if (access(origName, F_OK) != 0) {
-        return nullptr;
-    }
-
-    // Common path is that we have not loaded this Script/library before.
-    if (LoadedLibraries.find(origName) == LoadedLibraries.end()) {
-        loaded = dlopen(origName, RTLD_NOW | RTLD_LOCAL);
-        if (loaded) {
-            LoadedLibraries.insert(origName);
-        }
-        return loaded;
-    }
-
-    std::string newName(cacheDir);
-
-    // Append RS_CACHE_DIR only if it is not found in cacheDir
-    // In driver mode, RS_CACHE_DIR is already appended to cacheDir.
-    if (newName.find(RS_CACHE_DIR) == std::string::npos) {
-        newName.append("/");
-        newName.append(RS_CACHE_DIR);
-        newName.append("/");
-    }
-
-    if (!ensureCacheDirExists(newName.c_str())) {
-        ALOGE("Could not verify or create cache dir: %s", cacheDir);
-        return nullptr;
-    }
-
-    // Construct an appropriately randomized filename for the copy.
-    newName.append("librs.");
-    newName.append(resName);
-    newName.append("#");
-    newName.append(getRandomString(6));  // 62^6 potential filename variants.
-    newName.append(".so");
-
-    int r = copyFile(newName.c_str(), origName);
-    if (r != 0) {
-        ALOGE("Could not create copy %s -> %s", origName, newName.c_str());
-        return nullptr;
-    }
-    loaded = dlopen(newName.c_str(), RTLD_NOW | RTLD_LOCAL);
-    r = unlink(newName.c_str());
-    if (r != 0) {
-        ALOGE("Could not unlink copy %s", newName.c_str());
-    }
-    if (loaded) {
-        LoadedLibraries.insert(newName.c_str());
-    }
-
-    return loaded;
-}
-
-const char* RsdCpuScriptImpl::BCC_EXE_PATH = "/system/bin/bcc";
-
-#define MAXLINE 500
-#define MAKE_STR_HELPER(S) #S
-#define MAKE_STR(S) MAKE_STR_HELPER(S)
-#define EXPORT_VAR_STR "exportVarCount: "
-#define EXPORT_FUNC_STR "exportFuncCount: "
-#define EXPORT_FOREACH_STR "exportForEachCount: "
-#define OBJECT_SLOT_STR "objectSlotCount: "
-#define PRAGMA_STR "pragmaCount: "
-#define THREADABLE_STR "isThreadable: "
-
-// Copy up to a newline or size chars from str -> s, updating str
-// Returns s when successful and nullptr when '\0' is finally reached.
-static char* strgets(char *s, int size, const char **ppstr) {
-    if (!ppstr || !*ppstr || **ppstr == '\0' || size < 1) {
-        return nullptr;
-    }
-
-    int i;
-    for (i = 0; i < (size - 1); i++) {
-        s[i] = **ppstr;
-        (*ppstr)++;
-        if (s[i] == '\0') {
-            return s;
-        } else if (s[i] == '\n') {
-            s[i+1] = '\0';
-            return s;
-        }
-    }
-
-    // size has been exceeded.
-    s[i] = '\0';
-
-    return s;
-}
-
 RsdCpuScriptImpl::RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s) {
     mCtx = ctx;
     mScript = s;
@@ -527,265 +237,6 @@
     return true;
 }
 
-ScriptExecutable* ScriptExecutable::createFromSharedObject(
-    Context* RSContext, void* sharedObj) {
-    char line[MAXLINE];
-
-    size_t varCount = 0;
-    size_t funcCount = 0;
-    size_t forEachCount = 0;
-    size_t objectSlotCount = 0;
-    size_t pragmaCount = 0;
-    bool isThreadable = true;
-
-    void** fieldAddress = nullptr;
-    bool* fieldIsObject = nullptr;
-    InvokeFunc_t* invokeFunctions = nullptr;
-    ForEachFunc_t* forEachFunctions = nullptr;
-    uint32_t* forEachSignatures = nullptr;
-    const char ** pragmaKeys = nullptr;
-    const char ** pragmaValues = nullptr;
-
-    const char *rsInfo = (const char *) dlsym(sharedObj, ".rs.info");
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        return nullptr;
-    }
-    if (sscanf(line, EXPORT_VAR_STR "%zu", &varCount) != 1) {
-        ALOGE("Invalid export var count!: %s", line);
-        return nullptr;
-    }
-
-    fieldAddress = new void*[varCount];
-    if (fieldAddress == nullptr) {
-        return nullptr;
-    }
-
-    fieldIsObject = new bool[varCount];
-    if (fieldIsObject == nullptr) {
-        goto error;
-    }
-
-    for (size_t i = 0; i < varCount; ++i) {
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        char *c = strrchr(line, '\n');
-        if (c) {
-            *c = '\0';
-        }
-        void* addr = dlsym(sharedObj, line);
-        if (addr == nullptr) {
-            ALOGE("Failed to find variable address for %s: %s",
-                  line, dlerror());
-            // Not a critical error if we don't find a global variable.
-        }
-        fieldAddress[i] = addr;
-        fieldIsObject[i] = false;
-    }
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-    if (sscanf(line, EXPORT_FUNC_STR "%zu", &funcCount) != 1) {
-        ALOGE("Invalid export func count!: %s", line);
-        goto error;
-    }
-
-    invokeFunctions = new InvokeFunc_t[funcCount];
-    if (invokeFunctions == nullptr) {
-        goto error;
-    }
-
-    for (size_t i = 0; i < funcCount; ++i) {
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        char *c = strrchr(line, '\n');
-        if (c) {
-            *c = '\0';
-        }
-
-        invokeFunctions[i] = (InvokeFunc_t) dlsym(sharedObj, line);
-        if (invokeFunctions[i] == nullptr) {
-            ALOGE("Failed to get function address for %s(): %s",
-                  line, dlerror());
-            goto error;
-        }
-    }
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-    if (sscanf(line, EXPORT_FOREACH_STR "%zu", &forEachCount) != 1) {
-        ALOGE("Invalid export forEach count!: %s", line);
-        goto error;
-    }
-
-    forEachFunctions = new ForEachFunc_t[forEachCount];
-    if (forEachFunctions == nullptr) {
-        goto error;
-    }
-
-    forEachSignatures = new uint32_t[forEachCount];
-    if (forEachSignatures == nullptr) {
-        goto error;
-    }
-
-    for (size_t i = 0; i < forEachCount; ++i) {
-        unsigned int tmpSig = 0;
-        char tmpName[MAXLINE];
-
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        if (sscanf(line, "%u - %" MAKE_STR(MAXLINE) "s",
-                   &tmpSig, tmpName) != 2) {
-          ALOGE("Invalid export forEach!: %s", line);
-          goto error;
-        }
-
-        // Lookup the expanded ForEach kernel.
-        strncat(tmpName, ".expand", MAXLINE-1-strlen(tmpName));
-        forEachSignatures[i] = tmpSig;
-        forEachFunctions[i] =
-            (ForEachFunc_t) dlsym(sharedObj, tmpName);
-        if (i != 0 && forEachFunctions[i] == nullptr) {
-            // Ignore missing root.expand functions.
-            // root() is always specified at location 0.
-            ALOGE("Failed to find forEach function address for %s: %s",
-                  tmpName, dlerror());
-            goto error;
-        }
-    }
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-    if (sscanf(line, OBJECT_SLOT_STR "%zu", &objectSlotCount) != 1) {
-        ALOGE("Invalid object slot count!: %s", line);
-        goto error;
-    }
-
-    for (size_t i = 0; i < objectSlotCount; ++i) {
-        uint32_t varNum = 0;
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            goto error;
-        }
-        if (sscanf(line, "%u", &varNum) != 1) {
-            ALOGE("Invalid object slot!: %s", line);
-            goto error;
-        }
-
-        if (varNum < varCount) {
-            fieldIsObject[varNum] = true;
-        }
-    }
-
-#ifndef RS_COMPATIBILITY_LIB
-    // Do not attempt to read pragmas or isThreadable flag in compat lib path.
-    // Neither is applicable for compat lib
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-
-    if (sscanf(line, PRAGMA_STR "%zu", &pragmaCount) != 1) {
-        ALOGE("Invalid pragma count!: %s", line);
-        goto error;
-    }
-
-    pragmaKeys = new const char*[pragmaCount];
-    if (pragmaKeys == nullptr) {
-        goto error;
-    }
-
-    pragmaValues = new const char*[pragmaCount];
-    if (pragmaValues == nullptr) {
-        goto error;
-    }
-
-    bzero(pragmaKeys, sizeof(char*) * pragmaCount);
-    bzero(pragmaValues, sizeof(char*) * pragmaCount);
-
-    for (size_t i = 0; i < pragmaCount; ++i) {
-        if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-            ALOGE("Unable to read pragma at index %zu!", i);
-            goto error;
-        }
-
-        char key[MAXLINE];
-        char value[MAXLINE] = ""; // initialize in case value is empty
-
-        // pragmas can just have a key and no value.  Only check to make sure
-        // that the key is not empty
-        if (sscanf(line, "%" MAKE_STR(MAXLINE) "s - %" MAKE_STR(MAXLINE) "s",
-                   key, value) == 0 ||
-            strlen(key) == 0)
-        {
-            ALOGE("Invalid pragma value!: %s", line);
-
-            goto error;
-        }
-
-        char *pKey = new char[strlen(key)+1];
-        strcpy(pKey, key);
-        pragmaKeys[i] = pKey;
-
-        char *pValue = new char[strlen(value)+1];
-        strcpy(pValue, value);
-        pragmaValues[i] = pValue;
-        //ALOGE("Pragma %zu: Key: '%s' Value: '%s'", i, pKey, pValue);
-    }
-
-    if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
-        goto error;
-    }
-
-    char tmpFlag[4];
-    if (sscanf(line, THREADABLE_STR "%4s", tmpFlag) != 1) {
-        ALOGE("Invalid threadable flag!: %s", line);
-        goto error;
-    }
-    if (strcmp(tmpFlag, "yes") == 0) {
-        isThreadable = true;
-    } else if (strcmp(tmpFlag, "no") == 0) {
-        isThreadable = false;
-    } else {
-        ALOGE("Invalid threadable flag!: %s", tmpFlag);
-        goto error;
-    }
-
-#endif  // RS_COMPATIBILITY_LIB
-
-    return new ScriptExecutable(
-        RSContext, fieldAddress, fieldIsObject, varCount,
-        invokeFunctions, funcCount,
-        forEachFunctions, forEachSignatures, forEachCount,
-        pragmaKeys, pragmaValues, pragmaCount,
-        isThreadable);
-
-error:
-
-#ifndef RS_COMPATIBILITY_LIB
-    for (size_t idx = 0; idx < pragmaCount; ++idx) {
-        delete [] pragmaKeys[idx];
-        delete [] pragmaValues[idx];
-    }
-
-    delete[] pragmaValues;
-    delete[] pragmaKeys;
-#endif  // RS_COMPATIBILITY_LIB
-
-    delete[] forEachSignatures;
-    delete[] forEachFunctions;
-    delete[] invokeFunctions;
-    delete[] fieldIsObject;
-    delete[] fieldAddress;
-
-    return nullptr;
-}
-
 bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir,
                             uint8_t const *bitcode, size_t bitcodeSize,
                             uint32_t flags, char const *bccPluginName) {
@@ -836,8 +287,12 @@
     setCompileArguments(&compileArguments, bcFileName, cacheDir, resName, core_lib,
                         useRSDebugContext, bccPluginName);
     // The last argument of compileArguments ia a nullptr, so remove 1 from the size.
-    std::string compileCommandLine =
-                bcc::getCommandLine(compileArguments.size() - 1, compileArguments.data());
+    // NOTE(review): std::unique_ptr<const char> releases the buffer with scalar
+    // delete; confirm this matches how rsuJoinStrings allocates its result
+    // (new[] needs unique_ptr<const char[]>, malloc needs a free()-based deleter).
+    std::unique_ptr<const char> joined(
+        rsuJoinStrings(compileArguments.size() - 1, compileArguments.data()));
+    std::string compileCommandLine (joined.get());
 
     if (!is_force_recompile() && !useRSDebugContext) {
         mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
@@ -868,7 +323,7 @@
         }
     }
 
-    mBitcodeFilePath = bcFileName;
+    mBitcodeFilePath.setTo(bcFileName.c_str());
 
     // Read RS symbol information from the .so.
     if ( !mScriptSO) {
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 829d049..03aabf4 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -26,8 +26,6 @@
 
 #include "rsCpuCore.h"
 
-#include <vector>
-
 namespace bcc {
     class BCCContext;
     class RSCompilerDriver;
@@ -40,118 +38,7 @@
 namespace android {
 namespace renderscript {
 
-class SharedLibraryUtils {
- public:
-#ifndef RS_COMPATIBILITY_LIB
-  static bool createSharedLibrary(const char* cacheDir, const char* resName);
-#endif
-
-  // Load the shared library referred to by cacheDir and resName. If we have
-  // already loaded this library, we instead create a new copy (in the
-  // cache dir) and then load that. We then immediately destroy the copy.
-  // This is required behavior to implement script instancing for the support
-  // library, since shared objects are loaded and de-duped by name only.
-
-  // For 64bit RS Support Lib, the shared lib path cannot be constructed from
-  // cacheDir, so nativeLibDir is needed to load shared libs.
-  static void* loadSharedLibrary(const char *cacheDir, const char *resName,
-                                 const char *nativeLibDir = nullptr);
-
- private:
-  // Attempt to load the shared library from origName, but then fall back to
-  // creating a copy of the shared library if necessary (to ensure instancing).
-  // This function returns the dlopen()-ed handle if successful.
-  static void *loadSOHelper(const char *origName, const char *cacheDir,
-                            const char *resName);
-
-  static const char* LD_EXE_PATH;
-  static const char* RS_CACHE_DIR;
-};
-
-class ScriptExecutable {
- public:
-  ScriptExecutable(Context* RSContext,
-                   void** fieldAddress, bool* fieldIsObject, size_t varCount,
-                   InvokeFunc_t* invokeFunctions, size_t funcCount,
-                   ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
-                   size_t forEachCount,
-                   const char ** pragmaKeys, const char ** pragmaValues,
-                   size_t pragmaCount,
-                   bool isThreadable) :
-      mFieldAddress(fieldAddress), mFieldIsObject(fieldIsObject),
-      mExportedVarCount(varCount),
-      mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
-      mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
-      mForEachCount(forEachCount),
-      mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
-      mPragmaCount(pragmaCount),
-      mIsThreadable(isThreadable), mRS(RSContext) {
-  }
-
-  ~ScriptExecutable() {
-      for (size_t i = 0; i < mExportedVarCount; ++i) {
-          if (mFieldIsObject[i]) {
-              if (mFieldAddress[i] != nullptr) {
-                  rs_object_base *obj_addr =
-                      reinterpret_cast<rs_object_base *>(mFieldAddress[i]);
-                  rsrClearObject(mRS, obj_addr);
-              }
-          }
-      }
-
-      for (size_t i = 0; i < mPragmaCount; ++i) {
-          delete [] mPragmaKeys[i];
-          delete [] mPragmaValues[i];
-      }
-
-      delete[] mPragmaValues;
-      delete[] mPragmaKeys;
-      delete[] mForEachSignatures;
-      delete[] mForEachFunctions;
-      delete[] mInvokeFunctions;
-      delete[] mFieldIsObject;
-      delete[] mFieldAddress;
-  }
-
-  static ScriptExecutable*
-  createFromSharedObject(Context* RSContext, void* sharedObj);
-
-  size_t getExportedVariableCount() const { return mExportedVarCount; }
-  size_t getExportedFunctionCount() const { return mFuncCount; }
-  size_t getExportedForEachCount() const { return mForEachCount; }
-  size_t getPragmaCount() const { return mPragmaCount; }
-
-  void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
-  bool getFieldIsObject(int slot) const { return mFieldIsObject[slot]; }
-  InvokeFunc_t getInvokeFunction(int slot) const { return mInvokeFunctions[slot]; }
-  ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; }
-  uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; }
-
-  const char ** getPragmaKeys() const { return mPragmaKeys; }
-  const char ** getPragmaValues() const { return mPragmaValues; }
-
-  bool getThreadable() const { return mIsThreadable; }
-
- private:
-  void** mFieldAddress;
-  bool* mFieldIsObject;
-  size_t mExportedVarCount;
-
-  InvokeFunc_t* mInvokeFunctions;
-  size_t mFuncCount;
-
-  ForEachFunc_t* mForEachFunctions;
-  uint32_t* mForEachSignatures;
-  size_t mForEachCount;
-
-  const char ** mPragmaKeys;
-  const char ** mPragmaValues;
-  size_t mPragmaCount;
-
-  bool mIsThreadable;
-
-  Context* mRS;
-};
+class ScriptExecutable;
 
 class RsdCpuScriptImpl : public RsdCpuReferenceImpl::CpuScript {
 public:
@@ -242,12 +129,12 @@
     void * mIntrinsicData;
     bool mIsThreadable;
 
- public:
-  static const char* BCC_EXE_PATH;
-  const std::string& getBitcodeFilePath() const { return mBitcodeFilePath; }
+public:
+    static const char* BCC_EXE_PATH;
+    const char* getBitcodeFilePath() const { return mBitcodeFilePath.string(); }
 
- private:
-  std::string mBitcodeFilePath;
+private:
+    String8 mBitcodeFilePath;
 };
 
 Allocation * rsdScriptGetAllocationForPointer(
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 166c80d..281a715 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -124,11 +124,11 @@
 
 
 void CpuScriptGroupImpl::execute() {
-    std::vector<Allocation *> ins;
-    std::vector<char> inExts;
-    std::vector<Allocation *> outs;
-    std::vector<char> outExts;
-    std::vector<const ScriptKernelID *> kernels;
+    Vector<Allocation *> ins;
+    Vector<bool> inExts;
+    Vector<Allocation *> outs;
+    Vector<bool> outExts;
+    Vector<const ScriptKernelID *> kernels;
     bool fieldDep = false;
 
     for (size_t ct=0; ct < mSG->mNodes.size(); ct++) {
@@ -194,11 +194,11 @@
             rsAssert((k->mHasKernelOutput == (aout != nullptr)) &&
                      (k->mHasKernelInput == (ain != nullptr)));
 
-            ins.push_back(ain);
-            inExts.push_back(inExt);
-            outs.push_back(aout);
-            outExts.push_back(outExt);
-            kernels.push_back(k);
+            ins.add(ain);
+            inExts.add(inExt);
+            outs.add(aout);
+            outExts.add(outExt);
+            kernels.add(k);
         }
 
     }
@@ -237,16 +237,10 @@
         }
     } else {
         ScriptList sl;
-
-        /*
-         * TODO: This is a hacky way of doing this and should be replaced by a
-         *       call to std::vector's data() member once we have a C++11
-         *       version of the STL.
-         */
-        sl.ins     = &ins.front();
-        sl.outs    = &outs.front();
-        sl.kernels = &kernels.front();
-        sl.count   = kernels.size();
+        sl.ins = ins.array();
+        sl.outs = outs.array();
+        sl.kernels = kernels.array();
+        sl.count = kernels.size();
 
         uint32_t inLen;
         const Allocation **ains;
@@ -260,27 +254,25 @@
             ains  = const_cast<const Allocation**>(&ins[0]);
         }
 
-        std::vector<const void *> usrPtrs;
-        std::vector<const void *> fnPtrs;
-        std::vector<uint32_t> sigs;
+        Vector<const void *> usrPtrs;
+        Vector<const void *> fnPtrs;
+        Vector<uint32_t> sigs;
         for (size_t ct=0; ct < kernels.size(); ct++) {
             Script *s = kernels[ct]->mScript;
             RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
 
             si->forEachKernelSetup(kernels[ct]->mSlot, &mtls);
-            fnPtrs.push_back((void *)mtls.kernel);
-            usrPtrs.push_back(mtls.fep.usr);
-            sigs.push_back(mtls.fep.usrLen);
+            fnPtrs.add((void *)mtls.kernel);
+            usrPtrs.add(mtls.fep.usr);
+            sigs.add(mtls.fep.usrLen);
             si->preLaunch(kernels[ct]->mSlot, ains, inLen, outs[ct],
                           mtls.fep.usr, mtls.fep.usrLen, nullptr);
         }
-
-        sl.sigs    = &sigs.front();
-        sl.usrPtrs = &usrPtrs.front();
-        sl.fnPtrs  = &fnPtrs.front();
-
-        sl.inExts  = (bool*)&inExts.front();
-        sl.outExts = (bool*)&outExts.front();
+        sl.sigs = sigs.array();
+        sl.usrPtrs = usrPtrs.array();
+        sl.fnPtrs = fnPtrs.array();
+        sl.inExts = inExts.array();
+        sl.outExts = outExts.array();
 
         Script *s = kernels[0]->mScript;
         RsdCpuScriptImpl *si = (RsdCpuScriptImpl *)mCtx->lookupScript(s);
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 80c46a0..7222eb9 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -1,6 +1,9 @@
 #include "rsCpuScriptGroup2.h"
 
 #include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
 
 #include <string>
 #include <vector>
@@ -14,6 +17,7 @@
 #include "rsClosure.h"
 #include "rsContext.h"
 #include "rsCpuCore.h"
+#include "rsCpuExecutable.h"
 #include "rsCpuScript.h"
 #include "rsScript.h"
 #include "rsScriptGroup2.h"
@@ -210,16 +214,6 @@
     args->push_back(nullptr);
 }
 
-string convertListToString(int n, const char* const* strs) {
-    string ret;
-    ret.append(strs[0]);
-    for (int i = 1; i < n; i++) {
-        ret.append(" ");
-        ret.append(strs[i]);
-    }
-    return ret;
-}
-
 bool fuseAndCompile(const char** arguments,
                     const string& commandLine) {
     const pid_t pid = fork();
@@ -288,18 +282,35 @@
         slots.push_back(kernelID->mSlot);
     }
 
-    string outputPath(tempnam(cacheDir, "fused"));
-    string outputFileName = getFileName(outputPath);
-    string objFilePath(outputPath);
-    objFilePath.append(".o");
+    rsAssert(cacheDir != nullptr);
+    string objFilePath(cacheDir);
+    objFilePath.append("/fusedXXXXXX.o");
+    // Find unique object file name, to make following file names unique.
+    // mkstemps fills in the XXXXXX in place; 2 is the length of the ".o" suffix.
+    int tempfd = mkstemps(&objFilePath[0], 2);
+    if (tempfd == -1) {
+        return;
+    }
+    // Only the name is needed, so release the descriptor right away. close() is
+    // deliberately not retried on EINTR: the descriptor may already be released,
+    // and a retry could close one that another thread has since been handed.
+    close(tempfd);
+
+    // Drop the trailing ".o" before extracting the file name.
+    string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
     string rsLibPath(SYSLIBPATH"/libclcore.bc");
     vector<const char*> arguments;
     setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
                           &arguments);
-    string commandLine =
-            convertListToString(arguments.size() - 1, arguments.data());
+    // NOTE(review): unique_ptr<const char> frees with scalar delete; confirm
+    // that matches how rsuJoinStrings allocates the string it returns.
+    std::unique_ptr<const char> joined(
+        rsuJoinStrings(arguments.size() - 1, arguments.data()));
+    string commandLine (joined.get());
 
     if (!fuseAndCompile(arguments.data(), commandLine)) {
+        // Compilation failed: remove the placeholder file mkstemps created.
+        unlink(objFilePath.c_str());
         return;
     }
 
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 196f969..33a1b11 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -1121,13 +1121,12 @@
                                      srcXoff, srcYoff, srcZoff, srcLod);
 }
 
-void rsdAllocationElementData1D(const Context *rsc, const Allocation *alloc,
-                                uint32_t x,
-                                const void *data, uint32_t cIdx, size_t sizeBytes) {
+void rsdAllocationElementData(const Context *rsc, const Allocation *alloc,
+                              uint32_t x, uint32_t y, uint32_t z,
+                              const void *data, uint32_t cIdx, size_t sizeBytes) {
     DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
 
-    size_t eSize = alloc->mHal.state.elementSizeBytes;
-    uint8_t * ptr = GetOffsetPtr(alloc, x, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+    uint8_t * ptr = GetOffsetPtr(alloc, x, y, z, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
 
     const Element * e = alloc->mHal.state.type->getElement()->getField(cIdx);
     ptr += alloc->mHal.state.type->getElement()->getFieldOffsetBytes(cIdx);
@@ -1141,24 +1140,19 @@
     drv->uploadDeferred = true;
 }
 
-void rsdAllocationElementData2D(const Context *rsc, const Allocation *alloc,
-                                uint32_t x, uint32_t y,
-                                const void *data, uint32_t cIdx, size_t sizeBytes) {
-    DrvAllocation *drv = (DrvAllocation *)alloc->mHal.drv;
-
-    size_t eSize = alloc->mHal.state.elementSizeBytes;
-    uint8_t * ptr = GetOffsetPtr(alloc, x, y, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
+// Copies sizeBytes bytes of sub-element field cIdx of the cell at (x, y, z)
+// from the allocation into data. The allocation itself is only read, so no
+// reference counting or deferred-upload flag is involved here.
+void rsdAllocationElementRead(const Context *rsc, const Allocation *alloc,
+                              uint32_t x, uint32_t y, uint32_t z,
+                              void *data, uint32_t cIdx, size_t sizeBytes) {
+    // Start of the cell at (x, y, z).
+    uint8_t * ptr = GetOffsetPtr(alloc, x, y, z, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X);
 
-    const Element * e = alloc->mHal.state.type->getElement()->getField(cIdx);
+    // Advance to the requested field within that cell.
     ptr += alloc->mHal.state.type->getElement()->getFieldOffsetBytes(cIdx);
 
-    if (alloc->mHal.state.hasReferences) {
-        e->incRefs(data);
-        e->decRefs(ptr);
-    }
-
-    memcpy(ptr, data, sizeBytes);
-    drv->uploadDeferred = true;
+    memcpy(data, ptr, sizeBytes);
 }
 
 static void mip565(const Allocation *alloc, int lod, RsAllocationCubemapFace face) {
diff --git a/driver/rsdAllocation.h b/driver/rsdAllocation.h
index 0b6d8bf..eff5e30 100644
--- a/driver/rsdAllocation.h
+++ b/driver/rsdAllocation.h
@@ -161,14 +161,15 @@
                                uint32_t srcXoff, uint32_t srcYoff, uint32_t srcZoff,
                                uint32_t srcLod);
 
-void rsdAllocationElementData1D(const android::renderscript::Context *rsc,
-                                const android::renderscript::Allocation *alloc,
-                                uint32_t x,
-                                const void *data, uint32_t elementOff, size_t sizeBytes);
-void rsdAllocationElementData2D(const android::renderscript::Context *rsc,
-                                const android::renderscript::Allocation *alloc,
-                                uint32_t x, uint32_t y,
-                                const void *data, uint32_t elementOff, size_t sizeBytes);
+void rsdAllocationElementData(const android::renderscript::Context *rsc,
+                              const android::renderscript::Allocation *alloc,
+                              uint32_t x, uint32_t y, uint32_t z,
+                              const void *data, uint32_t elementOff, size_t sizeBytes);
+
+void rsdAllocationElementRead(const android::renderscript::Context *rsc,
+                              const android::renderscript::Allocation *alloc,
+                              uint32_t x, uint32_t y, uint32_t z,
+                              void *data, uint32_t elementOff, size_t sizeBytes);
 
 void rsdAllocationGenerateMipmaps(const android::renderscript::Context *rsc,
                                   const android::renderscript::Allocation *alloc);
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 811fa3e..a57409d 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <vector>
-
 #include "../cpu_ref/rsd_cpu.h"
 
 #include "rsdCore.h"
@@ -28,6 +26,7 @@
 #include "rsScriptC.h"
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+#include "utils/Vector.h"
 #include "utils/Timers.h"
 #include "utils/StopWatch.h"
 #endif
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index 8812446..5390faf 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -108,8 +108,8 @@
         rsdAllocationData1D_alloc,
         rsdAllocationData2D_alloc,
         rsdAllocationData3D_alloc,
-        rsdAllocationElementData1D,
-        rsdAllocationElementData2D,
+        rsdAllocationElementData,
+        rsdAllocationElementRead,
         rsdAllocationGenerateMipmaps,
         rsdAllocationUpdateCachedObject,
         rsdAllocationAdapterOffset
diff --git a/driver/rsdMeshObj.cpp b/driver/rsdMeshObj.cpp
index ac1780c..974b41d 100644
--- a/driver/rsdMeshObj.cpp
+++ b/driver/rsdMeshObj.cpp
@@ -112,9 +112,9 @@
             mAttribs[userNum].type = rsdTypeToGLType(f->mHal.state.dataType);
             mAttribs[userNum].normalized = f->mHal.state.dataType != RS_TYPE_FLOAT_32;
             mAttribs[userNum].stride = stride;
-            std::string tmp(RS_SHADER_ATTR);
+            String8 tmp(RS_SHADER_ATTR);
             tmp.append(elem->mHal.state.fieldNames[fieldI]);
-            mAttribs[userNum].name = tmp.c_str();
+            mAttribs[userNum].name = tmp.string();
 
             // Remember which allocation this attribute came from
             mAttribAllocationIndex[userNum] = ct;
diff --git a/driver/rsdShader.cpp b/driver/rsdShader.cpp
index 7511883..eaf679a 100644
--- a/driver/rsdShader.cpp
+++ b/driver/rsdShader.cpp
@@ -41,14 +41,13 @@
     init(textureNames, textureNamesCount, textureNamesLength);
 
     for(size_t i=0; i < textureNamesCount; i++) {
-        mTextureNames.push_back(std::string(textureNames[i],
-                                            textureNamesLength[i]));
+        mTextureNames.push(String8(textureNames[i], textureNamesLength[i]));
     }
 }
 
 RsdShader::~RsdShader() {
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        StateBasedKey *state = mStateBasedShaders[i];
+        StateBasedKey *state = mStateBasedShaders.itemAt(i);
         if (state->mShaderID) {
             glDeleteShader(state->mShaderID);
         }
@@ -77,7 +76,7 @@
     RsdShader::StateBasedKey *returnKey = nullptr;
 
     for (uint32_t i = 0; i < mStateBasedShaders.size(); i ++) {
-        returnKey = mStateBasedShaders[i];
+        returnKey = mStateBasedShaders.itemAt(i);
 
         for (uint32_t ct = 0; ct < mRSProgram->mHal.state.texturesCount; ct ++) {
             uint32_t texType = 0;
@@ -109,7 +108,7 @@
     // We have not created a shader for this particular state yet
     state = new StateBasedKey(mTextureCount);
     mCurrentState = state;
-    mStateBasedShaders.push_back(state);
+    mStateBasedShaders.add(state);
     createShader();
     loadShader(rsc);
     return mCurrentState->mShaderID;
diff --git a/driver/rsdShader.h b/driver/rsdShader.h
index 0dc5102..caccc09 100644
--- a/driver/rsdShader.h
+++ b/driver/rsdShader.h
@@ -49,7 +49,7 @@
     // Add ability to get all ID's to clean up the cached program objects
     uint32_t getStateBasedIDCount() const { return mStateBasedShaders.size(); }
     uint32_t getStateBasedID(uint32_t index) const {
-        return mStateBasedShaders[index]->mShaderID;
+        return mStateBasedShaders.itemAt(index)->mShaderID;
     }
 
     uint32_t getAttribCount() const {return mAttribCount;}
@@ -116,9 +116,9 @@
     std::string *mUniformNames;
     uint32_t *mUniformArraySizes;
 
-    std::vector<std::string> mTextureNames;
+    android::Vector<android::String8> mTextureNames;
 
-    std::vector<StateBasedKey*> mStateBasedShaders;
+    android::Vector<StateBasedKey*> mStateBasedShaders;
 
     int32_t mTextureUniformIndexStart;
 
@@ -133,3 +133,7 @@
 };
 
 #endif //ANDROID_RSD_SHADER_H
+
+
+
+
diff --git a/driver/rsdShaderCache.cpp b/driver/rsdShaderCache.cpp
index 608922c..c6052e2 100644
--- a/driver/rsdShaderCache.cpp
+++ b/driver/rsdShaderCache.cpp
@@ -29,7 +29,7 @@
 
 
 RsdShaderCache::RsdShaderCache() {
-    mEntries.reserve(16);
+    mEntries.setCapacity(16);
     mVertexDirty = true;
     mFragmentDirty = true;
 }
@@ -139,7 +139,7 @@
     ProgramEntry *e = new ProgramEntry(vtx->getAttribCount(),
                                        vtx->getUniformCount(),
                                        frag->getUniformCount());
-    mEntries.push_back(e);
+    mEntries.push(e);
     mCurrent = e;
     e->vtx = vID;
     e->frag = fID;
@@ -237,7 +237,7 @@
     return true;
 }
 
-int32_t RsdShaderCache::vtxAttribSlot(const std::string &attrName) const {
+int32_t RsdShaderCache::vtxAttribSlot(const String8 &attrName) const {
     for (uint32_t ct=0; ct < mCurrent->vtxAttrCount; ct++) {
         if (attrName == mCurrent->vtxAttrs[ct].name) {
             return mCurrent->vtxAttrs[ct].slot;
@@ -247,45 +247,46 @@
 }
 
 void RsdShaderCache::cleanupVertex(RsdShader *s) {
+    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
+        for (int32_t ct = 0; ct < numEntries; ct ++) {
+            if (mEntries[ct]->vtx == id) {
+                glDeleteProgram(mEntries[ct]->program);
 
-        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
-            if ((*entry)->vtx == id) {
-                glDeleteProgram((*entry)->program);
-
-                delete *entry;
-                entry = mEntries.erase(entry);
-            } else {
-                entry++;
+                delete mEntries[ct];
+                mEntries.removeAt(ct);
+                numEntries = (int32_t)mEntries.size();
+                ct --;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupFragment(RsdShader *s) {
+    int32_t numEntries = (int32_t)mEntries.size();
     uint32_t numShaderIDs = s->getStateBasedIDCount();
     for (uint32_t sId = 0; sId < numShaderIDs; sId ++) {
         uint32_t id = s->getStateBasedID(sId);
+        for (int32_t ct = 0; ct < numEntries; ct ++) {
+            if (mEntries[ct]->frag == id) {
+                glDeleteProgram(mEntries[ct]->program);
 
-        for (auto entry = mEntries.begin(); entry != mEntries.end();) {
-            if ((*entry)->frag == id) {
-                glDeleteProgram((*entry)->program);
-
-                delete *entry;
-                entry = mEntries.erase(entry);
-            } else {
-                entry++;
+                delete mEntries[ct];
+                mEntries.removeAt(ct);
+                numEntries = (int32_t)mEntries.size();
+                ct --;
             }
         }
     }
 }
 
 void RsdShaderCache::cleanupAll() {
-    for (auto entry : mEntries) {
-        glDeleteProgram(entry->program);
-        delete entry;
+    for (uint32_t ct=0; ct < mEntries.size(); ct++) {
+        glDeleteProgram(mEntries[ct]->program);
+        delete mEntries[ct];  // entries are created with `new ProgramEntry`; release with delete, not free()
     }
     mEntries.clear();
 }
+
diff --git a/driver/rsdShaderCache.h b/driver/rsdShaderCache.h
index de195e6..29f91bb 100644
--- a/driver/rsdShaderCache.h
+++ b/driver/rsdShaderCache.h
@@ -17,9 +17,6 @@
 #ifndef ANDROID_RSD_SHADER_CACHE_H
 #define ANDROID_RSD_SHADER_CACHE_H
 
-#include <string>
-#include <vector>
-
 namespace android {
 namespace renderscript {
 
@@ -28,7 +25,11 @@
 }
 }
 
-#if defined(RS_SERVER) || defined(RS_COMPATIBILITY_LIB)
+
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+#include <utils/String8.h>
+#include <utils/Vector.h>
+#else
 #include "rsUtils.h"
 #endif
 class RsdShader;
@@ -58,7 +59,7 @@
 
     void cleanupAll();
 
-    int32_t vtxAttribSlot(const std::string &attrName) const;
+    int32_t vtxAttribSlot(const android::String8 &attrName) const;
     int32_t vtxUniformSlot(uint32_t a) const {return mCurrent->vtxUniforms[a].slot;}
     uint32_t vtxUniformSize(uint32_t a) const {return mCurrent->vtxUniforms[a].arraySize;}
     int32_t fragUniformSlot(uint32_t a) const {return mCurrent->fragUniforms[a].slot;}
@@ -143,7 +144,7 @@
         UniformData *fragUniforms;
         bool *fragUniformIsSTO;
     };
-    std::vector<ProgramEntry*> mEntries;
+    android::Vector<ProgramEntry*> mEntries;
     ProgramEntry *mCurrent;
 
     bool hasArrayUniforms(RsdShader *vtx, RsdShader *frag);
@@ -156,3 +157,7 @@
 
 
 #endif //ANDROID_RSD_SHADER_CACHE_H
+
+
+
+
diff --git a/driver/rsdVertexArray.cpp b/driver/rsdVertexArray.cpp
index c18a062..64fc3aa 100644
--- a/driver/rsdVertexArray.cpp
+++ b/driver/rsdVertexArray.cpp
@@ -48,7 +48,7 @@
     stride = 0;
     ptr = nullptr;
     normalized = false;
-    name = "";
+    name.setTo("");
 }
 
 void RsdVertexArray::Attrib::set(uint32_t type, uint32_t size, uint32_t stride,
@@ -60,7 +60,7 @@
     this->offset = offset;
     this->normalized = normalized;
     this->stride = stride;
-    this->name = name;
+    this->name.setTo(name);
 }
 
 void RsdVertexArray::logAttrib(uint32_t idx, uint32_t slot) const {
@@ -69,7 +69,7 @@
     }
     ALOGV("va %i: slot=%i name=%s buf=%i ptr=%p size=%i  type=0x%x  stride=0x%x  norm=%i  offset=0x%p",
           idx, slot,
-          mAttribs[idx].name.c_str(),
+          mAttribs[idx].name.string(),
           mAttribs[idx].buffer,
           mAttribs[idx].ptr,
           mAttribs[idx].size,
@@ -135,3 +135,4 @@
         mAttrsEnabled[ct] = false;
     }
 }
+
diff --git a/driver/rsdVertexArray.h b/driver/rsdVertexArray.h
index 1bafe3b..975121b 100644
--- a/driver/rsdVertexArray.h
+++ b/driver/rsdVertexArray.h
@@ -17,8 +17,6 @@
 #ifndef ANDROID_RSD_VERTEX_ARRAY_H
 #define ANDROID_RSD_VERTEX_ARRAY_H
 
-#include <string>
-
 #include "rsUtils.h"
 
 namespace android {
@@ -41,7 +39,7 @@
         uint32_t size;
         uint32_t stride;
         bool normalized;
-        std::string name;
+        android::String8 name;
 
         Attrib();
         void clear();
@@ -76,3 +74,6 @@
 
 
 #endif //ANDROID_RSD_VERTEX_ARRAY_H
+
+
+
diff --git a/rs.spec b/rs.spec
index 841b89f..8054faa 100644
--- a/rs.spec
+++ b/rs.spec
@@ -174,6 +174,16 @@
     param size_t comp_offset
     }
 
+AllocationElementData {
+    param RsAllocation va
+    param uint32_t x
+    param uint32_t y
+    param uint32_t z
+    param uint32_t lod
+    param const void *data
+    param size_t comp_offset
+    }
+
 Allocation2DData {
     param RsAllocation va
     param uint32_t xoff
@@ -216,6 +226,16 @@
     param void *data
     }
 
+AllocationElementRead {
+    param RsAllocation va
+    param uint32_t x
+    param uint32_t y
+    param uint32_t z
+    param uint32_t lod
+    param void *data
+    param size_t comp_offset
+    }
+
 Allocation2DRead {
     param RsAllocation va
     param uint32_t xoff
@@ -228,6 +248,18 @@
     param size_t stride
 }
 
+Allocation3DRead {
+    param RsAllocation va
+    param uint32_t xoff
+    param uint32_t yoff
+    param uint32_t zoff
+    param uint32_t lod
+    param uint32_t w
+    param uint32_t h
+    param uint32_t d
+    param void *data
+    param size_t stride
+    }
 
 AllocationSyncAll {
     param RsAllocation va
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 5712a17..1d3daaf 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -256,20 +256,30 @@
 
 }
 
-void Allocation::elementData(Context *rsc, uint32_t x, const void *data,
-                                uint32_t cIdx, size_t sizeBytes) {
+void Allocation::elementData(Context *rsc, uint32_t x, uint32_t y, uint32_t z,
+                             const void *data, uint32_t cIdx, size_t sizeBytes) {
     size_t eSize = mHal.state.elementSizeBytes;
 
-    if (cIdx >= mHal.state.type->getElement()->getFieldCount()) {
-        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData component out of range.");
-        return;
-    }
-
     if (x >= mHal.drvState.lod[0].dimX) {
         rsc->setError(RS_ERROR_BAD_VALUE, "subElementData X offset out of range.");
         return;
     }
 
+    if (y > 0 && y >= mHal.drvState.lod[0].dimY) {
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData Y offset out of range.");
+        return;
+    }
+
+    if (z > 0 && z >= mHal.drvState.lod[0].dimZ) {
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData Z offset out of range.");
+        return;
+    }
+
+    if (cIdx >= mHal.state.type->getElement()->getFieldCount()) {
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData component out of range.");
+        return;
+    }
+
     const Element * e = mHal.state.type->getElement()->getField(cIdx);
     uint32_t elemArraySize = mHal.state.type->getElement()->getFieldArraySize(cIdx);
     if (sizeBytes != e->getSizeBytes() * elemArraySize) {
@@ -277,12 +287,12 @@
         return;
     }
 
-    rsc->mHal.funcs.allocation.elementData1D(rsc, this, x, data, cIdx, sizeBytes);
+    rsc->mHal.funcs.allocation.elementData(rsc, this, x, y, z, data, cIdx, sizeBytes);
     sendDirty(rsc);
 }
 
-void Allocation::elementData(Context *rsc, uint32_t x, uint32_t y,
-                                const void *data, uint32_t cIdx, size_t sizeBytes) {
+void Allocation::elementRead(Context *rsc, uint32_t x, uint32_t y, uint32_t z,
+                             void *data, uint32_t cIdx, size_t sizeBytes) {
     size_t eSize = mHal.state.elementSizeBytes;
 
     if (x >= mHal.drvState.lod[0].dimX) {
@@ -290,41 +300,39 @@
         return;
     }
 
-    if (y >= mHal.drvState.lod[0].dimY) {
-        rsc->setError(RS_ERROR_BAD_VALUE,
-                      "subElementData X offset out of range.");
+    if (y > 0 && y >= mHal.drvState.lod[0].dimY) {
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData Y offset out of range.");
+        return;
+    }
+
+    if (z > 0 && z >= mHal.drvState.lod[0].dimZ) {
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData Z offset out of range.");
         return;
     }
 
     if (cIdx >= mHal.state.type->getElement()->getFieldCount()) {
-        rsc->setError(RS_ERROR_BAD_VALUE,
-                      "subElementData component out of range.");
+        rsc->setError(RS_ERROR_BAD_VALUE, "subElementData component out of range.");
         return;
     }
 
     const Element * e = mHal.state.type->getElement()->getField(cIdx);
-    uint32_t elemArraySize =
-        mHal.state.type->getElement()->getFieldArraySize(cIdx);
+    uint32_t elemArraySize = mHal.state.type->getElement()->getFieldArraySize(cIdx);
     if (sizeBytes != e->getSizeBytes() * elemArraySize) {
         rsc->setError(RS_ERROR_BAD_VALUE, "subElementData bad size.");
         return;
     }
 
-    rsc->mHal.funcs.allocation.elementData2D(rsc, this, x, y, data, cIdx,
-                                             sizeBytes);
-    sendDirty(rsc);
+    rsc->mHal.funcs.allocation.elementRead(rsc, this, x, y, z, data, cIdx, sizeBytes);
 }
 
 void Allocation::addProgramToDirty(const Program *p) {
-    mToDirtyList.push_back(p);
+    mToDirtyList.push(p);
 }
 
 void Allocation::removeProgramToDirty(const Program *p) {
-    for (auto entryIter = mToDirtyList.begin(), endIter = mToDirtyList.end();
-         entryIter != endIter; entryIter++) {
-
-        if (p == *entryIter) {
-            mToDirtyList.erase(entryIter);
+    for (size_t ct=0; ct < mToDirtyList.size(); ct++) {
+        if (mToDirtyList[ct] == p) {
+            mToDirtyList.removeAt(ct);
             return;
         }
     }
@@ -342,8 +350,7 @@
         }
     }
     ALOGV("%s allocation ptr=%p  mUsageFlags=0x04%x, mMipmapControl=0x%04x",
-          prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags,
-          mHal.state.mipmapControl);
+         prefix, mHal.drvState.lod[0].mallocPtr, mHal.state.usageFlags, mHal.state.mipmapControl);
 }
 
 uint32_t Allocation::getPackedSize() const {
@@ -660,16 +667,16 @@
     a->data(rsc, xoff, lod, count, data, sizeBytes);
 }
 
-void rsi_Allocation2DElementData(Context *rsc, RsAllocation va, uint32_t x, uint32_t y, uint32_t lod, RsAllocationCubemapFace face,
-                                 const void *data, size_t sizeBytes, size_t eoff) {
+void rsi_Allocation1DElementData(Context *rsc, RsAllocation va, uint32_t x,
+                                 uint32_t lod, const void *data, size_t sizeBytes, size_t eoff) {
     Allocation *a = static_cast<Allocation *>(va);
-    a->elementData(rsc, x, y, data, eoff, sizeBytes);
+    a->elementData(rsc, x, 0, 0, data, eoff, sizeBytes);
 }
 
-void rsi_Allocation1DElementData(Context *rsc, RsAllocation va, uint32_t x, uint32_t lod,
-                                 const void *data, size_t sizeBytes, size_t eoff) {
+void rsi_AllocationElementData(Context *rsc, RsAllocation va, uint32_t x, uint32_t y, uint32_t z,
+                               uint32_t lod, const void *data, size_t sizeBytes, size_t eoff) {
     Allocation *a = static_cast<Allocation *>(va);
-    a->elementData(rsc, x, data, eoff, sizeBytes);
+    a->elementData(rsc, x, y, z, data, eoff, sizeBytes);
 }
 
 void rsi_Allocation2DData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff, uint32_t lod, RsAllocationCubemapFace face,
@@ -688,7 +695,10 @@
 void rsi_AllocationRead(Context *rsc, RsAllocation va, void *data, size_t sizeBytes) {
     Allocation *a = static_cast<Allocation *>(va);
     const Type * t = a->getType();
-    if(t->getDimY()) {
+    if(t->getDimZ()) {
+        a->read(rsc, 0, 0, 0, 0, t->getDimX(), t->getDimY(), t->getDimZ(),
+                data, sizeBytes, 0);
+    } else if(t->getDimY()) {
         a->read(rsc, 0, 0, 0, RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
                 t->getDimX(), t->getDimY(), data, sizeBytes, 0);
     } else {
@@ -847,6 +857,12 @@
     rsc->mHal.funcs.allocation.read1D(rsc, a, xoff, lod, count, data, sizeBytes);
 }
 
+void rsi_AllocationElementRead(Context *rsc, RsAllocation va, uint32_t x, uint32_t y, uint32_t z,
+                                 uint32_t lod, void *data, size_t sizeBytes, size_t eoff) {
+    Allocation *a = static_cast<Allocation *>(va);
+    a->elementRead(rsc, x, y, z, data, eoff, sizeBytes);
+}
+
 void rsi_Allocation2DRead(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff,
                           uint32_t lod, RsAllocationCubemapFace face, uint32_t w,
                           uint32_t h, void *data, size_t sizeBytes, size_t stride) {
@@ -854,6 +870,14 @@
     a->read(rsc, xoff, yoff, lod, face, w, h, data, sizeBytes, stride);
 }
 
+void rsi_Allocation3DRead(Context *rsc, RsAllocation va,
+                          uint32_t xoff, uint32_t yoff, uint32_t zoff,
+                          uint32_t lod, uint32_t w, uint32_t h, uint32_t d,
+                          void *data, size_t sizeBytes, size_t stride) {
+    Allocation *a = static_cast<Allocation *>(va);
+    a->read(rsc, xoff, yoff, zoff, lod, w, h, d, data, sizeBytes, stride);
+}
+
 RsAllocation rsi_AllocationAdapterCreate(Context *rsc, RsType vwindow, RsAllocation vbase) {
 
 
diff --git a/rsAllocation.h b/rsAllocation.h
index 3714852..a06e28d 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -138,10 +138,11 @@
     void read(Context *rsc, uint32_t xoff, uint32_t yoff, uint32_t zoff, uint32_t lod,
               uint32_t w, uint32_t h, uint32_t d, void *data, size_t sizeBytes, size_t stride);
 
-    void elementData(Context *rsc, uint32_t x,
+    void elementData(Context *rsc, uint32_t x, uint32_t y, uint32_t z,
                      const void *data, uint32_t elementOff, size_t sizeBytes);
-    void elementData(Context *rsc, uint32_t x, uint32_t y,
-                     const void *data, uint32_t elementOff, size_t sizeBytes);
+
+    void elementRead(Context *rsc, uint32_t x, uint32_t y, uint32_t z,
+                     void *data, uint32_t elementOff, size_t sizeBytes);
 
     void addProgramToDirty(const Program *);
     void removeProgramToDirty(const Program *);
@@ -200,7 +201,7 @@
     bool hasSameDims(const Allocation *Other) const;
 
 protected:
-    std::vector<const Program *> mToDirtyList;
+    Vector<const Program *> mToDirtyList;
     ObjectBaseRef<const Type> mType;
     void setType(const Type *t) {
         mType.set(t);
diff --git a/rsContext.cpp b/rsContext.cpp
index 7243118..5818b66 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -762,15 +762,13 @@
 void Context::assignName(ObjectBase *obj, const char *name, uint32_t len) {
     rsAssert(!obj->getName());
     obj->setName(name, len);
-    mNames.push_back(obj);
+    mNames.add(obj);
 }
 
 void Context::removeName(ObjectBase *obj) {
-    for (auto nameIter = mNames.begin(), endIter = mNames.end();
-         nameIter != endIter; nameIter++) {
-
-        if (obj == *nameIter) {
-            mNames.erase(nameIter);
+    for (size_t ct=0; ct < mNames.size(); ct++) {
+        if (obj == mNames[ct]) {
+            mNames.removeAt(ct);
             return;
         }
     }
@@ -1012,3 +1010,4 @@
     ObjectBase *ob = static_cast<ObjectBase *>(obj);
     (*name) = ob->getName();
 }
+
diff --git a/rsContext.h b/rsContext.h
index c3fc115..3084a9a 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -307,7 +307,7 @@
     bool mHasSurface;
     bool mIsContextLite;
 
-    std::vector<ObjectBase *> mNames;
+    Vector<ObjectBase *> mNames;
 
     uint64_t mTimers[_RS_TIMER_TOTAL];
     Timers mTimerActive;
diff --git a/rsCppUtils.cpp b/rsCppUtils.cpp
index 79cb4d3..c9a19c2 100644
--- a/rsCppUtils.cpp
+++ b/rsCppUtils.cpp
@@ -17,6 +17,8 @@
 #include "rsUtils.h"
 #include "rsCppUtils.h"
 
+#include <string>
+
 #include <string.h>
 
 namespace android {
@@ -33,6 +35,16 @@
     return n;
 }
 
+const char* rsuJoinStrings(int n, const char* const* strs) {
+    std::string tmp;
+    for (int i = 0; i < n; i++) {
+        if (i > 0) {
+            tmp.append(" ");
+        }
+        tmp.append(strs[i]);
+    }
+    return strndup(tmp.c_str(), tmp.size());
+}
 
 }
 }
diff --git a/rsCppUtils.h b/rsCppUtils.h
index 8b49056..cc6d6cf 100644
--- a/rsCppUtils.h
+++ b/rsCppUtils.h
@@ -19,6 +19,8 @@
 
 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
 #include <utils/Log.h>
+#include <utils/String8.h>
+#include <utils/Vector.h>
 #include <cutils/atomic.h>
 #endif
 
@@ -52,6 +54,96 @@
 #define ALOGV(...) \
     __android_log_print(ANDROID_LOG_VERBOSE, LOG_TAG, __VA_ARGS__);
 
+namespace android {
+
+    // server has no Vector or String8 classes; implement on top of STL
+    class String8: public std::string {
+    public:
+    String8(const char *ptr) : std::string(ptr) {
+
+        }
+    String8(const char *ptr, size_t len) : std::string(ptr, len) {
+
+        }
+    String8() : std::string() {
+
+        }
+
+        const char* string() const {
+            return this->c_str();
+        }
+
+        void setTo(const char* str, ssize_t len) {
+            this->assign(str, len);
+        }
+        void setTo(const char* str) {
+            this->assign(str);
+        }
+        String8 getPathDir(void) const {
+            const char* cp;
+            const char*const str = this->c_str();
+
+            cp = strrchr(str, OS_PATH_SEPARATOR);
+            if (cp == NULL)
+                return String8("");
+            else
+                return String8(str, cp - str);
+        }
+    };
+
+    template <class T> class Vector: public std::vector<T> {  // android::Vector-style API on top of std::vector
+    public:
+        void push(T obj) {
+            this->push_back(obj);
+        }
+        void removeAt(uint32_t index) {
+            this->erase(this->begin() + index);
+        }
+        ssize_t add(const T& obj) {  // returns the index of the appended item
+            this->push_back(obj);
+            return this->size() - 1;
+        }
+        void setCapacity(ssize_t capacity) {
+            this->reserve(capacity);  // pre-allocate only; resize() would change size() and append default items
+        }
+        // Raw access to the contiguous storage; data() avoids casting an iterator to a pointer.
+        T* editArray() {
+            return this->data();
+        }
+
+        const T* array() {
+            return this->data();
+        }
+
+    };
+
+    template<> class Vector<bool>: public std::vector<char> {  // char-backed so elements are addressable
+    public:
+        void push(bool obj) {
+            this->push_back(obj);
+        }
+        void removeAt(uint32_t index) {
+            this->erase(this->begin() + index);
+        }
+        ssize_t add(const bool& obj) {  // returns the index of the appended item
+            this->push_back(obj);
+            return this->size() - 1;
+        }
+        void setCapacity(ssize_t capacity) {
+            this->reserve(capacity);  // pre-allocate only; resize() would change size() and append zero items
+        }
+        // Raw views of the char storage as bool; NOTE(review): assumes sizeof(bool) == sizeof(char).
+        bool* editArray() {
+            return (bool*)(this->data());
+        }
+
+        const bool* array() {
+            return (const bool*)(this->data());
+        }
+    };
+
+}
+
 typedef int64_t nsecs_t;  // nano-seconds
 
 enum {
@@ -88,7 +180,6 @@
     return secs/1000000;
 }
 
-
 #endif // RS_SERVER || RS_COMPATIBILITY_LIB
 
 namespace android {
@@ -190,7 +281,11 @@
     return (r >> 2) | ((g >> 2) << 8) | ((b >> 2) << 16) | ((a >> 2) << 24);
 }
 
+const char* rsuJoinStrings(int n, const char* const* strs);
+
 }
 }
 
 #endif //ANDROID_RS_OBJECT_BASE_H
+
+
diff --git a/rsDefines.h b/rsDefines.h
index 69a62d6..4ccdeb8 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -254,7 +254,209 @@
     RS_CONTEXT_LOW_POWER        = 0x0004
 };
 
+enum RsBlasTranspose {
+    RsBlasNoTrans=111,
+    RsBlasTrans=112,
+    RsBlasConjTrans=113
+};
 
+enum RsBlasUplo {
+    RsBlasUpper=121,
+    RsBlasLower=122
+};
+
+enum RsBlasDiag {
+    RsBlasNonUnit=131,
+    RsBlasUnit=132
+};
+
+enum RsBlasSide {
+    RsBlasLeft=141,
+    RsBlasRight=142
+};
+
+enum RsBlasFunction {
+    RsBlas_nop = 0,
+    RsBlas_sdsdot,
+    RsBlas_dsdot,
+    RsBlas_sdot,
+    RsBlas_ddot,
+    RsBlas_cdotu_sub,
+    RsBlas_cdotc_sub,
+    RsBlas_zdotu_sub,
+    RsBlas_zdotc_sub,
+    RsBlas_snrm2,
+    RsBlas_sasum,
+    RsBlas_dnrm2,
+    RsBlas_dasum,
+    RsBlas_scnrm2,
+    RsBlas_scasum,
+    RsBlas_dznrm2,
+    RsBlas_dzasum,
+    RsBlas_isamax,
+    RsBlas_idamax,
+    RsBlas_icamax,
+    RsBlas_izamax,
+    RsBlas_sswap,
+    RsBlas_scopy,
+    RsBlas_saxpy,
+    RsBlas_dswap,
+    RsBlas_dcopy,
+    RsBlas_daxpy,
+    RsBlas_cswap,
+    RsBlas_ccopy,
+    RsBlas_caxpy,
+    RsBlas_zswap,
+    RsBlas_zcopy,
+    RsBlas_zaxpy,
+    RsBlas_srotg,
+    RsBlas_srotmg,
+    RsBlas_srot,
+    RsBlas_srotm,
+    RsBlas_drotg,
+    RsBlas_drotmg,
+    RsBlas_drot,
+    RsBlas_drotm,
+    RsBlas_sscal,
+    RsBlas_dscal,
+    RsBlas_cscal,
+    RsBlas_zscal,
+    RsBlas_csscal,
+    RsBlas_zdscal,
+    RsBlas_sgemv,
+    RsBlas_sgbmv,
+    RsBlas_strmv,
+    RsBlas_stbmv,
+    RsBlas_stpmv,
+    RsBlas_strsv,
+    RsBlas_stbsv,
+    RsBlas_stpsv,
+    RsBlas_dgemv,
+    RsBlas_dgbmv,
+    RsBlas_dtrmv,
+    RsBlas_dtbmv,
+    RsBlas_dtpmv,
+    RsBlas_dtrsv,
+    RsBlas_dtbsv,
+    RsBlas_dtpsv,
+    RsBlas_cgemv,
+    RsBlas_cgbmv,
+    RsBlas_ctrmv,
+    RsBlas_ctbmv,
+    RsBlas_ctpmv,
+    RsBlas_ctrsv,
+    RsBlas_ctbsv,
+    RsBlas_ctpsv,
+    RsBlas_zgemv,
+    RsBlas_zgbmv,
+    RsBlas_ztrmv,
+    RsBlas_ztbmv,
+    RsBlas_ztpmv,
+    RsBlas_ztrsv,
+    RsBlas_ztbsv,
+    RsBlas_ztpsv,
+    RsBlas_ssymv,
+    RsBlas_ssbmv,
+    RsBlas_sspmv,
+    RsBlas_sger,
+    RsBlas_ssyr,
+    RsBlas_sspr,
+    RsBlas_ssyr2,
+    RsBlas_sspr2,
+    RsBlas_dsymv,
+    RsBlas_dsbmv,
+    RsBlas_dspmv,
+    RsBlas_dger,
+    RsBlas_dsyr,
+    RsBlas_dspr,
+    RsBlas_dsyr2,
+    RsBlas_dspr2,
+    RsBlas_chemv,
+    RsBlas_chbmv,
+    RsBlas_chpmv,
+    RsBlas_cgeru,
+    RsBlas_cgerc,
+    RsBlas_cher,
+    RsBlas_chpr,
+    RsBlas_cher2,
+    RsBlas_chpr2,
+    RsBlas_zhemv,
+    RsBlas_zhbmv,
+    RsBlas_zhpmv,
+    RsBlas_zgeru,
+    RsBlas_zgerc,
+    RsBlas_zher,
+    RsBlas_zhpr,
+    RsBlas_zher2,
+    RsBlas_zhpr2,
+    RsBlas_sgemm,
+    RsBlas_ssymm,
+    RsBlas_ssyrk,
+    RsBlas_ssyr2k,
+    RsBlas_strmm,
+    RsBlas_strsm,
+    RsBlas_dgemm,
+    RsBlas_dsymm,
+    RsBlas_dsyrk,
+    RsBlas_dsyr2k,
+    RsBlas_dtrmm,
+    RsBlas_dtrsm,
+    RsBlas_cgemm,
+    RsBlas_csymm,
+    RsBlas_csyrk,
+    RsBlas_csyr2k,
+    RsBlas_ctrmm,
+    RsBlas_ctrsm,
+    RsBlas_zgemm,
+    RsBlas_zsymm,
+    RsBlas_zsyrk,
+    RsBlas_zsyr2k,
+    RsBlas_ztrmm,
+    RsBlas_ztrsm,
+    RsBlas_chemm,
+    RsBlas_cherk,
+    RsBlas_cher2k,
+    RsBlas_zhemm,
+    RsBlas_zherk,
+    RsBlas_zher2k
+};
+
+// custom complex types because of NDK support
+typedef struct {
+    float r;
+    float i;
+} RsFloatComplex;
+
+typedef struct {
+    double r;
+    double i;
+} RsDoubleComplex;
+
+typedef union { 
+    float f;
+    RsFloatComplex c;
+    double d;
+    RsDoubleComplex z;
+} RsBlasScalar;
+
+typedef struct {
+    RsBlasFunction func;
+    RsBlasTranspose transA;
+    RsBlasTranspose transB;
+    RsBlasUplo uplo;
+    RsBlasDiag diag;
+    RsBlasSide side;
+    int M;
+    int N;
+    int K;
+    RsBlasScalar alpha;
+    RsBlasScalar beta;
+    int incX;
+    int incY;
+    int KL;
+    int KU;
+} RsBlasCall;
+          
 #ifdef __cplusplus
 };
 #endif
diff --git a/rsDevice.cpp b/rsDevice.cpp
index 1ba005a..2688890 100644
--- a/rsDevice.cpp
+++ b/rsDevice.cpp
@@ -28,16 +28,14 @@
 }
 
 void Device::addContext(Context *rsc) {
-    mContexts.push_back(rsc);
+    mContexts.push(rsc);
 }
 
 void Device::removeContext(Context *rsc) {
-    for (auto ctxIter = mContexts.begin(), endIter = mContexts.end();
-         ctxIter != endIter; ctxIter++) {
-
-        if (rsc == *ctxIter) {
-            mContexts.erase(ctxIter);
-            return;
+    for (size_t idx=0; idx < mContexts.size(); idx++) {
+        if (mContexts[idx] == rsc) {
+            mContexts.removeAt(idx);
+            break;
         }
     }
 }
@@ -60,3 +58,4 @@
     }
     rsAssert(0);
 }
+
diff --git a/rsDevice.h b/rsDevice.h
index 5961336..ffb514b 100644
--- a/rsDevice.h
+++ b/rsDevice.h
@@ -17,8 +17,6 @@
 #ifndef ANDROID_RS_DEVICE_H
 #define ANDROID_RS_DEVICE_H
 
-#include <vector>
-
 #include "rsUtils.h"
 
 // ---------------------------------------------------------------------------
@@ -38,7 +36,7 @@
     bool mForceSW;
 
 protected:
-    std::vector<Context *> mContexts;
+    Vector<Context *> mContexts;
 };
 
 }
diff --git a/rsElement.cpp b/rsElement.cpp
index 907e3d2..a734400 100644
--- a/rsElement.cpp
+++ b/rsElement.cpp
@@ -42,14 +42,10 @@
 }
 
 void Element::preDestroy() const {
-    auto &elements = mRSC->mStateElement.mElements;
-
-    for (auto elIter = elements.begin(), endIter = elements.end();
-         elIter != endIter; elIter++) {
-
-        if (this == *elIter) {
-            elements.erase(elIter);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateElement.mElements.size(); ct++) {  // scan the context's element list
+        if (mRSC->mStateElement.mElements[ct] == this) {
+            mRSC->mStateElement.mElements.removeAt(ct);  // unregister this element
+            break;  // only the first matching entry is removed
         }
     }
 }
@@ -268,7 +264,7 @@
 
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push_back(e);
+    rsc->mStateElement.mElements.push(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -343,7 +339,7 @@
     e->compute();
 
     ObjectBase::asyncLock();
-    rsc->mStateElement.mElements.push_back(e);
+    rsc->mStateElement.mElements.push(e);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsElement.h b/rsElement.h
index ccf69eb..9374c64 100644
--- a/rsElement.h
+++ b/rsElement.h
@@ -17,8 +17,6 @@
 #ifndef ANDROID_STRUCTURED_ELEMENT_H
 #define ANDROID_STRUCTURED_ELEMENT_H
 
-#include <vector>
-
 #include "rsComponent.h"
 #include "rsUtils.h"
 #include "rsInternalDefines.h"
@@ -172,7 +170,7 @@
     ~ElementState();
 
     // Cache of all existing elements.
-    std::vector<Element *> mElements;
+    Vector<Element *> mElements;
 };
 
 
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index f85fffe..a81d0f9 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -87,7 +87,7 @@
             entry->mLength = headerStream->loadU32();
         }
         entry->mRsObj = nullptr;
-        mIndex.push_back(entry);
+        mIndex.push(entry);
     }
 }
 
@@ -385,7 +385,7 @@
     indexEntry->mType = obj->getClassId();
     indexEntry->mOffset = mWriteStream->getPos();
     indexEntry->mRsObj = obj;
-    mWriteIndex.push_back(indexEntry);
+    mWriteIndex.push(indexEntry);
     obj->serialize(con, mWriteStream);
     indexEntry->mLength = mWriteStream->getPos() - indexEntry->mOffset;
     mWriteStream->align(4);
diff --git a/rsFileA3D.h b/rsFileA3D.h
index 0c8b3d6..8bf36b9 100644
--- a/rsFileA3D.h
+++ b/rsFileA3D.h
@@ -88,13 +88,15 @@
     Asset *mAsset;
 
     OStream *mWriteStream;
-    std::vector<A3DIndexEntry*> mWriteIndex;
+    Vector<A3DIndexEntry*> mWriteIndex;
 
     IStream *mReadStream;
-    std::vector<A3DIndexEntry*> mIndex;
+    Vector<A3DIndexEntry*> mIndex;
 };
 
 
 }
 }
 #endif //ANDROID_RS_FILE_A3D_H
+
+
diff --git a/rsFont.cpp b/rsFont.cpp
index 8f39ca9..8b38fde 100644
--- a/rsFont.cpp
+++ b/rsFont.cpp
@@ -33,7 +33,7 @@
 using namespace android;
 using namespace android::renderscript;
 
-Font::Font(Context *rsc) : ObjectBase(rsc) {
+Font::Font(Context *rsc) : ObjectBase(rsc), mCachedGlyphs(NULL) {
     mInitialized = false;
     mHasKerning = false;
     mFace = nullptr;
@@ -76,21 +76,17 @@
 }
 
 void Font::preDestroy() const {
-    auto &activeFonts = mRSC->mStateFont.mActiveFonts;
-
-    for (auto font = activeFonts.begin(), end = activeFonts.end(); font != end;
-         font++) {
-
-        if (this == *font) {
-            activeFonts.erase(font);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateFont.mActiveFonts.size(); ct++) {  // scan the context's active-font list
+        if (mRSC->mStateFont.mActiveFonts[ct] == this) {
+            mRSC->mStateFont.mActiveFonts.removeAt(ct);  // unregister this font
+            break;  // only the first matching entry is removed
         }
     }
 }
 
 void Font::invalidateTextureCache() {
     for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        mCachedGlyphs[i]->mIsValid = false;
+        mCachedGlyphs.valueAt(i)->mIsValid = false;  // flag every cached glyph entry as invalid
     }
 }
 
@@ -228,7 +224,7 @@
 
 Font::CachedGlyphInfo* Font::getCachedUTFChar(int32_t utfChar) {
 
-    CachedGlyphInfo *cachedGlyph = mCachedGlyphs[(uint32_t)utfChar];
+    CachedGlyphInfo *cachedGlyph = mCachedGlyphs.valueFor((uint32_t)utfChar);
     if (cachedGlyph == nullptr) {
         cachedGlyph = cacheGlyph((uint32_t)utfChar);
     }
@@ -287,7 +283,7 @@
 
 Font::CachedGlyphInfo *Font::cacheGlyph(uint32_t glyph) {
     CachedGlyphInfo *newGlyph = new CachedGlyphInfo();
-    mCachedGlyphs[glyph] = newGlyph;
+    mCachedGlyphs.add(glyph, newGlyph);
 #ifndef ANDROID_RS_SERIALIZE
     newGlyph->mGlyphIndex = FT_Get_Char_Index(mFace, glyph);
     newGlyph->mIsValid = false;
@@ -300,14 +296,11 @@
 Font * Font::create(Context *rsc, const char *name, float fontSize, uint32_t dpi,
                     const void *data, uint32_t dataLen) {
     rsc->mStateFont.checkInit();
-    std::vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
+    Vector<Font*> &activeFonts = rsc->mStateFont.mActiveFonts;
 
     for (uint32_t i = 0; i < activeFonts.size(); i ++) {
         Font *ithFont = activeFonts[i];
-        if (ithFont->mFontName == name &&
-            ithFont->mFontSize == fontSize &&
-            ithFont->mDpi == dpi) {
-
+        if (ithFont->mFontName == name && ithFont->mFontSize == fontSize && ithFont->mDpi == dpi) {
             return ithFont;
         }
     }
@@ -315,7 +308,7 @@
     Font *newFont = new Font(rsc);
     bool isInitialized = newFont->init(name, fontSize, dpi, data, dataLen);
     if (isInitialized) {
-        activeFonts.push_back(newFont);
+        activeFonts.push(newFont);
         rsc->mStateFont.precacheLatin(newFont);
         return newFont;
     }
@@ -332,7 +325,7 @@
 #endif
 
     for (uint32_t i = 0; i < mCachedGlyphs.size(); i ++) {
-        CachedGlyphInfo *glyph = mCachedGlyphs[i];
+        CachedGlyphInfo *glyph = mCachedGlyphs.valueAt(i);
         delete glyph;
     }
 }
@@ -561,33 +554,25 @@
     mCacheBuffer = new uint8_t[mCacheWidth * mCacheHeight];
 
 
-    Allocation *cacheAlloc =
-        Allocation::createAllocation(mRSC, texType.get(),
-                                     RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
+    Allocation *cacheAlloc = Allocation::createAllocation(mRSC, texType.get(),
+                                RS_ALLOCATION_USAGE_GRAPHICS_TEXTURE);
     mTextTexture.set(cacheAlloc);
 
     // Split up our cache texture into lines of certain widths
     int32_t nextLine = 0;
-    mCacheLines.push_back(new CacheTextureLine(16, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(24, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(32, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(40, texType->getDimX(),
-                          nextLine, 0));
-    nextLine += mCacheLines.back()->mMaxHeight;
-    mCacheLines.push_back(new CacheTextureLine(texType->getDimY() - nextLine,
-                          texType->getDimX(), nextLine, 0));
+    mCacheLines.push(new CacheTextureLine(16, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(24, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(32, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(40, texType->getDimX(), nextLine, 0));
+    nextLine += mCacheLines.top()->mMaxHeight;
+    mCacheLines.push(new CacheTextureLine(texType->getDimY() - nextLine, texType->getDimX(), nextLine, 0));
 }
 
 // Avoid having to reallocate memory and render quad by quad
diff --git a/rsFont.h b/rsFont.h
index bc343c1..0f17340 100644
--- a/rsFont.h
+++ b/rsFont.h
@@ -17,10 +17,9 @@
 #ifndef ANDROID_RS_FONT_H
 #define ANDROID_RS_FONT_H
 
-#include <map>
-#include <vector>
-
 #include "rsStream.h"
+#include <utils/Vector.h>
+#include <utils/KeyedVector.h>
 
 struct FT_LibraryRec_;
 struct FT_FaceRec_;
@@ -125,7 +124,7 @@
     bool mInitialized;
     bool mHasKerning;
 
-    std::map<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
+    DefaultKeyedVector<uint32_t, CachedGlyphInfo* > mCachedGlyphs;
     CachedGlyphInfo* getCachedUTFChar(int32_t utfChar);
 
     CachedGlyphInfo *cacheGlyph(uint32_t glyph);
@@ -179,7 +178,7 @@
         bool fitBitmap(FT_Bitmap_ *bitmap, uint32_t *retOriginX, uint32_t *retOriginY);
     };
 
-    std::vector<CacheTextureLine*> mCacheLines;
+    Vector<CacheTextureLine*> mCacheLines;
     uint32_t getRemainingCacheCapacity();
 
     void precacheLatin(Font *font);
@@ -204,7 +203,7 @@
     FT_LibraryRec_ *mLibrary;
     FT_LibraryRec_ *getLib();
 #endif //ANDROID_RS_SERIALIZE
-    std::vector<Font*> mActiveFonts;
+    Vector<Font*> mActiveFonts;
 
     // Render state for the font
     ObjectBaseRef<Allocation> mFontShaderFConstant;
diff --git a/rsGrallocConsumer.h b/rsGrallocConsumer.h
index 527e734..3ffee28 100644
--- a/rsGrallocConsumer.h
+++ b/rsGrallocConsumer.h
@@ -17,12 +17,12 @@
 #ifndef ANDROID_RS_GRALLOC_CONSUMER_H
 #define ANDROID_RS_GRALLOC_CONSUMER_H
 
-#include <vector>
-
 #include <gui/ConsumerBase.h>
 
 #include <ui/GraphicBuffer.h>
 
+#include <utils/String8.h>
+#include <utils/Vector.h>
 #include <utils/threads.h>
 
 
@@ -75,3 +75,4 @@
 } // namespace android
 
 #endif // ANDROID_RS_GRALLOC_CONSUMER_H
+
diff --git a/rsInternalDefines.h b/rsInternalDefines.h
index 2a3f3fd..8a62e40 100644
--- a/rsInternalDefines.h
+++ b/rsInternalDefines.h
@@ -189,7 +189,7 @@
     RS_SCRIPT_INTRINSIC_ID_HISTOGRAM = 9,
     // unused 10, 11
     RS_SCRIPT_INTRINSIC_ID_RESIZE = 12,
-
+    RS_SCRIPT_INTRINSIC_ID_BLAS = 13,
     RS_SCRIPT_INTRINSIC_ID_OEM_START = 0x10000000
 };
 
diff --git a/rsProgramFragment.h b/rsProgramFragment.h
index 1357bfc..e7456b9 100644
--- a/rsProgramFragment.h
+++ b/rsProgramFragment.h
@@ -55,7 +55,7 @@
     void deinit(Context *rsc);
 
     ObjectBaseRef<ProgramFragment> mDefault;
-    std::vector<ProgramFragment *> mPrograms;
+    Vector<ProgramFragment *> mPrograms;
 
     ObjectBaseRef<ProgramFragment> mLast;
 };
@@ -63,3 +63,7 @@
 }
 }
 #endif
+
+
+
+
diff --git a/rsProgramRaster.cpp b/rsProgramRaster.cpp
index d47e588..c9a24bf 100644
--- a/rsProgramRaster.cpp
+++ b/rsProgramRaster.cpp
@@ -31,14 +31,10 @@
 }
 
 void ProgramRaster::preDestroy() const {
-    auto &rasters = mRSC->mStateRaster.mRasterPrograms;
-
-    for (auto prIter = rasters.begin(), endIter = rasters.end();
-         prIter != endIter; prIter++) {
-
-        if (this == *prIter) {
-            rasters.erase(prIter);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateRaster.mRasterPrograms.size(); ct++) {  // scan the context's raster-program list
+        if (mRSC->mStateRaster.mRasterPrograms[ct] == this) {
+            mRSC->mStateRaster.mRasterPrograms.removeAt(ct);  // unregister this program
+            break;  // only the first matching entry is removed
         }
     }
 }
@@ -98,7 +94,7 @@
     returnRef.set(pr);
 
     ObjectBase::asyncLock();
-    rsc->mStateRaster.mRasterPrograms.push_back(pr);
+    rsc->mStateRaster.mRasterPrograms.push(pr);
     ObjectBase::asyncUnlock();
 
     return returnRef;
@@ -115,3 +111,4 @@
 
 }
 }
+
diff --git a/rsProgramRaster.h b/rsProgramRaster.h
index 207d74c..e9a524b 100644
--- a/rsProgramRaster.h
+++ b/rsProgramRaster.h
@@ -75,10 +75,14 @@
     ObjectBaseRef<ProgramRaster> mLast;
 
     // Cache of all existing raster programs.
-    std::vector<ProgramRaster *> mRasterPrograms;
+    Vector<ProgramRaster *> mRasterPrograms;
 };
 
 
 }
 }
 #endif
+
+
+
+
diff --git a/rsProgramStore.cpp b/rsProgramStore.cpp
index b07f820..3ee75cc 100644
--- a/rsProgramStore.cpp
+++ b/rsProgramStore.cpp
@@ -42,14 +42,10 @@
 }
 
 void ProgramStore::preDestroy() const {
-    auto &stores = mRSC->mStateFragmentStore.mStorePrograms;
-
-    for (auto psIter = stores.begin(), endIter = stores.end();
-         psIter != endIter; psIter++) {
-
-        if (this == *psIter) {
-            stores.erase(psIter);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateFragmentStore.mStorePrograms.size(); ct++) {  // scan the context's store-program list
+        if (mRSC->mStateFragmentStore.mStorePrograms[ct] == this) {
+            mRSC->mStateFragmentStore.mStorePrograms.removeAt(ct);  // unregister this program
+            break;  // only the first matching entry is removed
         }
     }
 }
@@ -122,7 +118,7 @@
     pfs->init();
 
     ObjectBase::asyncLock();
-    rsc->mStateFragmentStore.mStorePrograms.push_back(pfs);
+    rsc->mStateFragmentStore.mStorePrograms.push(pfs);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsProgramStore.h b/rsProgramStore.h
index 06824fe..9a7f7f1 100644
--- a/rsProgramStore.h
+++ b/rsProgramStore.h
@@ -92,9 +92,12 @@
     ObjectBaseRef<ProgramStore> mLast;
 
     // Cache of all existing store programs.
-    std::vector<ProgramStore *> mStorePrograms;
+    Vector<ProgramStore *> mStorePrograms;
 };
 
 }
 }
 #endif
+
+
+
diff --git a/rsSampler.cpp b/rsSampler.cpp
index 924ba86..858658d 100644
--- a/rsSampler.cpp
+++ b/rsSampler.cpp
@@ -49,14 +49,10 @@
 }
 
 void Sampler::preDestroy() const {
-    auto &samplers = mRSC->mStateSampler.mAllSamplers;
-
-    for (auto sampleIter = samplers.begin(), endIter = samplers.end();
-         sampleIter != endIter; sampleIter++) {
-
-        if (this == *sampleIter) {
-            samplers.erase(sampleIter);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateSampler.mAllSamplers.size(); ct++) {  // scan the context's sampler list
+        if (mRSC->mStateSampler.mAllSamplers[ct] == this) {
+            mRSC->mStateSampler.mAllSamplers.removeAt(ct);  // unregister this sampler
+            break;  // only the first matching entry is removed
         }
     }
 }
@@ -117,7 +113,7 @@
 #endif
 
     ObjectBase::asyncLock();
-    rsc->mStateSampler.mAllSamplers.push_back(s);
+    rsc->mStateSampler.mAllSamplers.push(s);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsSampler.h b/rsSampler.h
index 3f5855f..2fdf707 100644
--- a/rsSampler.h
+++ b/rsSampler.h
@@ -96,9 +96,12 @@
         }
     }
     // Cache of all existing raster programs.
-    std::vector<Sampler *> mAllSamplers;
+    Vector<Sampler *> mAllSamplers;
 };
 
 }
 }
 #endif //ANDROID_RS_SAMPLER_H
+
+
+
diff --git a/rsScript.h b/rsScript.h
index 2212032..80bc622 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -18,6 +18,7 @@
 #define ANDROID_RS_SCRIPT_H
 
 #include "rsAllocation.h"
+#include "rsMap.h"
 
 #include <utility>
 
@@ -87,7 +88,7 @@
             size_t exportedPragmaCount;
             char const **exportedPragmaKeyList;
             char const **exportedPragmaValueList;
-            const std::pair<const char *, uint32_t> *exportedForeachFuncList;
+            const Pair<const char *, uint32_t> *exportedForeachFuncList;
 
             int (* root)();
         };
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index 71761f1..4b204d3 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <string>
-
 #include "rsContext.h"
 #include "rsScriptC.h"
 
@@ -31,6 +29,8 @@
 
 #include <sys/stat.h>
 
+#include <string>
+
 #ifdef USE_MINGW
 /* Define the default path separator for the platform. */
 #define OS_PATH_SEPARATOR     '\\'
@@ -195,14 +195,15 @@
     // Trace this function call.
     // To avoid overhead we only build the string if tracing is actually
     // enabled.
-    std::string *traceString = nullptr;
-    const char  *stringData  = "";
+    String8 *AString = NULL;
+    const char *String = "";
     if (ATRACE_ENABLED()) {
-        traceString = new std::string("runForEach_");
-        traceString->append(mHal.info.exportedForeachFuncList[slot].first);
-        stringData = traceString->c_str();
+        AString = new String8("runForEach_");
+        AString->append(mHal.info.exportedForeachFuncList[slot].first);
+        String = AString->string();
     }
-    ATRACE_NAME(stringData);
+    ATRACE_NAME(String);
+    (void)String;
 
     Context::PushState ps(rsc);
 
@@ -222,8 +223,8 @@
                       "Driver support for multi-input not present");
     }
 
-    if (traceString) {
-        delete traceString;
+    if (AString) {
+        delete AString;
     }
 }
 
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
index d4a6cd1..26ab6a8 100644
--- a/rsScriptGroup.cpp
+++ b/rsScriptGroup.cpp
@@ -17,6 +17,7 @@
 #include "rsScriptGroup.h"
 
 #include "rsContext.h"
+// TODO: Is this header needed here?
 #include "rsScriptGroup2.h"
 
 #include <algorithm>
@@ -33,8 +34,8 @@
         mRSC->mHal.funcs.scriptgroup.destroy(mRSC, this);
     }
 
-    for (auto link : mLinks) {
-        delete link;
+    for (size_t ct=0; ct < mLinks.size(); ct++) {
+        delete mLinks[ct];
     }
 
     for (auto input : mInputs) {
@@ -57,10 +58,12 @@
 }
 
 ScriptGroup::Node * ScriptGroup::findNode(Script *s) const {
-    for (auto node : mNodes) {
-        for (auto kernelRef : node->mKernels) {
-            if (kernelRef->mScript == s) {
-                return node;
+    //ALOGE("find %p   %i", s, (int)mNodes.size());
+    for (size_t ct=0; ct < mNodes.size(); ct++) {
+        Node *n = mNodes[ct];
+        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
+            if (n->mKernels[ct2]->mScript == s) {
+                return n;
             }
         }
     }
@@ -68,105 +71,138 @@
     return nullptr;
 }
 
-bool ScriptGroup::calcOrderRecurse(Node *node0, int depth) {
-    node0->mSeen = true;
-    if (node0->mOrder < depth) {
-        node0->mOrder = depth;
+bool ScriptGroup::calcOrderRecurse(Node *n, int depth) {  // depth-first walk over n's output links; false if an already-seen node is reached
+    n->mSeen = true;
+    if (n->mOrder < depth) {  // keep the largest depth at which n has been reached
+        n->mOrder = depth;
     }
     bool ret = true;
 
-    for (auto link : node0->mOutputs) {
-        Node *node1 = nullptr;
-        if (link->mDstField.get()) {
-            node1 = findNode(link->mDstField->mScript);
+    for (size_t ct=0; ct < n->mOutputs.size(); ct++) {
+        const Link *l = n->mOutputs[ct];
+        Node *nt = NULL;  // node that owns the script this link feeds
+        if (l->mDstField.get()) {  // a link targets either a field or a kernel
+            nt = findNode(l->mDstField->mScript);
         } else {
-            node1 = findNode(link->mDstKernel->mScript);
+            nt = findNode(l->mDstKernel->mScript);
         }
-        if (node1->mSeen) {
+        if (nt->mSeen) {  // NOTE(review): nt is dereferenced unchecked, yet findNode() can return nullptr
             return false;
         }
-        ret &= calcOrderRecurse(node1, node0->mOrder + 1);
+        ret &= calcOrderRecurse(nt, n->mOrder + 1);  // downstream nodes get a strictly larger order
     }
-
     return ret;
 }
 
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+// Three-way comparator for Vector::sort(): ascending by Node::mOrder.
+static int CompareNodeForSort(ScriptGroup::Node *const* lhs,
+                              ScriptGroup::Node *const* rhs) {
+    if (lhs[0]->mOrder > rhs[0]->mOrder) return 1;
+    if (lhs[0]->mOrder < rhs[0]->mOrder) return -1;
+    return 0;
+}
+#else
+// Less-than predicate for std::sort(): ascending by Node::mOrder, the same
+// direction CompareNodeForSort yields in the other build configuration.
+class NodeCompare {
+public:
+    bool operator() (const ScriptGroup::Node* lhs,
+                     const ScriptGroup::Node* rhs) {
+        // Must be '<': a '>' predicate would sort the nodes descending.
+        return lhs->mOrder < rhs->mOrder;
+    }
+};
+#endif
+
 bool ScriptGroup::calcOrder() {
     // Make nodes
 
-    for (auto kernelRef : mKernels) {
-        const ScriptKernelID *kernel = kernelRef.get();
-        Node *node = findNode(kernel->mScript);
-        if (node == nullptr) {
-            node = new Node(kernel->mScript);
-            mNodes.push_back(node);
+    for (size_t ct=0; ct < mKernels.size(); ct++) {
+        const ScriptKernelID *k = mKernels[ct].get();
+        //ALOGE(" kernel %i, %p  s=%p", (int)ct, k, mKernels[ct]->mScript);
+        Node *n = findNode(k->mScript);
+        //ALOGE("    n = %p", n);
+        if (n == NULL) {
+            n = new Node(k->mScript);
+            mNodes.add(n);
         }
-        node->mKernels.push_back(kernel);
+        n->mKernels.add(k);
     }
 
     // add links
-    for (auto link : mLinks) {
-        Node *node = findNode(link->mSource->mScript);
-        node->mOutputs.push_back(link);
+    //ALOGE("link count %i", (int)mLinks.size());
+    for (size_t ct=0; ct < mLinks.size(); ct++) {
+        Link *l = mLinks[ct];
+        //ALOGE("link  %i %p", (int)ct, l);
+        Node *n = findNode(l->mSource->mScript);
+        //ALOGE("link n %p", n);
+        n->mOutputs.add(l);
 
-        if (link->mDstKernel.get()) {
-            node = findNode(link->mDstKernel->mScript);
-            node->mInputs.push_back(link);
+        if (l->mDstKernel.get()) {
+            //ALOGE("l->mDstKernel.get() %p", l->mDstKernel.get());
+            n = findNode(l->mDstKernel->mScript);
+            //ALOGE("  n1 %p", n);
+            n->mInputs.add(l);
         } else {
-            node = findNode(link->mDstField->mScript);
-            node->mInputs.push_back(link);
+            n = findNode(l->mDstField->mScript);
+            //ALOGE("  n2 %p", n);
+            n->mInputs.add(l);
         }
     }
 
+    //ALOGE("node count %i", (int)mNodes.size());
     // Order nodes
     bool ret = true;
-    for (auto n0 : mNodes) {
-        if (n0->mInputs.size() == 0) {
-            for (auto n1 : mNodes) {
-                n1->mSeen = false;
+    for (size_t ct=0; ct < mNodes.size(); ct++) {
+        Node *n = mNodes[ct];
+        if (n->mInputs.size() == 0) {
+            for (size_t ct2=0; ct2 < mNodes.size(); ct2++) {
+                mNodes[ct2]->mSeen = false;
             }
-            ret &= calcOrderRecurse(n0, 1);
+            ret &= calcOrderRecurse(n, 0);
         }
     }
 
-    for (auto kernelRef : mKernels) {
-        const ScriptKernelID *kernel = kernelRef.get();
-        const Node *node = findNode(kernel->mScript);
+    for (size_t ct=0; ct < mKernels.size(); ct++) {
+        const ScriptKernelID *k = mKernels[ct].get();
+        const Node *n = findNode(k->mScript);
 
-        if (kernel->mHasKernelOutput) {
+        if (k->mHasKernelOutput) {
             bool found = false;
-            for (auto output : node->mOutputs) {
-                if (output->mSource.get() == kernel) {
+            for (size_t ct2=0; ct2 < n->mOutputs.size(); ct2++) {
+                if (n->mOutputs[ct2]->mSource.get() == k) {
                     found = true;
                     break;
                 }
             }
-
             if (!found) {
-                mOutputs.push_back(new IO(kernel));
+                //ALOGE("add io out %p", k);
+                mOutputs.add(new IO(k));
             }
         }
 
-        if (kernel->mHasKernelInput) {
+        if (k->mHasKernelInput) {
             bool found = false;
-            for (auto input : node->mInputs) {
-                if (input->mDstKernel.get() == kernel) {
+            for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
+                if (n->mInputs[ct2]->mDstKernel.get() == k) {
                     found = true;
                     break;
                 }
             }
             if (!found) {
-                mInputs.push_back(new IO(kernel));
+                //ALOGE("add io in %p", k);
+                mInputs.add(new IO(k));
             }
         }
     }
 
     // sort
-    std::stable_sort(mNodes.begin(), mNodes.end(),
-                     [](const ScriptGroup::Node* lhs,
-                        const ScriptGroup::Node* rhs) {
-        return lhs->mOrder < rhs->mOrder;
-    });
+#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
+    mNodes.sort(&CompareNodeForSort);
+#else
+    std::sort(mNodes.begin(), mNodes.end(), NodeCompare());
+#endif
 
     return ret;
 }
@@ -190,7 +226,7 @@
 
     sg->mKernels.reserve(kernelCount);
     for (size_t ct=0; ct < kernelCount; ct++) {
-        sg->mKernels.push_back(kernels[ct]);
+        sg->mKernels.add(kernels[ct]);
     }
 
     sg->mLinks.reserve(linkCount);
@@ -200,7 +236,7 @@
         l->mSource = src[ct];
         l->mDstField = dstF[ct];
         l->mDstKernel = dstK[ct];
-        sg->mLinks.push_back(l);
+        sg->mLinks.add(l);
     }
 
     sg->calcOrder();
@@ -235,9 +271,9 @@
 }
 
 void ScriptGroup::setInput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (auto input : mInputs) {
-        if (input->mKernel == kid) {
-            input->mAlloc = a;
+    for (size_t ct=0; ct < mInputs.size(); ct++) {
+        if (mInputs[ct]->mKernel == kid) {
+            mInputs[ct]->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setInput) {
                 rsc->mHal.funcs.scriptgroup.setInput(rsc, this, kid, a);
@@ -249,9 +285,9 @@
 }
 
 void ScriptGroup::setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
-    for (auto output : mOutputs) {
-        if (output->mKernel == kid) {
-            output->mAlloc = a;
+    for (size_t ct=0; ct < mOutputs.size(); ct++) {
+        if (mOutputs[ct]->mKernel == kid) {
+            mOutputs[ct]->mAlloc = a;
 
             if (rsc->mHal.funcs.scriptgroup.setOutput) {
                 rsc->mHal.funcs.scriptgroup.setOutput(rsc, this, kid, a);
@@ -290,45 +326,52 @@
         return;
     }
 
-    for (auto node : mNodes) {
-        for (auto kernel : node->mKernels) {
-            Allocation *ain  = nullptr;
-            Allocation *aout = nullptr;
+    for (size_t ct=0; ct < mNodes.size(); ct++) {
+        Node *n = mNodes[ct];
+        //ALOGE("node %i, order %i, in %i out %i", (int)ct, n->mOrder, (int)n->mInputs.size(), (int)n->mOutputs.size());
 
-            for (auto nodeInput : node->mInputs) {
-                if (nodeInput->mDstKernel.get() == kernel) {
-                    ain = nodeInput->mAlloc.get();
+        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
+            const ScriptKernelID *k = n->mKernels[ct2];
+            Allocation *ain = NULL;
+            Allocation *aout = NULL;
+
+            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
+                if (n->mInputs[ct3]->mDstKernel.get() == k) {
+                    ain = n->mInputs[ct3]->mAlloc.get();
+                    //ALOGE(" link in %p", ain);
+                }
+            }
+            for (size_t ct3=0; ct3 < mInputs.size(); ct3++) {
+                if (mInputs[ct3]->mKernel == k) {
+                    ain = mInputs[ct3]->mAlloc.get();
+                    //ALOGE(" io in %p", ain);
                 }
             }
 
-            for (auto sgInput : mInputs) {
-                if (sgInput->mKernel == kernel) {
-                    ain = sgInput->mAlloc.get();
+            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
+                if (n->mOutputs[ct3]->mSource.get() == k) {
+                    aout = n->mOutputs[ct3]->mAlloc.get();
+                    //ALOGE(" link out %p", aout);
+                }
+            }
+            for (size_t ct3=0; ct3 < mOutputs.size(); ct3++) {
+                if (mOutputs[ct3]->mKernel == k) {
+                    aout = mOutputs[ct3]->mAlloc.get();
+                    //ALOGE(" io out %p", aout);
                 }
             }
 
-            for (auto nodeOutput : node->mOutputs) {
-                if (nodeOutput->mSource.get() == kernel) {
-                    aout = nodeOutput->mAlloc.get();
-                }
-            }
+            if (ain == NULL) {
+                n->mScript->runForEach(rsc, k->mSlot, NULL, 0, aout, NULL, 0);
 
-            for (auto sgOutput : mOutputs) {
-                if (sgOutput->mKernel == kernel) {
-                    aout = sgOutput->mAlloc.get();
-                }
-            }
-
-            if (ain == nullptr) {
-                node->mScript->runForEach(rsc, kernel->mSlot, nullptr, 0, aout,
-                                          nullptr, 0);
             } else {
                 const Allocation *ains[1] = {ain};
-                node->mScript->runForEach(rsc, kernel->mSlot, ains,
-                                          sizeof(ains) / sizeof(RsAllocation),
-                                          aout, nullptr, 0);
+                n->mScript->runForEach(rsc, k->mSlot, ains,
+                                       sizeof(ains) / sizeof(RsAllocation),
+                                       aout, NULL, 0);
             }
         }
+
     }
 
 }
@@ -362,12 +405,14 @@
 
 void rsi_ScriptGroupSetInput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
+    //ALOGE("rsi_ScriptGroupSetInput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setInput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
 
 void rsi_ScriptGroupSetOutput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
         RsAllocation alloc) {
+    //ALOGE("rsi_ScriptGroupSetOutput");
     ScriptGroup *s = (ScriptGroup *)sg;
     s->setOutput(rsc, (ScriptKernelID *)kid, (Allocation *)alloc);
 }
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
index ff0259a..68783f3 100644
--- a/rsScriptGroup.h
+++ b/rsScriptGroup.h
@@ -41,7 +41,7 @@
     virtual SG_API_Version getApiVersion() const { return SG_V1; }
     virtual void execute(Context *rsc);
 
-    std::vector<ObjectBaseRef<ScriptKernelID> > mKernels;
+    Vector<ObjectBaseRef<ScriptKernelID> > mKernels;
 
     class Link {
     public:
@@ -58,9 +58,9 @@
     public:
         Node(Script *);
 
-        std::vector<const ScriptKernelID *> mKernels;
-        std::vector<Link *> mOutputs;
-        std::vector<Link *> mInputs;
+        Vector<const ScriptKernelID *> mKernels;
+        Vector<Link *> mOutputs;
+        Vector<Link *> mInputs;
         bool mSeen;
         int mOrder;
         Script *mScript;
@@ -74,10 +74,10 @@
         ObjectBaseRef<Allocation> mAlloc;
     };
 
-    std::vector<Link *> mLinks;
-    std::vector<Node *> mNodes;
-    std::vector<IO *> mInputs;
-    std::vector<IO *> mOutputs;
+    Vector<Link *> mLinks;
+    Vector<Node *> mNodes;
+    Vector<IO *> mInputs;
+    Vector<IO *> mOutputs;
 
     static ScriptGroup * create(Context *rsc,
                            ScriptKernelID ** kernels, size_t kernelsSize,
@@ -110,3 +110,4 @@
 }
 }
 #endif
+
diff --git a/rsType.cpp b/rsType.cpp
index 39c91cd..d45de2d 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -33,14 +33,10 @@
 }
 
 void Type::preDestroy() const {
-    auto &types = mRSC->mStateType.mTypes;
-
-    for (auto typeIter = types.begin(), endIter = types.end();
-         typeIter != endIter; typeIter++) {
-
-        if (this == *typeIter) {
-            types.erase(typeIter);
-            return;
+    for (uint32_t ct = 0; ct < mRSC->mStateType.mTypes.size(); ct++) {  // scan the context's type list
+        if (mRSC->mStateType.mTypes[ct] == this) {
+            mRSC->mStateType.mTypes.removeAt(ct);  // unregister this type
+            break;  // only the first matching entry is removed
         }
     }
 }
@@ -291,7 +287,7 @@
     nt->compute();
 
     ObjectBase::asyncLock();
-    stc->mTypes.push_back(nt);
+    stc->mTypes.push(nt);
     ObjectBase::asyncUnlock();
 
     return returnRef;
diff --git a/rsType.h b/rsType.h
index 445f4ff..6ae8446 100644
--- a/rsType.h
+++ b/rsType.h
@@ -161,7 +161,7 @@
     ~TypeState();
 
     // Cache of all existing types.
-    std::vector<Type *> mTypes;
+    Vector<Type *> mTypes;
 };
 
 
diff --git a/rs_hal.h b/rs_hal.h
index 9a4e9a5..5e8fee8 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -236,10 +236,12 @@
                             uint32_t srcXoff, uint32_t srcYoff, uint32_t srcZoff,
                             uint32_t srcLod);
 
-        void (*elementData1D)(const Context *rsc, const Allocation *alloc, uint32_t x,
-                              const void *data, uint32_t elementOff, size_t sizeBytes);
-        void (*elementData2D)(const Context *rsc, const Allocation *alloc, uint32_t x, uint32_t y,
-                              const void *data, uint32_t elementOff, size_t sizeBytes);
+        void (*elementData)(const Context *rsc, const Allocation *alloc,
+                            uint32_t x, uint32_t y, uint32_t z,
+                            const void *data, uint32_t elementOff, size_t sizeBytes);
+        void (*elementRead)(const Context *rsc, const Allocation *alloc,
+                            uint32_t x, uint32_t y, uint32_t z,
+                            void *data, uint32_t elementOff, size_t sizeBytes);
 
         void (*generateMipmaps)(const Context *rsc, const Allocation *alloc);