Runtime support for Script Group as single module

Also made two other cleanups:
- Changed KernelID/InvokeID into IDBase in class Closure
    Rather than having two fields in class Closure, one of type ScriptKernelID and
    the other ScriptInvokeID, use a single field of the common base class IDBase.
    Added a boolean field to indicate whether it is a kernel or an invoke, since
    -fno-rtti is on.
- Removed user pointer from CPU closure

Change-Id: I5553f86b2e58325f85649078d48685a38f12d62f
diff --git a/cpu_ref/rsCpuExecutable.cpp b/cpu_ref/rsCpuExecutable.cpp
index e5009d8..75f5e61 100644
--- a/cpu_ref/rsCpuExecutable.cpp
+++ b/cpu_ref/rsCpuExecutable.cpp
@@ -329,6 +329,7 @@
 
     void** fieldAddress = nullptr;
     bool* fieldIsObject = nullptr;
+    char** fieldName = nullptr;
     InvokeFunc_t* invokeFunctions = nullptr;
     ForEachFunc_t* forEachFunctions = nullptr;
     uint32_t* forEachSignatures = nullptr;
@@ -356,6 +357,11 @@
         goto error;
     }
 
+    fieldName = new char*[varCount];
+    if (fieldName == nullptr) {
+        goto error;
+    }
+
     for (size_t i = 0; i < varCount; ++i) {
         if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
             goto error;
@@ -372,6 +378,8 @@
         }
         fieldAddress[i] = addr;
         fieldIsObject[i] = false;
+        fieldName[i] = new char[strlen(line)+1];
+        strcpy(fieldName[i], line);
     }
 
     if (strgets(line, MAXLINE, &rsInfo) == nullptr) {
@@ -440,7 +448,8 @@
         forEachSignatures[i] = tmpSig;
         forEachFunctions[i] =
             (ForEachFunc_t) dlsym(sharedObj, tmpName);
-        if (i != 0 && forEachFunctions[i] == nullptr) {
+        if (i != 0 && forEachFunctions[i] == nullptr &&
+            strcmp(tmpName, "root.expand")) {
             // Ignore missing root.expand functions.
             // root() is always specified at location 0.
             ALOGE("Failed to find forEach function address for %s: %s",
@@ -503,7 +512,6 @@
             ALOGE("Unable to read pragma at index %zu!", i);
             goto error;
         }
-
         char key[MAXLINE];
         char value[MAXLINE] = ""; // initialize in case value is empty
 
@@ -561,15 +569,15 @@
         char *checksumStart = &line[strlen(CHECKSUM_STR)];
         checksum = new char[strlen(checksumStart) + 1];
         strcpy(checksum, checksumStart);
-    }
-    else {
+    } else {
+        ALOGE("Missing checksum in shared obj file");
         goto error;
     }
 
 #endif  // RS_COMPATIBILITY_LIB
 
     return new ScriptExecutable(
-        RSContext, fieldAddress, fieldIsObject, varCount,
+        RSContext, fieldAddress, fieldIsObject, fieldName, varCount,
         invokeFunctions, funcCount,
         forEachFunctions, forEachSignatures, forEachCount,
         pragmaKeys, pragmaValues, pragmaCount,
@@ -591,12 +599,28 @@
 
     delete[] forEachSignatures;
     delete[] forEachFunctions;
+
     delete[] invokeFunctions;
+
+    for (size_t i = 0; i < varCount; i++) {
+        delete[] fieldName[i];
+    }
+    delete[] fieldName;
     delete[] fieldIsObject;
     delete[] fieldAddress;
 
     return nullptr;
 }
 
+void* ScriptExecutable::getFieldAddress(const char* name) const {
+    // TODO: improve this by using a hash map.
+    for (size_t i = 0; i < mExportedVarCount; i++) {
+        if (strcmp(name, mFieldName[i]) == 0) {
+            return mFieldAddress[i];
+        }
+    }
+    return nullptr;
+}
+
 }  // namespace renderscript
 }  // namespace android
diff --git a/cpu_ref/rsCpuExecutable.h b/cpu_ref/rsCpuExecutable.h
index cdf6fd6..ed6904d 100644
--- a/cpu_ref/rsCpuExecutable.h
+++ b/cpu_ref/rsCpuExecutable.h
@@ -57,22 +57,22 @@
 class ScriptExecutable {
 public:
     ScriptExecutable(Context* RSContext,
-                     void** fieldAddress, bool* fieldIsObject, size_t varCount,
+                     void** fieldAddress, bool* fieldIsObject,
+                     const char* const * fieldName, size_t varCount,
                      InvokeFunc_t* invokeFunctions, size_t funcCount,
                      ForEachFunc_t* forEachFunctions, uint32_t* forEachSignatures,
                      size_t forEachCount,
-                     const char ** pragmaKeys, const char ** pragmaValues,
+                     const char** pragmaKeys, const char** pragmaValues,
                      size_t pragmaCount,
                      bool isThreadable, const char *buildChecksum) :
         mFieldAddress(fieldAddress), mFieldIsObject(fieldIsObject),
-            mExportedVarCount(varCount),
-            mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
-            mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
-            mForEachCount(forEachCount),
-            mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
-            mPragmaCount(pragmaCount),
-            mIsThreadable(isThreadable), mBuildChecksum(buildChecksum),
-            mRS(RSContext) {
+        mFieldName(fieldName), mExportedVarCount(varCount),
+        mInvokeFunctions(invokeFunctions), mFuncCount(funcCount),
+        mForEachFunctions(forEachFunctions), mForEachSignatures(forEachSignatures),
+        mForEachCount(forEachCount),
+        mPragmaKeys(pragmaKeys), mPragmaValues(pragmaValues),
+        mPragmaCount(pragmaCount), mIsThreadable(isThreadable),
+        mBuildChecksum(buildChecksum), mRS(RSContext) {
     }
 
     ~ScriptExecutable() {
@@ -91,12 +91,18 @@
             delete [] mPragmaKeys[i];
             delete [] mPragmaValues[i];
         }
-
         delete[] mPragmaValues;
         delete[] mPragmaKeys;
+
         delete[] mForEachSignatures;
         delete[] mForEachFunctions;
+
         delete[] mInvokeFunctions;
+
+        for (size_t i = 0; i < mExportedVarCount; i++) {
+            delete[] mFieldName[i];
+        }
+        delete[] mFieldName;
         delete[] mFieldIsObject;
         delete[] mFieldAddress;
     }
@@ -110,8 +116,12 @@
     size_t getPragmaCount() const { return mPragmaCount; }
 
     void* getFieldAddress(int slot) const { return mFieldAddress[slot]; }
+    void* getFieldAddress(const char* name) const;
     bool getFieldIsObject(int slot) const { return mFieldIsObject[slot]; }
+    const char* getFieldName(int slot) const { return mFieldName[slot]; }
+
     InvokeFunc_t getInvokeFunction(int slot) const { return mInvokeFunctions[slot]; }
+
     ForEachFunc_t getForEachFunction(int slot) const { return mForEachFunctions[slot]; }
     uint32_t getForEachSignature(int slot) const { return mForEachSignatures[slot]; }
 
@@ -129,6 +139,7 @@
 private:
     void** mFieldAddress;
     bool* mFieldIsObject;
+    const char* const * mFieldName;
     size_t mExportedVarCount;
 
     InvokeFunc_t* mInvokeFunctions;
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index ae7e597..481c54d 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -855,6 +855,10 @@
     rsrSetObject(mCtx->getContext(), (rs_object_base *)destPtr, data);
 }
 
+const char* RsdCpuScriptImpl::getFieldName(uint32_t slot) const {
+    return mScriptExec->getFieldName(slot);
+}
+
 RsdCpuScriptImpl::~RsdCpuScriptImpl() {
 #ifndef RS_COMPATIBILITY_LIB
     if (mCompilerDriver) {
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 44df8a9..aaaa2a2 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -87,6 +87,7 @@
     virtual void setGlobalBind(uint32_t slot, Allocation *data);
     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
 
+    const char* getFieldName(uint32_t slot) const;
 
     virtual ~RsdCpuScriptImpl();
     RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s);
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 7222eb9..6bc98b4 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -5,6 +5,8 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+#include <set>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -75,25 +77,25 @@
 
         mutable_kparams->out = (void*)ptr;
 
-        mutable_kparams->usr = cpuClosure->mUsrPtr;
-
         cpuClosure->mFunc(kparams, xstart, xend, ostep);
     }
 
     mutable_kparams->ins        = oldIns;
     mutable_kparams->inEStrides = oldStrides;
-    mutable_kparams->usr        = &closures;
 }
 
 }  // namespace
 
+Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
+    mGroup(group), mFunc(nullptr) {
+    mName = strndup(name, strlen(name));
+}
+
 Batch::~Batch() {
     for (CPUClosure* c : mClosures) {
         delete c;
     }
-    if (mScriptObj) {
-        dlclose(mScriptObj);
-    }
+    free(mName);
 }
 
 bool Batch::conflict(CPUClosure* cpuClosure) const {
@@ -103,8 +105,7 @@
 
     const Closure* closure = cpuClosure->mClosure;
 
-    if (closure->mKernelID.get() == nullptr ||
-        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
+    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
         // An invoke should be in a batch by itself, so it conflicts with any other
         // closure.
         return true;
@@ -134,30 +135,30 @@
 
 CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                          const ScriptGroupBase *sg) :
-    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
+    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
+    mExecutable(nullptr), mScriptObj(nullptr) {
     rsAssert(!mGroup->mClosures.empty());
 
-    Batch* batch = new Batch(this);
+    Batch* batch = new Batch(this, "Batch0");
+    int i = 0;
     for (Closure* closure: mGroup->mClosures) {
-        const ScriptKernelID* kernelID = closure->mKernelID.get();
-        RsdCpuScriptImpl* si;
         CPUClosure* cc;
-        if (kernelID != nullptr) {
-            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
+        const IDBase* funcID = closure->mFunctionID.get();
+        RsdCpuScriptImpl* si =
+                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
+        if (closure->mIsKernel) {
             MTLaunchStruct mtls;
-            si->forEachKernelSetup(kernelID->mSlot, &mtls);
-            // TODO: Is mtls.fep.usrLen ever used?
-            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
-                                mtls.fep.usr, mtls.fep.usrLen);
+            si->forEachKernelSetup(funcID->mSlot, &mtls);
+            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
         } else {
-            si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(
-                    closure->mInvokeID->mScript);
             cc = new CPUClosure(closure, si);
         }
 
         if (batch->conflict(cc)) {
             mBatches.push_back(batch);
-            batch = new Batch(this);
+            std::stringstream ss;
+            ss << "Batch" << ++i;
+            batch = new Batch(this, ss.str().c_str());
         }
 
         batch->mClosures.push_back(cc);
@@ -167,16 +168,33 @@
     mBatches.push_back(batch);
 
 #ifndef RS_COMPATIBILITY_LIB
-    for (Batch* batch : mBatches) {
-        batch->tryToCreateFusedKernel(mGroup->mCacheDir);
+    compile(mGroup->mCacheDir);
+    if (mScriptObj != nullptr && mExecutable != nullptr) {
+        for (Batch* batch : mBatches) {
+            batch->resolveFuncPtr(mScriptObj);
+        }
     }
-#endif
+#endif  // RS_COMPATIBILITY_LIB
+}
+
+void Batch::resolveFuncPtr(void* sharedObj) {
+    std::string funcName(mName);
+    if (mClosures.front()->mClosure->mIsKernel) {
+        funcName.append(".expand");
+    }
+    mFunc = dlsym(sharedObj, funcName.c_str());
+    rsAssert (mFunc != nullptr);
 }
 
 CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
     for (Batch* batch : mBatches) {
         delete batch;
     }
+    // TODO: move this dlclose into ~ScriptExecutable().
+    if (mScriptObj != nullptr) {
+        dlclose(mScriptObj);
+    }
+    delete mExecutable;
 }
 
 namespace {
@@ -189,7 +207,8 @@
 }
 
 void setupCompileArguments(
-        const vector<string>& inputs, const vector<int>& kernels,
+        const vector<string>& inputs, const vector<string>& kernelBatches,
+        const vector<string>& invokeBatches,
         const string& output_dir, const string& output_filename,
         const string& rsLib, vector<const char*>* args) {
     args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
@@ -202,10 +221,13 @@
     for (const string& input : inputs) {
         args->push_back(input.c_str());
     }
-    for (int kernel : kernels) {
-        args->push_back("-k");
-        string strKernel = std::to_string(kernel);
-        args->push_back(strKernel.c_str());
+    for (const string& batch : kernelBatches) {
+        args->push_back("-merge");
+        args->push_back(batch.c_str());
+    }
+    for (const string& batch : invokeBatches) {
+        args->push_back("-invoke");
+        args->push_back(batch.c_str());
     }
     args->push_back("-output_path");
     args->push_back(output_dir.c_str());
@@ -247,13 +269,32 @@
 
     return true;
 }
-#endif
+
+void generateSourceSlot(const Closure& closure,
+                        const std::vector<std::string>& inputs,
+                        std::stringstream& ss) {
+    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
+    const Script* script = funcID->mScript;
+
+    rsAssert (!script->isIntrinsic());
+
+    const RsdCpuScriptImpl *cpuScript =
+            (const RsdCpuScriptImpl*)script->mHal.drv;
+    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
+
+    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
+            inputs.begin();
+
+    ss << index << "," << funcID->mSlot << ".";
+}
+
+#endif  // RS_COMPATIBILITY_LIB
 
 }  // anonymous namespace
 
-void Batch::tryToCreateFusedKernel(const char *cacheDir) {
+void CpuScriptGroup2Impl::compile(const char* cacheDir) {
 #ifndef RS_COMPATIBILITY_LIB
-    if (mClosures.size() < 2) {
+    if (mGroup->mClosures.size() < 2) {
         return;
     }
 
@@ -261,25 +302,43 @@
     // Fuse the input kernels and generate native code in an object file
     //===--------------------------------------------------------------------===//
 
-    std::vector<string> inputFiles;
-    std::vector<int> slots;
+    std::set<string> inputSet;
+    for (Closure* closure : mGroup->mClosures) {
+        const Script* script = closure->mFunctionID.get()->mScript;
 
-    for (CPUClosure* cpuClosure : mClosures) {
-        const Closure* closure = cpuClosure->mClosure;
-        const ScriptKernelID* kernelID = closure->mKernelID.get();
-        const Script* script = kernelID->mScript;
-
+        // If any script is an intrinsic, give up trying to fuse the kernels.
         if (script->isIntrinsic()) {
             return;
         }
 
         const RsdCpuScriptImpl *cpuScript =
                 (const RsdCpuScriptImpl*)script->mHal.drv;
-
         const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
+        inputSet.insert(bitcodeFilename);
+    }
 
-        inputFiles.push_back(bitcodeFilename);
-        slots.push_back(kernelID->mSlot);
+    std::vector<string> inputs(inputSet.begin(), inputSet.end());
+
+    std::vector<string> kernelBatches;
+    std::vector<string> invokeBatches;
+
+    int i = 0;
+    for (const auto& batch : mBatches) {
+        rsAssert(batch->size() > 0);
+
+        std::stringstream ss;
+        ss << batch->mName << ":";
+
+        if (!batch->mClosures.front()->mClosure->mIsKernel) {
+            rsAssert(batch->size() == 1);
+            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
+            invokeBatches.push_back(ss.str());
+        } else {
+            for (const auto& cpuClosure : batch->mClosures) {
+                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
+            }
+            kernelBatches.push_back(ss.str());
+        }
     }
 
     rsAssert(cacheDir != nullptr);
@@ -295,8 +354,8 @@
     string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2));
     string rsLibPath(SYSLIBPATH"/libclcore.bc");
     vector<const char*> arguments;
-    setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath,
-                          &arguments);
+    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
+                          outputFileName, rsLibPath, &arguments);
     std::unique_ptr<const char> joined(
         rsuJoinStrings(arguments.size() - 1, arguments.data()));
     string commandLine (joined.get());
@@ -317,15 +376,15 @@
         return;
     }
 
-    void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
-    if (mSharedObj == nullptr) {
+    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
+    if (mScriptObj == nullptr) {
         ALOGE("Unable to load '%s'", resName);
         return;
     }
 
     mExecutable = ScriptExecutable::createFromSharedObject(
-                                                           nullptr,  // RS context. Unused.
-                                                           mSharedObj);
+        nullptr,  // RS context. Unused.
+        mScriptObj);
 
 #endif  // RS_COMPATIBILITY_LIB
 }
@@ -340,13 +399,8 @@
 void Batch::setGlobalsForBatch() {
     for (CPUClosure* cpuClosure : mClosures) {
         const Closure* closure = cpuClosure->mClosure;
-        const ScriptKernelID* kernelID = closure->mKernelID.get();
-        Script* s;
-        if (kernelID != nullptr) {
-            s = kernelID->mScript;
-        } else {
-            s = cpuClosure->mClosure->mInvokeID->mScript;
-        }
+        const IDBase* funcID = closure->mFunctionID.get();
+        Script* s = funcID->mScript;;
         for (const auto& p : closure->mGlobals) {
             const void* value = p.second.first;
             int size = p.second.second;
@@ -360,18 +414,54 @@
             rsAssert(p.first != nullptr);
             ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)",
                   closure, p.first, p.first->mScript, p.first->mSlot);
-            // We use -1 size to indicate an ObjectBase rather than a primitive type
-            if (size < 0) {
-                s->setVarObj(p.first->mSlot, (ObjectBase*)value);
+            Script* script = p.first->mScript;
+            const RsdCpuScriptImpl *cpuScript =
+                    (const RsdCpuScriptImpl*)script->mHal.drv;
+            int slot = p.first->mSlot;
+            ScriptExecutable* exec = mGroup->getExecutable();
+            if (exec != nullptr) {
+                const char* varName = cpuScript->getFieldName(slot);
+                void* addr = exec->getFieldAddress(varName);
+                if (size < 0) {
+                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
+                                 (rs_object_base*)addr, (ObjectBase*)value);
+                } else {
+                    memcpy(addr, (const void*)&value, size);
+                }
             } else {
-                s->setVar(p.first->mSlot, (const void*)&value, size);
+                // We use -1 size to indicate an ObjectBase rather than a primitive type
+                if (size < 0) {
+                    s->setVarObj(slot, (ObjectBase*)value);
+                } else {
+                    s->setVar(slot, (const void*)&value, size);
+                }
             }
         }
     }
 }
 
 void Batch::run() {
-    if (mExecutable != nullptr) {
+    if (!mClosures.front()->mClosure->mIsKernel) {
+        rsAssert(mClosures.size() == 1);
+
+        // This batch contains a single closure for an invoke function
+        CPUClosure* cc = mClosures.front();
+        const Closure* c = cc->mClosure;
+
+        if (mFunc != nullptr) {
+            // TODO: Need to align pointers for x86_64.
+            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
+            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
+        } else {
+            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
+            rsAssert(invokeID != nullptr);
+            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
+        }
+
+        return;
+    }
+
+    if (mFunc != nullptr) {
         MTLaunchStruct mtls;
         const CPUClosure* firstCpuClosure = mClosures.front();
         const CPUClosure* lastCpuClosure = mClosures.back();
@@ -384,7 +474,7 @@
 
         mtls.script = nullptr;
         mtls.fep.usr = nullptr;
-        mtls.kernel = mExecutable->getForEachFunction(0);
+        mtls.kernel = (ForEachFunc_t)mFunc;
 
         mGroup->getCpuRefImpl()->launchThreads(
                 (const Allocation**)firstCpuClosure->mClosure->mArgs,
@@ -395,25 +485,14 @@
         return;
     }
 
-    if (mClosures.size() == 1 &&
-        mClosures.front()->mClosure->mKernelID.get() == nullptr) {
-        // This closure is for an invoke function
-        CPUClosure* cc = mClosures.front();
-        const Closure* c = cc->mClosure;
-        const ScriptInvokeID* invokeID = c->mInvokeID;
-        rsAssert(invokeID != nullptr);
-        cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
-        return;
-    }
-
     for (CPUClosure* cpuClosure : mClosures) {
         const Closure* closure = cpuClosure->mClosure;
-        const ScriptKernelID* kernelID = closure->mKernelID.get();
+        const ScriptKernelID* kernelID =
+                (const ScriptKernelID*)closure->mFunctionID.get();
         cpuClosure->mSi->preLaunch(kernelID->mSlot,
                                    (const Allocation**)closure->mArgs,
                                    closure->mNumArg, closure->mReturnValue,
-                                   cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
-                                   nullptr);
+                                   nullptr, 0, nullptr);
     }
 
     const CPUClosure* cpuClosure = mClosures.front();
@@ -434,7 +513,8 @@
 
     for (CPUClosure* cpuClosure : mClosures) {
         const Closure* closure = cpuClosure->mClosure;
-        const ScriptKernelID* kernelID = closure->mKernelID.get();
+        const ScriptKernelID* kernelID =
+                (const ScriptKernelID*)closure->mFunctionID.get();
         cpuClosure->mSi->postLaunch(kernelID->mSlot,
                                     (const Allocation**)closure->mArgs,
                                     closure->mNumArg, closure->mReturnValue,
diff --git a/cpu_ref/rsCpuScriptGroup2.h b/cpu_ref/rsCpuScriptGroup2.h
index 9ff16c4..f8d36fd 100644
--- a/cpu_ref/rsCpuScriptGroup2.h
+++ b/cpu_ref/rsCpuScriptGroup2.h
@@ -21,44 +21,40 @@
 
 class CPUClosure {
 public:
-    CPUClosure(const Closure* closure, RsdCpuScriptImpl* si, ExpandFuncTy func,
-               const void* usrPtr, const size_t usrSize) :
-        mClosure(closure), mSi(si), mFunc(func),
-        mUsrPtr(usrPtr), mUsrSize(usrSize) {}
+    CPUClosure(const Closure* closure, RsdCpuScriptImpl* si, ExpandFuncTy func) :
+        mClosure(closure), mSi(si), mFunc(func) {}
 
     CPUClosure(const Closure* closure, RsdCpuScriptImpl* si) :
-        mClosure(closure), mSi(si), mFunc(nullptr),
-        mUsrPtr(nullptr), mUsrSize(0) {}
+        mClosure(closure), mSi(si), mFunc(nullptr) {}
 
     // It's important to do forwarding here than inheritance for unbound value
     // binding to work.
     const Closure* mClosure;
     RsdCpuScriptImpl* mSi;
     const ExpandFuncTy mFunc;
-    const void* mUsrPtr;
-    const size_t mUsrSize;
 };
 
 class CpuScriptGroup2Impl;
 
 class Batch {
 public:
-    Batch(CpuScriptGroup2Impl* group) : mGroup(group), mExecutable(nullptr) {}
-
+    Batch(CpuScriptGroup2Impl* group, const char* name);
     ~Batch();
 
     // Returns true if closure depends on any closure in this batch for a global
     // variable
     bool conflict(CPUClosure* closure) const;
 
-    void tryToCreateFusedKernel(const char* cacheDir);
+    void resolveFuncPtr(void* sharedObj);
     void setGlobalsForBatch();
     void run();
 
+    size_t size() const { return mClosures.size(); }
+
     CpuScriptGroup2Impl* mGroup;
-    ScriptExecutable* mExecutable;
-    void* mScriptObj;
     List<CPUClosure*> mClosures;
+    char* mName;
+    void* mFunc;
 };
 
 class CpuScriptGroup2Impl : public RsdCpuReference::CpuScriptGroup2 {
@@ -70,11 +66,16 @@
     virtual void execute();
 
     RsdCpuReferenceImpl* getCpuRefImpl() const { return mCpuRefImpl; }
+    ScriptExecutable* getExecutable() const { return mExecutable; }
+
+    void compile(const char* cacheDir);
 
 private:
     RsdCpuReferenceImpl* mCpuRefImpl;
     const ScriptGroup2* mGroup;
     List<Batch*> mBatches;
+    ScriptExecutable* mExecutable;
+    void* mScriptObj;
 };
 
 }  // namespace renderscript
diff --git a/rsClosure.cpp b/rsClosure.cpp
index ece92d5..8fb12b8 100644
--- a/rsClosure.cpp
+++ b/rsClosure.cpp
@@ -66,8 +66,8 @@
                  const size_t* sizes,
                  const Closure** depClosures,
                  const ScriptFieldID** depFieldIDs) :
-    ObjectBase(context), mContext(context), mKernelID((ScriptKernelID*)kernelID),
-    mInvokeID(nullptr), mReturnValue(returnValue), mParams(nullptr),
+    ObjectBase(context), mContext(context), mFunctionID((IDBase*)kernelID),
+    mIsKernel(true), mReturnValue(returnValue), mParams(nullptr),
     mParamLength(0) {
     size_t i;
 
@@ -92,8 +92,6 @@
         j++;
     }
 
-    // mDependences.insert(depClosures, depClosures + numValues);
-
     for (i = 0; i < mNumArg; i++) {
         const Closure* dep = depClosures[i];
         if (dep != nullptr) {
@@ -128,7 +126,7 @@
                  const void* params, const size_t paramLength,
                  const size_t numValues, const ScriptFieldID** fieldIDs,
                  const void** values, const size_t* sizes) :
-    ObjectBase(context), mContext(context), mKernelID(nullptr), mInvokeID(invokeID),
+    ObjectBase(context), mContext(context), mFunctionID((IDBase*)invokeID), mIsKernel(false),
     mReturnValue(nullptr), mParams(params), mParamLength(paramLength) {
     for (size_t i = 0; i < numValues; i++) {
         mGlobals[fieldIDs[i]] = make_pair(values[i], sizes[i]);
diff --git a/rsClosure.h b/rsClosure.h
index b14c2aa..d9e41a5 100644
--- a/rsClosure.h
+++ b/rsClosure.h
@@ -10,6 +10,7 @@
 
 class Allocation;
 class Context;
+class IDBase;
 class ObjectBase;
 class ScriptFieldID;
 class ScriptInvokeID;
@@ -48,12 +49,11 @@
 
     Context* mContext;
 
-    // If mKernelID is not null, this is a closure for a kernel. Otherwise, it is
-    // a closure for an invoke function, whose id is the next field. At least one
-    // of these fields has to be non-null.
-    const ObjectBaseRef<ScriptKernelID> mKernelID;
-    // TODO(yangni): ObjectBaseRef<ScriptInvokeID>
-    const ScriptInvokeID* mInvokeID;
+    // KernelID or InvokeID
+    const ObjectBaseRef<IDBase> mFunctionID;
+    // Flag indicating if this closure is for a kernel (true) or invocable
+    // function (false)
+    const bool mIsKernel;
 
     // Values referrenced in arguments and globals cannot be futures. They must be
     // either a known value or unbound value.
@@ -66,9 +66,6 @@
 
     Allocation* mReturnValue;
 
-    // All the other closures that this closure depends on
-    // set<const Closure*> mDependences;
-
     // All the other closures which this closure depends on for one of its
     // arguments, and the fields which it depends on.
     Map<const Closure*, Map<int, const ObjectBaseRef<ScriptFieldID>*>*> mArgDeps;