New Script Group API: runtime and cpu driver support.

Change-Id: I9c612cf8874aabaf0ca7d1640567464c71ed3070
diff --git a/Android.mk b/Android.mk
index d9af8cc..36f9730 100644
--- a/Android.mk
+++ b/Android.mk
@@ -141,6 +141,7 @@
 	rsAnimation.cpp \
 	rsComponent.cpp \
 	rsContext.cpp \
+	rsClosure.cpp \
 	rsCppUtils.cpp \
 	rsDevice.cpp \
 	rsElement.cpp \
@@ -167,6 +168,7 @@
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
 	rsScriptGroup.cpp \
+	rsScriptGroup2.cpp \
 	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
@@ -245,6 +247,7 @@
 	rsAnimation.cpp \
 	rsComponent.cpp \
 	rsContext.cpp \
+	rsClosure.cpp \
 	rsDevice.cpp \
 	rsElement.cpp \
 	rsFBOCache.cpp \
@@ -269,6 +272,7 @@
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
 	rsScriptGroup.cpp \
+	rsScriptGroup2.cpp \
 	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
diff --git a/cpp/rsDispatch.h b/cpp/rsDispatch.h
index 659591b..53d72d4 100644
--- a/cpp/rsDispatch.h
+++ b/cpp/rsDispatch.h
@@ -28,7 +28,9 @@
 typedef void (*DeviceSetConfigFnPtr) (RsDevice dev, RsDeviceParam p, int32_t value);
 typedef RsContext (*ContextCreateFnPtr)(RsDevice vdev, uint32_t version, uint32_t sdkVersion, RsContextType ct, uint32_t flags);
 typedef void (*GetNameFnPtr)(RsContext, void * obj, const char **name);
-
+typedef RsClosure (*ClosureCreateFnPtr)(RsContext, RsScriptKernelID, RsAllocation, RsScriptFieldID*, size_t, uintptr_t*, size_t, size_t*, size_t, RsClosure*, size_t, RsScriptFieldID*, size_t);
+typedef void (*ClosureSetArgFnPtr)(RsContext, RsClosure, uint32_t, uintptr_t, size_t);
+typedef void (*ClosureSetGlobalFnPtr)(RsContext, RsClosure, RsScriptFieldID, uintptr_t, size_t);
 typedef void (*ContextDestroyFnPtr) (RsContext);
 typedef RsMessageToClientType (*ContextGetMessageFnPtr) (RsContext, void*, size_t, size_t*, size_t, uint32_t*, size_t);
 typedef RsMessageToClientType (*ContextPeekMessageFnPtr) (RsContext, size_t*, size_t, uint32_t*, size_t);
@@ -80,6 +82,7 @@
 typedef RsScriptKernelID (*ScriptKernelIDCreateFnPtr) (RsContext, RsScript, int, int);
 typedef RsScriptFieldID (*ScriptFieldIDCreateFnPtr) (RsContext, RsScript, int);
 typedef RsScriptGroup (*ScriptGroupCreateFnPtr) (RsContext, RsScriptKernelID*, size_t, RsScriptKernelID*, size_t, RsScriptKernelID*, size_t, RsScriptFieldID*, size_t, const RsType*, size_t);
+typedef RsScriptGroup2 (*ScriptGroup2CreateFnPtr)(RsContext, RsClosure*, size_t);
 typedef void (*ScriptGroupSetOutputFnPtr) (RsContext, RsScriptGroup, RsScriptKernelID, RsAllocation);
 typedef void (*ScriptGroupSetInputFnPtr) (RsContext, RsScriptGroup, RsScriptKernelID, RsAllocation);
 typedef void (*ScriptGroupExecuteFnPtr) (RsContext, RsScriptGroup);
@@ -113,6 +116,9 @@
     AllocationCubeCreateFromBitmapFnPtr AllocationCubeCreateFromBitmap;
     AllocationGetSurfaceFnPtr AllocationGetSurface;
     AllocationSetSurfaceFnPtr AllocationSetSurface;
+    ClosureCreateFnPtr ClosureCreate;
+    ClosureSetArgFnPtr ClosureSetArg;
+    ClosureSetGlobalFnPtr ClosureSetGlobal;
     ContextFinishFnPtr ContextFinish;
     ContextDumpFnPtr ContextDump;
     ContextSetPriorityFnPtr ContextSetPriority;
@@ -152,6 +158,7 @@
     ScriptKernelIDCreateFnPtr ScriptKernelIDCreate;
     ScriptFieldIDCreateFnPtr ScriptFieldIDCreate;
     ScriptGroupCreateFnPtr ScriptGroupCreate;
+    ScriptGroup2CreateFnPtr ScriptGroup2Create;
     ScriptGroupSetOutputFnPtr ScriptGroupSetOutput;
     ScriptGroupSetInputFnPtr ScriptGroupSetInput;
     ScriptGroupExecuteFnPtr ScriptGroupExecute;
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 5de964f..27243f9 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -29,6 +29,7 @@
 	rsCpuRuntimeMath.cpp \
 	rsCpuRuntimeStubs.cpp \
 	rsCpuScriptGroup.cpp \
+	rsCpuScriptGroup2.cpp \
 	rsCpuIntrinsic.cpp \
 	rsCpuIntrinsic3DLUT.cpp \
 	rsCpuIntrinsicBlend.cpp \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 47bc1c0..84c2416 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -17,6 +17,7 @@
 #include "rsCpuCore.h"
 #include "rsCpuScript.h"
 #include "rsCpuScriptGroup.h"
+#include "rsCpuScriptGroup2.h"
 
 #include <malloc.h>
 #include "rsContext.h"
@@ -660,11 +661,19 @@
     return i;
 }
 
-RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
-    CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
-    if (!sgi->init()) {
+void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {
+  switch (sg->getApiVersion()) {
+    case ScriptGroupBase::SG_V1: {
+      CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
+      if (!sgi->init()) {
         delete sgi;
         return nullptr;
+      }
+      return sgi;
     }
-    return sgi;
+    case ScriptGroupBase::SG_V2: {
+      return new CpuScriptGroup2Impl(this, sg);
+    }
+  }
+  return nullptr;
 }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index bfd5e51..e069658 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -182,7 +182,7 @@
                                      uint32_t flags);
     virtual CpuScript * createIntrinsic(const Script *s,
                                         RsScriptIntrinsicID iid, Element *e);
-    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg);
+    virtual void* createScriptGroup(const ScriptGroupBase *sg);
 
     const RsdCpuReference::CpuSymbol *symLookup(const char *);
 
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 751bafb..3d32a51 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -18,15 +18,13 @@
 #include "rsCpuScript.h"
 #include "rsScriptGroup.h"
 #include "rsCpuScriptGroup.h"
-//#include "rsdBcc.h"
-//#include "rsdAllocation.h"
 
 using namespace android;
 using namespace android::renderscript;
 
-CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg) {
+CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroupBase *sg) {
     mCtx = ctx;
-    mSG = sg;
+    mSG = (ScriptGroup*)sg;
 }
 
 CpuScriptGroupImpl::~CpuScriptGroupImpl() {
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 1a4af05..50ba2ac 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -30,7 +30,7 @@
     virtual void execute();
     virtual ~CpuScriptGroupImpl();
 
-    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
+    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroupBase *sg);
     bool init();
 
     static void scriptGroupRoot(const RsExpandKernelParams *p,
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
new file mode 100644
index 0000000..9dc4d90
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -0,0 +1,192 @@
+#include "rsCpuScriptGroup2.h"
+
+#include "cpu_ref/rsCpuCore.h"
+#include "rsClosure.h"
+#include "rsContext.h"
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsScript.h"
+#include "rsScriptGroup2.h"
+
+namespace android {
+namespace renderscript {
+
+namespace {
+
+static const size_t DefaultKernelArgCount = 2;
+
+// Fused-batch kernel: runs every closure in the list passed via kparams->usr,
+// then restores the ins/inEStrides/usr fields it overrode (but not out).
+void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
+               uint32_t xend, uint32_t outstep) {
+  const list<CPUClosure*>& closures = *(list<CPUClosure*>*)kparams->usr;
+  RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
+  const void **oldIns  = kparams->ins;
+  uint32_t *oldStrides = kparams->inEStrides;
+  std::vector<const void*> ins(DefaultKernelArgCount);
+  std::vector<uint32_t> strides(DefaultKernelArgCount);
+
+  for (CPUClosure* cpuClosure : closures) {
+    const Closure* closure = cpuClosure->mClosure;
+    // Grow the scratch arrays first so the iterators below cannot overrun.
+    if (closure->mArgs.size() > ins.size()) {
+      ins.resize(closure->mArgs.size());
+      strides.resize(closure->mArgs.size());
+    }
+    auto in_iter = ins.begin();
+    auto stride_iter = strides.begin();
+    for (const auto& arg : closure->mArgs) {
+      const Allocation* a = (const Allocation*)arg;
+      const uint32_t eStride = a->mHal.state.elementSizeBytes;
+      const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
+          eStride * xstart;
+      if (kparams->dimY > 1) {
+        ptr += a->mHal.drvState.lod[0].stride * kparams->y;
+      }
+      *in_iter++ = ptr;
+      *stride_iter++ = eStride;
+    }
+    mutable_kparams->ins = &ins[0];
+    mutable_kparams->inEStrides = &strides[0];
+
+    const Allocation* out = closure->mReturnValue;
+    const uint32_t ostep = out->mHal.state.elementSizeBytes;
+    const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
+           ostep * xstart;
+    if (kparams->dimY > 1) {
+      ptr += out->mHal.drvState.lod[0].stride * kparams->y;
+    }
+    mutable_kparams->out = (void*)ptr;
+    mutable_kparams->usr = cpuClosure->mUsrPtr;
+    cpuClosure->mFunc(kparams, xstart, xend, ostep);
+  }
+
+  mutable_kparams->ins        = oldIns;
+  mutable_kparams->inEStrides = oldStrides;
+  mutable_kparams->usr        = &closures;
+}
+
+/*
+  Returns true if closure depends on any closure in batch, via a global
+  variable or via an argument that has a non-null field reference recorded.
+  TODO: this probably should go into class Closure.
+ */
+bool conflict(const list<CPUClosure*> &batch, CPUClosure* closure) {
+  // True when some member of batch wraps the given closure.
+  auto inBatch = [&batch](const Closure* dep) {
+    for (CPUClosure* member : batch) {
+      if (member->mClosure == dep) return true;
+    }
+    return false;
+  };
+
+  const Closure* self = closure->mClosure;
+  for (const auto &globalDep : self->mGlobalDeps) {
+    if (inBatch(globalDep.first)) return true;
+  }
+  // An argument dependence only conflicts if a field reference is recorded.
+  for (const auto &argDep : self->mArgDeps) {
+    if (!inBatch(argDep.first)) continue;
+    for (const auto &slot : *argDep.second) {
+      if (slot.second != nullptr) return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
+                                         const ScriptGroupBase *sg) :
+    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
+  list<CPUClosure*>* batch = new list<CPUClosure*>();  // batch being filled
+  for (Closure* closure: mGroup->mClosures) {  // partition closures into batches
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    RsdCpuScriptImpl* si =
+        (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript);
+
+    MTLaunchStruct mtls;  // scratch: only kernel and fep.usr/usrLen are read below
+    si->forEachKernelSetup(kernelID->mSlot, &mtls);
+    // TODO: Is mtls.fep.usrLen ever used?
+    CPUClosure* cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
+                                    mtls.fep.usr, mtls.fep.usrLen);
+    if (conflict(*batch, cc)) {  // cc depends on a batch member: start a new batch
+      mBatches.push_back(batch);
+      batch = new list<CPUClosure*>();
+    }
+    batch->push_back(cc);
+  }
+  mBatches.push_back(batch);  // last batch (empty if the group has no closures)
+}
+
+// Frees every CPUClosure and the heap-allocated batch lists that hold them.
+CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
+  for (list<CPUClosure*>* batch : mBatches) {
+    for (CPUClosure* c : *batch) delete c;
+    delete batch;  // the list itself was created with new in the constructor
+  }
+}
+
+void CpuScriptGroup2Impl::execute() {
+  for (auto it = mBatches.begin(); it != mBatches.end(); ++it) {
+    setGlobalsForBatch(**it);  // bind this batch's script globals first
+    runBatch(**it);            // then launch its kernels
+  }
+}
+
+// Writes each closure's recorded globals into the script of its kernel.
+void CpuScriptGroup2Impl::setGlobalsForBatch(const list<CPUClosure*>& batch) {
+  for (CPUClosure* member : batch) {
+    const Closure* bound = member->mClosure;
+    Script* target = bound->mKernelID.get()->mScript;
+    for (const auto& entry : bound->mGlobals) {
+      const void* value = entry.second.first;
+      const int size = entry.second.second;
+      // We use -1 size to indicate an ObjectBase rather than a primitive type
+      if (size >= 0) {
+        target->setVar(entry.first->mSlot, (const void*)&value, size);
+      } else {
+        target->setVarObj(entry.first->mSlot, (ObjectBase*)value);
+      }
+    }
+  }
+}
+
+// Runs |batch| as one launch of groupRoot between per-closure pre/postLaunch.
+void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) {
+  // A group with no closures still has one (empty) batch; front() would be UB.
+  if (batch.empty()) return;
+  for (CPUClosure* cpuClosure : batch) {
+    const Closure* closure = cpuClosure->mClosure;
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    cpuClosure->mSi->preLaunch(kernelID->mSlot,
+                               (const Allocation**)&closure->mArgs[0],
+                               closure->mArgs.size(), closure->mReturnValue,
+                               cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
+                               nullptr);
+  }
+
+  const CPUClosure* cpuClosure = batch.front();  // launch setup uses the first
+  const Closure* closure = cpuClosure->mClosure;
+  MTLaunchStruct mtls;
+  cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
+                                    closure->mArgs.size(),
+                                    closure->mReturnValue,
+                                    nullptr, 0, nullptr, &mtls);
+  mtls.script = nullptr;
+  mtls.kernel = (void (*)())&groupRoot;
+  mtls.fep.usr = &batch;  // presumably becomes kparams->usr in groupRoot
+  mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+
+  for (CPUClosure* cpuClosure : batch) {
+    const Closure* closure = cpuClosure->mClosure;
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    cpuClosure->mSi->postLaunch(kernelID->mSlot,
+                                (const Allocation**)&closure->mArgs[0],
+                                closure->mArgs.size(), closure->mReturnValue,
+                                nullptr, 0, nullptr);
+  }
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/cpu_ref/rsCpuScriptGroup2.h b/cpu_ref/rsCpuScriptGroup2.h
new file mode 100644
index 0000000..6cb72a6
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup2.h
@@ -0,0 +1,60 @@
+#ifndef CPU_REF_CPUSCRIPTGROUP2IMPL_H_
+#define CPU_REF_CPUSCRIPTGROUP2IMPL_H_
+
+#include <list>
+
+#include "rsd_cpu.h"
+
+using std::list;
+
+namespace android {
+namespace renderscript {
+
+class Closure;
+class RsdCpuScriptImpl;
+class RsdCpuReferenceImpl;
+class ScriptGroup2;
+
+struct RsExpandKernelParams;
+
+typedef void (*ExpandFuncTy)(const RsExpandKernelParams*, uint32_t, uint32_t,
+                             uint32_t);
+
+class CPUClosure {
+ public:
+  CPUClosure(const Closure* closure, RsdCpuScriptImpl* si, ExpandFuncTy func,
+             const void* usrPtr, const size_t usrSize) :
+      mClosure(closure), mSi(si), mFunc(func), mUsrPtr(usrPtr),
+      mUsrSize(usrSize) {}
+
+  // It's important to use forwarding here rather than inheritance for unbound
+  // value binding to work.
+  const Closure* mClosure;   // runtime closure this entry forwards to
+  RsdCpuScriptImpl* mSi;     // CPU script looked up for the closure's kernel
+  const ExpandFuncTy mFunc;  // expanded kernel entry point
+  const void* mUsrPtr;       // user data pointer handed to the kernel
+  const size_t mUsrSize;     // size recorded alongside mUsrPtr
+};
+
+class CpuScriptGroup2Impl : public RsdCpuReference::CpuScriptGroup2 {
+ public:
+  CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, const ScriptGroupBase* group);
+  virtual ~CpuScriptGroup2Impl();
+
+  bool init();  // NOTE(review): rsCpuScriptGroup2.cpp defines no init() — confirm
+  virtual void execute();  // runs every batch in order
+
+ private:
+  void setGlobalsForBatch(const list<CPUClosure*>& batch);  // bind script globals
+  void runBatch(const list<CPUClosure*>& batch);  // launch one batch
+
+  RsdCpuReferenceImpl* mCpuRefImpl;
+  const ScriptGroup2* mGroup;
+
+  list<list<CPUClosure*>*> mBatches;  // batches in execution order
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // CPU_REF_CPUSCRIPTGROUP2IMPL_H_
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index b0e924e..d886cef 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -45,7 +45,7 @@
 
 class ScriptC;
 class Script;
-class ScriptGroup;
+class ScriptGroupBase;
 class ScriptKernelID;
 
 
@@ -97,7 +97,13 @@
     };
     typedef CpuScript * (* script_lookup_t)(Context *, const Script *s);
 
-    class CpuScriptGroup {
+    class CpuScriptGroupBase {
+     public:
+      virtual void execute() = 0;
+      virtual ~CpuScriptGroupBase() {}
+    };
+
+    class CpuScriptGroup : public CpuScriptGroupBase {
     public:
         virtual void setInput(const ScriptKernelID *kid, Allocation *) = 0;
         virtual void setOutput(const ScriptKernelID *kid, Allocation *) = 0;
@@ -105,6 +111,12 @@
         virtual ~CpuScriptGroup() {};
     };
 
+    class CpuScriptGroup2 : public CpuScriptGroupBase {
+     public:
+      virtual void execute() = 0;
+      virtual ~CpuScriptGroup2() {}
+    };
+
     static Context * getTlsContext();
     static const Script * getTlsScript();
     static pthread_key_t getThreadTLSKey();
@@ -124,7 +136,7 @@
                                      uint8_t const *bitcode, size_t bitcodeSize,
                                      uint32_t flags) = 0;
     virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
-    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg) = 0;
+    virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
     virtual bool getInForEach() = 0;
 
 #ifndef RS_COMPATIBILITY_LIB
diff --git a/driver/rsdScriptGroup.cpp b/driver/rsdScriptGroup.cpp
index a7b2e77..ed800a3 100644
--- a/driver/rsdScriptGroup.cpp
+++ b/driver/rsdScriptGroup.cpp
@@ -28,7 +28,7 @@
 using namespace android::renderscript;
 
 
-bool rsdScriptGroupInit(const Context *rsc, ScriptGroup *sg) {
+bool rsdScriptGroupInit(const Context *rsc, ScriptGroupBase *sg) {
     RsdHal *dc = (RsdHal *)rsc->mHal.drv;
 
     sg->mHal.drv = dc->mCpuRef->createScriptGroup(sg);
@@ -43,13 +43,15 @@
                              const ScriptKernelID *kid, Allocation *) {
 }
 
-void rsdScriptGroupExecute(const Context *rsc, const ScriptGroup *sg) {
-    RsdCpuReference::CpuScriptGroup *sgi = (RsdCpuReference::CpuScriptGroup *)sg->mHal.drv;
+void rsdScriptGroupExecute(const Context *rsc, const ScriptGroupBase *sg) {
+    RsdCpuReference::CpuScriptGroupBase *sgi =
+        (RsdCpuReference::CpuScriptGroupBase *)sg->mHal.drv;
     sgi->execute();
 }
 
-void rsdScriptGroupDestroy(const Context *rsc, const ScriptGroup *sg) {
-    RsdCpuReference::CpuScriptGroup *sgi = (RsdCpuReference::CpuScriptGroup *)sg->mHal.drv;
+void rsdScriptGroupDestroy(const Context *rsc, const ScriptGroupBase *sg) {
+    RsdCpuReference::CpuScriptGroupBase *sgi =
+        (RsdCpuReference::CpuScriptGroupBase *)sg->mHal.drv;
     delete sgi;
 }
 
@@ -68,5 +70,3 @@
     obj->v2 = nullptr;
 #endif
 }
-
-
diff --git a/driver/rsdScriptGroup.h b/driver/rsdScriptGroup.h
index db44e23..95e5d19 100644
--- a/driver/rsdScriptGroup.h
+++ b/driver/rsdScriptGroup.h
@@ -20,7 +20,7 @@
 #include <rs_hal.h>
 
 bool rsdScriptGroupInit(const android::renderscript::Context *rsc,
-                        android::renderscript::ScriptGroup *sg);
+                        android::renderscript::ScriptGroupBase *sg);
 void rsdScriptGroupSetInput(const android::renderscript::Context *rsc,
                             const android::renderscript::ScriptGroup *sg,
                             const android::renderscript::ScriptKernelID *kid,
@@ -30,9 +30,9 @@
                              const android::renderscript::ScriptKernelID *kid,
                              android::renderscript::Allocation *);
 void rsdScriptGroupExecute(const android::renderscript::Context *rsc,
-                           const android::renderscript::ScriptGroup *sg);
+                           const android::renderscript::ScriptGroupBase *sg);
 void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
-                           const android::renderscript::ScriptGroup *sg);
+                           const android::renderscript::ScriptGroupBase *sg);
 void rsdScriptGroupUpdateCachedObject(const android::renderscript::Context *rsc,
                                       const android::renderscript::ScriptGroup *sg,
                                       android::renderscript::rs_script_group *obj);
diff --git a/rs.spec b/rs.spec
index 18ece8c..22c53b9 100644
--- a/rs.spec
+++ b/rs.spec
@@ -252,6 +252,31 @@
     param uint32_t srcMip
     }
 
+ClosureCreate {
+    direct
+    param RsScriptKernelID kernelID
+    param RsAllocation returnValue
+    param RsScriptFieldID * fieldIDs
+    param uintptr_t * values
+    param size_t * sizes
+    param RsClosure * depClosures
+    param RsScriptFieldID * depFieldIDs
+    ret RsClosure
+    }
+
+ClosureSetArg {
+  param RsClosure closureID
+  param uint32_t index
+  param uintptr_t value
+  param size_t valueSize
+}
+
+ClosureSetGlobal {
+  param RsClosure closureID
+  param RsScriptFieldID fieldID
+  param uintptr_t value
+  param size_t valueSize
+}
 
 SamplerCreate {
     direct
@@ -410,6 +435,12 @@
     param RsScriptGroup group
 }
 
+ScriptGroup2Create{
+    direct
+    param RsClosure * closures
+    ret RsScriptGroup2
+}
+
 AllocationIoSend {
     param RsAllocation alloc
     }
diff --git a/rsClosure.cpp b/rsClosure.cpp
new file mode 100644
index 0000000..8530fc1
--- /dev/null
+++ b/rsClosure.cpp
@@ -0,0 +1,147 @@
+#include "rsClosure.h"
+
+#include "cpu_ref/rsCpuCore.h"
+#include "rsContext.h" // XXX: necessary to avoid compiler error on rsScript.h below
+#include "rsScript.h"
+#include "rsType.h"
+
+namespace android {
+namespace renderscript {
+
+RsClosure rsi_ClosureCreate(Context* context, RsScriptKernelID kernelID,
+                            RsAllocation returnValue,
+                            RsScriptFieldID* fieldIDs, size_t fieldIDs_length,
+                            uintptr_t* values, size_t values_length,
+                            size_t* sizes, size_t sizes_length,
+                            RsClosure* depClosures, size_t depClosures_length,
+                            RsScriptFieldID* depFieldIDs,
+                            size_t depFieldIDs_length) {
+  rsAssert(fieldIDs_length == values_length && values_length == sizes_length &&
+           sizes_length == depClosures_length &&
+           depClosures_length == depFieldIDs_length);  // five parallel arrays
+
+  return (RsClosure)(new Closure(  // opaque handles are cast to runtime types
+      context, (const ScriptKernelID*)kernelID, (Allocation*)returnValue,
+      fieldIDs_length, (const ScriptFieldID**)fieldIDs, (const void**)values,
+      sizes, (const Closure**)depClosures,
+      (const ScriptFieldID**)depFieldIDs));
+}
+
+void rsi_ClosureEval(Context* rsc, RsClosure closure) {
+  static_cast<Closure*>(closure)->eval();  // RsClosure is an opaque void*
+}
+
+void rsi_ClosureSetArg(Context* rsc, RsClosure closure, uint32_t index,
+                       uintptr_t value, size_t size) {
+  static_cast<Closure*>(closure)->setArg(index, (const void*)value, size);
+}
+
+void rsi_ClosureSetGlobal(Context* rsc, RsClosure closure,
+                          RsScriptFieldID fieldID, uintptr_t value,
+                          size_t size) {
+  const ScriptFieldID* field = static_cast<const ScriptFieldID*>(fieldID);
+  static_cast<Closure*>(closure)->setGlobal(field, (const void*)value, size);
+}
+
+Closure::Closure(Context* context,
+                 const ScriptKernelID* kernelID,
+                 Allocation* returnValue,
+                 const int numValues,
+                 const ScriptFieldID** fieldIDs,
+                 const void** values,
+                 const size_t* sizes,
+                 const Closure** depClosures,
+                 const ScriptFieldID** depFieldIDs) :
+    ObjectBase(context), mContext(context), mKernelID((ScriptKernelID*)kernelID),
+    mReturnValue(returnValue) {
+  size_t i;  // shared across the loops below; its value carries over
+
+  for (i = 0; i < (size_t)numValues && fieldIDs[i] == nullptr; i++);  // count args
+
+  vector<const void*> args(values, values + i);  // leading entries with no field ID
+  mArgs.swap(args);
+
+  for (; i < (size_t)numValues; i++) {  // remaining entries are globals
+    mGlobals[fieldIDs[i]] = std::make_pair(values[i], sizes[i]);
+  }
+
+  mDependences.insert(depClosures, depClosures + numValues);  // may include nullptr
+
+  for (i = 0; i < mArgs.size(); i++) {  // dependences for argument slots
+    const Closure* dep = depClosures[i];
+    if (dep != nullptr) {
+      auto mapping = mArgDeps[dep];  // operator[] inserts nullptr on first use
+      if (mapping == nullptr) {
+        mapping = new map<int, const ObjectBaseRef<ScriptFieldID>*>();
+        mArgDeps[dep] = mapping;
+      }
+      (*mapping)[i] = new ObjectBaseRef<ScriptFieldID>(
+          const_cast<ScriptFieldID*>(depFieldIDs[i]));
+    }
+  }
+
+  for (; i < (size_t)numValues; i++) {  // dependences for globals
+    const Closure* dep = depClosures[i];
+    if (dep != nullptr) {
+      auto mapping = mGlobalDeps[dep];  // operator[] inserts nullptr on first use
+      if (mapping == nullptr) {
+        mapping = new map<const ObjectBaseRef<ScriptFieldID>*,
+            const ObjectBaseRef<ScriptFieldID>*>();
+        mGlobalDeps[dep] = mapping;
+      }
+      (*mapping)[new ObjectBaseRef<ScriptFieldID>(
+          const_cast<ScriptFieldID*>(fieldIDs[i]))] =
+          new ObjectBaseRef<ScriptFieldID>(
+              const_cast<ScriptFieldID*>(depFieldIDs[i]));
+    }
+  }
+}
+
+Closure::~Closure() {
+  // Each dependency table and the ObjectBaseRefs in it came from the constructor.
+  for (const auto& dep : mArgDeps) {
+    for (const auto& slot : *dep.second) {
+      delete slot.second;
+    }
+    delete dep.second;
+  }
+
+  // Global tables own both their key and their value references.
+  for (const auto& dep : mGlobalDeps) {
+    for (const auto& link : *dep.second) {
+      delete link.first;   // key: field of this closure
+      delete link.second;  // value: field of the closure depended on
+    }
+    delete dep.second;
+  }
+}
+
+// Applies the recorded globals to the kernel's script, then runs the kernel.
+void Closure::eval() {
+  Script *script = mKernelID->mScript;
+  for (const auto& entry : mGlobals) {
+    const void* value = entry.second.first;
+    const int size = entry.second.second;
+    // We use -1 size to indicate an ObjectBase rather than a primitive type
+    if (size >= 0) {
+      script->setVar(entry.first->mSlot, (const void*)&value, size);
+    } else {
+      script->setVarObj(entry.first->mSlot, (ObjectBase*)value);
+    }
+  }
+  const Allocation** inputs = (const Allocation **)(&mArgs[0]);
+  script->runForEach(mContext, mKernelID->mSlot, inputs, mArgs.size(),
+                     mReturnValue, nullptr, 0, nullptr);
+}
+
+void Closure::setArg(const uint32_t index, const void* value, const size_t size) {
+  if (index < mArgs.size()) mArgs[index] = value;  // ignore out-of-range index
+}
+
+void Closure::setGlobal(const ScriptFieldID* fieldID, const void* value,
+                        const size_t size) {
+  mGlobals[fieldID] = pair<const void*, int>(value, static_cast<int>(size));
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/rsClosure.h b/rsClosure.h
new file mode 100644
index 0000000..372cd32
--- /dev/null
+++ b/rsClosure.h
@@ -0,0 +1,78 @@
+#ifndef ANDROID_RENDERSCRIPT_CLOSURE_H_
+#define ANDROID_RENDERSCRIPT_CLOSURE_H_
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "rsDefines.h"
+#include "rsObjectBase.h"
+
+namespace android {
+namespace renderscript {
+
+using std::map;
+using std::pair;
+using std::set;
+using std::vector;
+
+class Allocation;
+class Context;
+class ScriptFieldID;
+class ScriptKernelID;
+class Type;
+
+class Closure : public ObjectBase {
+ public:
+  Closure(Context* context,
+          const ScriptKernelID* kernelID,
+          Allocation* returnValue,
+          const int numValues,
+          const ScriptFieldID** fieldIDs,
+          const void** values,  // Allocations or primitive (numeric) types
+          const size_t* sizes,   // size of each value; -1 indicates an allocation.
+          const Closure** depClosures,
+          const ScriptFieldID** depFieldIDs);
+
+  virtual ~Closure();
+
+  virtual void serialize(Context *rsc, OStream *stream) const {}
+
+  virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_CLOSURE; }
+
+  void eval();  // sets the recorded globals on the script, then runs the kernel
+
+  void setArg(const uint32_t index, const void* value, const size_t size);
+  void setGlobal(const ScriptFieldID* fieldID, const void* value,
+                 const size_t size);
+
+  Context* mContext;
+  const ObjectBaseRef<ScriptKernelID> mKernelID;
+
+  // Values referenced in arguments and globals cannot be futures. They must be
+  // either a known value or an unbound value.
+  // For now, all arguments should be Allocations.
+  vector<const void*> mArgs;
+
+  // A global could be an allocation or any primitive data type.
+  map<const ScriptFieldID*, pair<const void*, int>> mGlobals;
+
+  Allocation* mReturnValue;
+
+  // All the other closures that this closure depends on
+  set<const Closure*> mDependences;
+
+  // All the other closures which this closure depends on for one of its
+  // arguments, and the fields which it depends on.
+  map<const Closure*, map<int, const ObjectBaseRef<ScriptFieldID>*>*> mArgDeps;
+
+  // All the other closures that this closure depends on for one of its fields,
+  // and the fields that it depends on.
+  map<const Closure*, map<const ObjectBaseRef<ScriptFieldID>*,
+                          const ObjectBaseRef<ScriptFieldID>*>*> mGlobalDeps;
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_CLOSURE_H_
diff --git a/rsDefines.h b/rsDefines.h
index 0a91ea8..9345eb9 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -33,6 +33,7 @@
 typedef void * RsAdapter2D;
 typedef void * RsAllocation;
 typedef void * RsAnimation;
+typedef void * RsClosure;
 typedef void * RsContext;
 typedef void * RsDevice;
 typedef void * RsElement;
@@ -44,6 +45,7 @@
 typedef void * RsScriptFieldID;
 typedef void * RsScriptMethodID;
 typedef void * RsScriptGroup;
+typedef void * RsScriptGroup2;
 typedef void * RsMesh;
 typedef void * RsPath;
 typedef void * RsType;
@@ -251,7 +253,3 @@
 #endif
 
 #endif // RENDER_SCRIPT_DEFINES_H
-
-
-
-
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index 6f14637..3fe6942 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -294,6 +294,10 @@
             break;
         case RS_A3D_CLASS_ID_SCRIPT_GROUP:
             break;
+        case RS_A3D_CLASS_ID_CLOSURE:
+            break;
+        case RS_A3D_CLASS_ID_SCRIPT_GROUP2:
+            break;
     }
     if (entry->mRsObj) {
         entry->mRsObj->incUserRef();
diff --git a/rsInternalDefines.h b/rsInternalDefines.h
index 19f2122..57cb72a 100644
--- a/rsInternalDefines.h
+++ b/rsInternalDefines.h
@@ -163,7 +163,9 @@
     RS_A3D_CLASS_ID_SCRIPT_KERNEL_ID,
     RS_A3D_CLASS_ID_SCRIPT_FIELD_ID,
     RS_A3D_CLASS_ID_SCRIPT_METHOD_ID,
-    RS_A3D_CLASS_ID_SCRIPT_GROUP
+    RS_A3D_CLASS_ID_SCRIPT_GROUP,
+    RS_A3D_CLASS_ID_CLOSURE,
+    RS_A3D_CLASS_ID_SCRIPT_GROUP2
 };
 
 enum RsCullMode {
@@ -202,7 +204,3 @@
 #endif
 
 #endif // RENDER_SCRIPT_DEFINES_H
-
-
-
-
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
index 618c28c..791ab14 100644
--- a/rsScriptGroup.cpp
+++ b/rsScriptGroup.cpp
@@ -14,15 +14,18 @@
  * limitations under the License.
  */
 
-#include <algorithm>
+#include "rsScriptGroup.h"
 
 #include "rsContext.h"
+#include "rsScriptGroup2.h"
+
+#include <algorithm>
 #include <time.h>
 
 using namespace android;
 using namespace android::renderscript;
 
-ScriptGroup::ScriptGroup(Context *rsc) : ObjectBase(rsc) {
+ScriptGroup::ScriptGroup(Context *rsc) : ScriptGroupBase(rsc) {
 }
 
 ScriptGroup::~ScriptGroup() {
@@ -270,12 +273,10 @@
 }
 
 void ScriptGroup::execute(Context *rsc) {
-
     if (!validateInputAndOutput(rsc)) {
         return;
     }
 
-    //ALOGE("ScriptGroup::execute");
     if (rsc->mHal.funcs.scriptgroup.execute) {
         rsc->mHal.funcs.scriptgroup.execute(rsc, this);
         return;
@@ -324,13 +325,6 @@
 
 }
 
-void ScriptGroup::serialize(Context *rsc, OStream *stream) const {
-}
-
-RsA3DClassID ScriptGroup::getClassId() const {
-    return RS_A3D_CLASS_ID_SCRIPT_GROUP;
-}
-
 ScriptGroup::Link::Link() {
 }
 
@@ -371,7 +365,7 @@
 }
 
 void rsi_ScriptGroupExecute(Context *rsc, RsScriptGroup sg) {
-    ScriptGroup *s = (ScriptGroup *)sg;
+    ScriptGroupBase *s = (ScriptGroupBase *)sg;
     s->execute(rsc);
 }
 
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
index 974e3ba..ff0259a 100644
--- a/rsScriptGroup.h
+++ b/rsScriptGroup.h
@@ -17,21 +17,30 @@
 #ifndef ANDROID_RS_SCRIPT_GROUP_H
 #define ANDROID_RS_SCRIPT_GROUP_H
 
-#include "rsAllocation.h"
-#include "rsScript.h"
+#include "rsScriptGroupBase.h"
 
+#include <vector>
 
 // ---------------------------------------------------------------------------
 namespace android {
 namespace renderscript {
 
+class Allocation;
+class Context;
 class ProgramVertex;
 class ProgramFragment;
 class ProgramRaster;
 class ProgramStore;
+class Script;
+class ScriptFieldID;
+class ScriptKernelID;
+class Type;
 
-class ScriptGroup : public ObjectBase {
+class ScriptGroup : public ScriptGroupBase {
 public:
+    virtual SG_API_Version getApiVersion() const { return SG_V1; }
+    virtual void execute(Context *rsc);
+
     std::vector<ObjectBaseRef<ScriptKernelID> > mKernels;
 
     class Link {
@@ -70,15 +79,6 @@
     std::vector<IO *> mInputs;
     std::vector<IO *> mOutputs;
 
-    struct Hal {
-        void * drv;
-
-        struct DriverInfo {
-        };
-        DriverInfo info;
-    };
-    Hal mHal;
-
     static ScriptGroup * create(Context *rsc,
                            ScriptKernelID ** kernels, size_t kernelsSize,
                            ScriptKernelID ** src, size_t srcSize,
@@ -86,14 +86,9 @@
                            ScriptFieldID ** dstF, size_t dstFSize,
                            const Type ** type, size_t typeSize);
 
-    virtual void serialize(Context *rsc, OStream *stream) const;
-    virtual RsA3DClassID getClassId() const;
-
-    void execute(Context *rsc);
     void setInput(Context *rsc, ScriptKernelID *kid, Allocation *a);
     void setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a);
 
-
 protected:
     virtual ~ScriptGroup();
     bool mInitialized;
diff --git a/rsScriptGroup2.cpp b/rsScriptGroup2.cpp
new file mode 100644
index 0000000..06a252a
--- /dev/null
+++ b/rsScriptGroup2.cpp
@@ -0,0 +1,27 @@
+#include "rsScriptGroup2.h"
+
+#include "rsContext.h"
+
+namespace android {
+namespace renderscript {
+
+void ScriptGroup2::execute(Context* rsc) {
+  // Without a driver-provided execute hook there is nothing to run.
+  if (!rsc->mHal.funcs.scriptgroup.execute) return;
+  rsc->mHal.funcs.scriptgroup.execute(rsc, this);
+}
+
+// Allocates a ScriptGroup2 over the given closures and offers it to the driver.
+RsScriptGroup2 rsi_ScriptGroup2Create(Context* rsc, RsClosure* closures,
+                                      size_t numClosures) {
+  Closure** closureList = (Closure**)closures;
+  ScriptGroup2* sg = new ScriptGroup2(rsc, closureList, numClosures);
+  // The init hook is optional; it lets the driver create its device-specific implementation.
+  if (rsc->mHal.funcs.scriptgroup.init) {
+    rsc->mHal.funcs.scriptgroup.init(rsc, sg);
+  }
+  return sg;
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/rsScriptGroup2.h b/rsScriptGroup2.h
new file mode 100644
index 0000000..c759faf
--- /dev/null
+++ b/rsScriptGroup2.h
@@ -0,0 +1,36 @@
+#ifndef ANDROID_RENDERSCRIPT_SCRIPTGROUP2_H_
+#define ANDROID_RENDERSCRIPT_SCRIPTGROUP2_H_
+
+#include "rsScriptGroupBase.h"
+
+#include <list>
+
+namespace android {
+namespace renderscript {
+
+class Closure;
+class Context;
+
+class ScriptGroup2 : public ScriptGroupBase {  // script group built from a list of closures
+ public:
+  /*
+    The constructor stores a copy of the numClosures pointers in closures.
+    TODO: inputs and outputs are set and retrieved in the Java runtime and
+    are opaque to the C++ runtime.
+    For better compiler optimizations (of a script group), we need to include
+    input and output information in the C++ runtime.
+   */
+  ScriptGroup2(Context* rsc, Closure** closures, size_t numClosures) :
+      ScriptGroupBase(rsc), mClosures(closures, closures + numClosures) {}
+  virtual ~ScriptGroup2() {}
+  // Reports SG_V2, which distinguishes this class from the SG_V1 ScriptGroup.
+  virtual SG_API_Version getApiVersion() const { return SG_V2; }
+  virtual void execute(Context* rsc);
+  // Closure pointers in the order they were passed to the constructor.
+  std::list<Closure*> mClosures;
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RENDERSCRIPT_SCRIPTGROUP2_H_
diff --git a/rsScriptGroupBase.h b/rsScriptGroupBase.h
new file mode 100644
index 0000000..00ae6c6
--- /dev/null
+++ b/rsScriptGroupBase.h
@@ -0,0 +1,41 @@
+#ifndef ANDROID_RS_SCRIPT_GROUP_BASE_H
+#define ANDROID_RS_SCRIPT_GROUP_BASE_H
+
+#include "rsObjectBase.h"
+
+namespace android {
+namespace renderscript {
+
+class ScriptGroupBase : public ObjectBase {  // common base of ScriptGroup and ScriptGroup2
+ public:
+  ScriptGroupBase(Context* rsc) : ObjectBase(rsc), mHal() {}  // mHal() zero-initializes drv
+  virtual ~ScriptGroupBase() {}
+  // Serialization is a no-op for script groups.
+  virtual void serialize(Context *rsc, OStream *stream) const {}
+  // All script groups report the same A3D class id.
+  virtual RsA3DClassID getClassId() const {
+    return RS_A3D_CLASS_ID_SCRIPT_GROUP;
+  }
+  // Values returned by getApiVersion() to distinguish the concrete group classes.
+  enum SG_API_Version {
+    SG_V1 = 10,
+    SG_V2 = 20,
+  };
+  // Both must be supplied by the concrete group class.
+  virtual void execute(Context *rsc) = 0;
+  virtual SG_API_Version getApiVersion() const = 0;
+  // HAL slot; drv is an opaque pointer (presumably owned by the driver -- confirm).
+  struct Hal {
+    void * drv;
+
+    struct DriverInfo {
+    };
+    DriverInfo info;
+  };
+  Hal mHal;
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // ANDROID_RS_SCRIPT_GROUP_BASE_H
diff --git a/rs_hal.h b/rs_hal.h
index 419827b..b3c2e39 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -35,6 +35,7 @@
 class ScriptMethodID;
 class ScriptC;
 class ScriptGroup;
+class ScriptGroupBase;
 class Path;
 class Program;
 class ProgramStore;
@@ -300,13 +301,13 @@
     } framebuffer;
 
     struct {
-        bool (*init)(const Context *rsc, ScriptGroup *sg);
+        bool (*init)(const Context *rsc, ScriptGroupBase *sg);  // init/execute/destroy accept any ScriptGroupBase
         void (*setInput)(const Context *rsc, const ScriptGroup *sg,
                          const ScriptKernelID *kid, Allocation *);
         void (*setOutput)(const Context *rsc, const ScriptGroup *sg,
                           const ScriptKernelID *kid, Allocation *);
-        void (*execute)(const Context *rsc, const ScriptGroup *sg);
-        void (*destroy)(const Context *rsc, const ScriptGroup *sg);
+        void (*execute)(const Context *rsc, const ScriptGroupBase *sg);
+        void (*destroy)(const Context *rsc, const ScriptGroupBase *sg);
         void (*updateCachedObject)(const Context *rsc, const ScriptGroup *sg, rs_script_group *obj);
     } scriptgroup;