diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 5de964f..27243f9 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -29,6 +29,7 @@
 	rsCpuRuntimeMath.cpp \
 	rsCpuRuntimeStubs.cpp \
 	rsCpuScriptGroup.cpp \
+	rsCpuScriptGroup2.cpp \
 	rsCpuIntrinsic.cpp \
 	rsCpuIntrinsic3DLUT.cpp \
 	rsCpuIntrinsicBlend.cpp \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 47bc1c0..84c2416 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -17,6 +17,7 @@
 #include "rsCpuCore.h"
 #include "rsCpuScript.h"
 #include "rsCpuScriptGroup.h"
+#include "rsCpuScriptGroup2.h"
 
 #include <malloc.h>
 #include "rsContext.h"
@@ -660,11 +661,19 @@
     return i;
 }
 
-RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
-    CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
-    if (!sgi->init()) {
+void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {  // impl object as void*, or nullptr
+  switch (sg->getApiVersion()) {  // choose the implementation from the group's API version
+    case ScriptGroupBase::SG_V1: {
+      CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
+      if (!sgi->init()) {  // failed init: sgi is deleted and nullptr returned
         delete sgi;
         return nullptr;
+      }
+      return sgi;
     }
-    return sgi;
+    case ScriptGroupBase::SG_V2: {
+      return new CpuScriptGroup2Impl(this, sg);  // no init() call on the V2 path
+    }
+  }
+  return nullptr;  // reached when the version matches neither case
 }
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index bfd5e51..e069658 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -182,7 +182,7 @@
                                      uint32_t flags);
     virtual CpuScript * createIntrinsic(const Script *s,
                                         RsScriptIntrinsicID iid, Element *e);
-    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg);
+    virtual void* createScriptGroup(const ScriptGroupBase *sg);
 
     const RsdCpuReference::CpuSymbol *symLookup(const char *);
 
diff --git a/cpu_ref/rsCpuScriptGroup.cpp b/cpu_ref/rsCpuScriptGroup.cpp
index 751bafb..3d32a51 100644
--- a/cpu_ref/rsCpuScriptGroup.cpp
+++ b/cpu_ref/rsCpuScriptGroup.cpp
@@ -18,15 +18,13 @@
 #include "rsCpuScript.h"
 #include "rsScriptGroup.h"
 #include "rsCpuScriptGroup.h"
-//#include "rsdBcc.h"
-//#include "rsdAllocation.h"
 
 using namespace android;
 using namespace android::renderscript;
 
-CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg) {
+CpuScriptGroupImpl::CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroupBase *sg) {
     mCtx = ctx;
-    mSG = sg;
+    mSG = static_cast<const ScriptGroup*>(sg);  // downcast keeps const; sg is expected to be a V1 ScriptGroup
 }
 
 CpuScriptGroupImpl::~CpuScriptGroupImpl() {
diff --git a/cpu_ref/rsCpuScriptGroup.h b/cpu_ref/rsCpuScriptGroup.h
index 1a4af05..50ba2ac 100644
--- a/cpu_ref/rsCpuScriptGroup.h
+++ b/cpu_ref/rsCpuScriptGroup.h
@@ -30,7 +30,7 @@
     virtual void execute();
     virtual ~CpuScriptGroupImpl();
 
-    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroup *sg);
+    CpuScriptGroupImpl(RsdCpuReferenceImpl *ctx, const ScriptGroupBase *sg);
     bool init();
 
     static void scriptGroupRoot(const RsExpandKernelParams *p,
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
new file mode 100644
index 0000000..9dc4d90
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -0,0 +1,192 @@
+#include "rsCpuScriptGroup2.h"
+
+#include "cpu_ref/rsCpuCore.h"
+#include "rsClosure.h"
+#include "rsContext.h"
+#include "rsCpuCore.h"
+#include "rsCpuScript.h"
+#include "rsScript.h"
+#include "rsScriptGroup2.h"
+
+namespace android {
+namespace renderscript {
+
+namespace {
+
+static const size_t DefaultKernelArgCount = 2;  // initial length of groupRoot's ins/strides vectors
+
+// Batch kernel handed to the thread launcher: kparams->usr points at the batch
+// (list<CPUClosure*>) and every closure's kernel is run over [xstart, xend).
+// `outstep` is ignored; each closure gets its own output element size instead.
+void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
+               uint32_t xend, uint32_t outstep) {
+  const list<CPUClosure*>& closures = *(list<CPUClosure*>*)kparams->usr;
+  RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams;
+  const void **oldIns  = kparams->ins;
+  uint32_t *oldStrides = kparams->inEStrides;
+  std::vector<const void*> ins(DefaultKernelArgCount);
+  std::vector<uint32_t> strides(DefaultKernelArgCount);
+
+  for (CPUClosure* cpuClosure : closures) {
+    const Closure* closure = cpuClosure->mClosure;
+    // Grow the scratch arrays first: more args than slots would overrun them.
+    if (closure->mArgs.size() > ins.size()) {
+      ins.resize(closure->mArgs.size());
+      strides.resize(closure->mArgs.size());
+    }
+    auto in_iter = ins.begin();
+    auto stride_iter = strides.begin();
+    for (const auto& arg : closure->mArgs) {
+      const Allocation* a = (const Allocation*)arg;
+      const uint32_t eStride = a->mHal.state.elementSizeBytes;
+      const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
+          eStride * xstart;
+      if (kparams->dimY > 1) {
+        ptr += a->mHal.drvState.lod[0].stride * kparams->y;
+      }
+      *in_iter++ = ptr;
+      *stride_iter++ = eStride;
+    }
+    mutable_kparams->ins = &ins[0];
+    mutable_kparams->inEStrides = &strides[0];
+    const Allocation* out = closure->mReturnValue;
+    const uint32_t ostep = out->mHal.state.elementSizeBytes;
+    const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
+           ostep * xstart;
+    if (kparams->dimY > 1) {
+      ptr += out->mHal.drvState.lod[0].stride * kparams->y;
+    }
+    mutable_kparams->out = (void*)ptr;
+    mutable_kparams->usr = cpuClosure->mUsrPtr;
+    cpuClosure->mFunc(kparams, xstart, xend, ostep);
+  }
+  // Put back the saved ins/inEStrides and point usr at the batch again.
+  mutable_kparams->ins        = oldIns;
+  mutable_kparams->inEStrides = oldStrides;
+  mutable_kparams->usr        = &closures;
+}
+
+/*
+  Returns true if closure depends on any closure already in batch, either via
+  a global variable or via an argument dependency that has a non-null entry.
+  TODO: this probably should go into class Closure.
+ */
+bool conflict(const list<CPUClosure*> &batch, CPUClosure* closure) {
+  // Is `candidate` the Closure wrapped by some member of the batch?
+  const auto inBatch = [&batch](const Closure* candidate) -> bool {
+    for (const CPUClosure* member : batch) {
+      if (member->mClosure == candidate) return true;
+    }
+    return false;
+  };
+
+  // Any dependency through a global on a batch member is a conflict.
+  for (const auto &globalDep : closure->mClosure->mGlobalDeps) {
+    if (inBatch(globalDep.first)) return true;
+  }
+  // An argument dependency counts only when one of its entries is non-null.
+  for (const auto &argDep : closure->mClosure->mArgDeps) {
+    if (!inBatch(argDep.first)) continue;
+    for (const auto &entry : *argDep.second) {
+      if (entry.second != nullptr) return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+// Wraps each closure of the group in a CPUClosure and splits them, in order,
+// into batches; a closure conflicting with the current batch opens a new one.
+CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
+                                         const ScriptGroupBase *sg) :
+    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
+  list<CPUClosure*>* current = new list<CPUClosure*>();
+  for (Closure* closure: mGroup->mClosures) {
+    const ScriptKernelID* kid = closure->mKernelID.get();
+    auto* si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kid->mScript);
+    MTLaunchStruct mtls;
+    si->forEachKernelSetup(kid->mSlot, &mtls);
+    // TODO: Is mtls.fep.usrLen ever used?
+    CPUClosure* entry = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
+                                       mtls.fep.usr, mtls.fep.usrLen);
+    if (conflict(*current, entry)) {
+      mBatches.push_back(current);
+      current = new list<CPUClosure*>();
+    }
+    current->push_back(entry);
+  }
+  mBatches.push_back(current);
+}
+
+// Releases each CPUClosure and the heap-allocated batch list that held it.
+CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
+  for (list<CPUClosure*>* batch : mBatches) {
+    for (CPUClosure* c : *batch) delete c;
+    delete batch;  // the list itself was created with new by the constructor
+  }
+}
+
+void CpuScriptGroup2Impl::execute() {  // runs all batches, in list order
+  for (auto it = mBatches.begin(); it != mBatches.end(); ++it) {
+    setGlobalsForBatch(**it);  // bind this batch's globals before launching it
+    runBatch(**it);
+  }
+}
+
+// Pushes every global recorded on each closure of the batch into its script.
+void CpuScriptGroup2Impl::setGlobalsForBatch(const list<CPUClosure*>& batch) {
+  for (CPUClosure* cpuClosure : batch) {
+    const Closure* closure = cpuClosure->mClosure;
+    Script* script = closure->mKernelID.get()->mScript;
+    for (const auto& entry : closure->mGlobals) {
+      const void* value = entry.second.first;
+      int size = entry.second.second;
+      // Size -1 (any negative size here) marks an ObjectBase, not a primitive.
+      if (size >= 0) {
+        script->setVar(entry.first->mSlot, (const void*)&value, size);
+      } else {
+        script->setVarObj(entry.first->mSlot, (ObjectBase*)value);
+      }
+    }
+  }
+}
+
+// Runs one batch: preLaunch for every closure, one threaded launch of groupRoot
+// over the whole batch (set up from the first closure's arguments), then
+// postLaunch for every closure. An empty batch is a no-op.
+void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) {
+  if (batch.empty()) return;  // batch.front() below requires an element
+  for (CPUClosure* cpuClosure : batch) {
+    const Closure* closure = cpuClosure->mClosure;
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    cpuClosure->mSi->preLaunch(kernelID->mSlot,
+                               (const Allocation**)&closure->mArgs[0],
+                               closure->mArgs.size(), closure->mReturnValue,
+                               cpuClosure->mUsrPtr, cpuClosure->mUsrSize,
+                               nullptr);
+  }
+
+  const CPUClosure* cpuClosure = batch.front();
+  const Closure* closure = cpuClosure->mClosure;
+  MTLaunchStruct mtls;
+  cpuClosure->mSi->forEachMtlsSetup((const Allocation**)&closure->mArgs[0],
+                                    closure->mArgs.size(),
+                                    closure->mReturnValue,
+                                    nullptr, 0, nullptr, &mtls);
+  mtls.script = nullptr;
+  mtls.kernel = (void (*)())&groupRoot;
+  mtls.fep.usr = &batch;
+  mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+  for (CPUClosure* cpuClosure : batch) {
+    const Closure* closure = cpuClosure->mClosure;
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    cpuClosure->mSi->postLaunch(kernelID->mSlot,
+                                (const Allocation**)&closure->mArgs[0],
+                                closure->mArgs.size(), closure->mReturnValue,
+                                nullptr, 0, nullptr);
+  }
+}
+
+}  // namespace renderscript
+}  // namespace android
diff --git a/cpu_ref/rsCpuScriptGroup2.h b/cpu_ref/rsCpuScriptGroup2.h
new file mode 100644
index 0000000..6cb72a6
--- /dev/null
+++ b/cpu_ref/rsCpuScriptGroup2.h
@@ -0,0 +1,60 @@
+#ifndef CPU_REF_CPUSCRIPTGROUP2IMPL_H_
+#define CPU_REF_CPUSCRIPTGROUP2IMPL_H_
+
+#include <list>
+
+#include "rsd_cpu.h"
+
+using std::list;
+
+namespace android {
+namespace renderscript {
+
+class Closure;
+class RsdCpuScriptImpl;
+class RsdCpuReferenceImpl;
+class ScriptGroup2;
+
+struct RsExpandKernelParams;
+
+typedef void (*ExpandFuncTy)(const RsExpandKernelParams*, uint32_t, uint32_t,
+                             uint32_t);
+
+class CPUClosure {  // Bundles a Closure with the CPU-side data used to run it.
+ public:
+  CPUClosure(const Closure* closure, RsdCpuScriptImpl* si, ExpandFuncTy func,
+             const void* usrPtr, const size_t usrSize) :
+      mClosure(closure), mSi(si), mFunc(func), mUsrPtr(usrPtr),
+      mUsrSize(usrSize) {}
+
+  // It's important to use forwarding here rather than inheritance for unbound
+  // value binding to work.
+  const Closure* mClosure;  // wrapped closure; this class never deletes it
+  RsdCpuScriptImpl* mSi;  // script implementation used for pre/postLaunch
+  const ExpandFuncTy mFunc;  // kernel entry point called by groupRoot
+  const void* mUsrPtr;  // installed as kparams->usr before mFunc is called
+  const size_t mUsrSize;  // passed to preLaunch together with mUsrPtr
+};
+
+class CpuScriptGroup2Impl : public RsdCpuReference::CpuScriptGroup2 {
+ public:
+  CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, const ScriptGroupBase* group);
+  virtual ~CpuScriptGroup2Impl();
+
+  bool init();  // NOTE(review): no definition is visible in this change — confirm
+  virtual void execute();  // runs every batch, in list order
+
+ private:
+  void setGlobalsForBatch(const list<CPUClosure*>& batch);  // binds script globals
+  void runBatch(const list<CPUClosure*>& batch);  // pre/launch/post for one batch
+
+  RsdCpuReferenceImpl* mCpuRefImpl;
+  const ScriptGroup2* mGroup;  // the constructor's group argument, downcast
+
+  list<list<CPUClosure*>*> mBatches;  // heap-allocated batches, in execution order
+};
+
+}  // namespace renderscript
+}  // namespace android
+
+#endif  // CPU_REF_CPUSCRIPTGROUP2IMPL_H_
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index b0e924e..d886cef 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -45,7 +45,7 @@
 
 class ScriptC;
 class Script;
-class ScriptGroup;
+class ScriptGroupBase;
 class ScriptKernelID;
 
 
@@ -97,7 +97,13 @@
     };
     typedef CpuScript * (* script_lookup_t)(Context *, const Script *s);
 
-    class CpuScriptGroup {
+    class CpuScriptGroupBase {  // common base of CpuScriptGroup and CpuScriptGroup2
+     public:
+      virtual void execute() = 0;
+      virtual ~CpuScriptGroupBase() {}  // virtual, so deleting through the base is safe
+    };
+
+    class CpuScriptGroup : public CpuScriptGroupBase {
     public:
         virtual void setInput(const ScriptKernelID *kid, Allocation *) = 0;
         virtual void setOutput(const ScriptKernelID *kid, Allocation *) = 0;
@@ -105,6 +111,12 @@
         virtual ~CpuScriptGroup() {};
     };
 
+    class CpuScriptGroup2 : public CpuScriptGroupBase {
+     public:
+      virtual void execute() = 0;  // re-declares the base's pure virtual; adds nothing new
+      virtual ~CpuScriptGroup2() {}
+    };
+
     static Context * getTlsContext();
     static const Script * getTlsScript();
     static pthread_key_t getThreadTLSKey();
@@ -124,7 +136,7 @@
                                      uint8_t const *bitcode, size_t bitcodeSize,
                                      uint32_t flags) = 0;
     virtual CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) = 0;
-    virtual CpuScriptGroup * createScriptGroup(const ScriptGroup *sg) = 0;
+    virtual void* createScriptGroup(const ScriptGroupBase *sg) = 0;
     virtual bool getInForEach() = 0;
 
 #ifndef RS_COMPATIBILITY_LIB
