Add backend for script groups.

Change-Id: If2fdbde7381fcdaeb54d41a913b855fd83d4f186
diff --git a/Android.mk b/Android.mk
index 8e32c2c..fa6d277 100644
--- a/Android.mk
+++ b/Android.mk
@@ -36,6 +36,7 @@
 	driver/rsdRuntimeMath.cpp \
 	driver/rsdRuntimeStubs.cpp \
 	driver/rsdSampler.cpp \
+	driver/rsdScriptGroup.cpp \
 	driver/rsdShader.cpp \
 	driver/rsdShaderCache.cpp \
 	driver/rsdVertexArray.cpp
@@ -151,6 +152,7 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptGroup.cpp \
 	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
@@ -242,6 +244,7 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptGroup.cpp \
 	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 8240b10..8956b2e 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -16,7 +16,6 @@
 
 
 #include "rsdCore.h"
-#include "rsdBcc.h"
 #include "rsdRuntime.h"
 #include "rsdAllocation.h"
 #include "rsdFrameBufferObj.h"
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 35d6f3b..baf3e7a 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -15,16 +15,17 @@
  */
 
 #include "rsdCore.h"
-#include "rsdBcc.h"
-#include "rsdRuntime.h"
-#include "rsdAllocation.h"
-#include "rsdIntrinsics.h"
 
 #include <bcc/BCCContext.h>
 #include <bcc/Renderscript/RSCompilerDriver.h>
 #include <bcc/Renderscript/RSExecutable.h>
 #include <bcc/Renderscript/RSInfo.h>
 
+#include "rsdBcc.h"
+#include "rsdRuntime.h"
+#include "rsdAllocation.h"
+#include "rsdIntrinsics.h"
+
 #include "rsContext.h"
 #include "rsElement.h"
 #include "rsScriptC.h"
@@ -36,26 +37,6 @@
 using namespace android;
 using namespace android::renderscript;
 
-struct DrvScript {
-    RsScriptIntrinsicID mIntrinsicID;
-    int (*mRoot)();
-    int (*mRootExpand)();
-    void (*mInit)();
-    void (*mFreeChildren)();
-
-    bcc::BCCContext *mCompilerContext;
-    bcc::RSCompilerDriver *mCompilerDriver;
-    bcc::RSExecutable *mExecutable;
-
-    Allocation **mBoundAllocs;
-    RsdIntriniscFuncs_t mIntrinsicFuncs;
-    void * mIntrinsicData;
-};
-
-typedef void (*outer_foreach_t)(
-    const android::renderscript::RsForEachStubParamStruct *,
-    uint32_t x1, uint32_t x2,
-    uint32_t instep, uint32_t outstep);
 
 static Script * setTLS(Script *sc) {
     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(rsdgThreadTLSKey);
@@ -188,28 +169,6 @@
     return false;
 }
 
-typedef struct {
-    RsForEachStubParamStruct fep;
-
-    Context *rsc;
-    Script *script;
-    ForEachFunc_t kernel;
-    uint32_t sig;
-    const Allocation * ain;
-    Allocation * aout;
-
-    uint32_t mSliceSize;
-    volatile int mSliceNum;
-
-    uint32_t xStart;
-    uint32_t xEnd;
-    uint32_t yStart;
-    uint32_t yEnd;
-    uint32_t zStart;
-    uint32_t zEnd;
-    uint32_t arrayStart;
-    uint32_t arrayEnd;
-} MTLaunchStruct;
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
 static void wc_xy(void *usr, uint32_t idx) {
@@ -265,6 +224,145 @@
     }
 }
 
+void rsdScriptInvokeForEachMtlsSetup(const Context *rsc,
+                                     const Allocation * ain,
+                                     Allocation * aout,
+                                     const void * usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall *sc,
+                                     MTLaunchStruct *mtls) {
+    // Prepares *mtls for one forEach launch; script, kernel and sig stay zero for the caller.
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+    // No status is returned: an early return below leaves the later fields (e.g. rsc) zeroed.
+    if (ain) {
+        mtls->fep.dimX = ain->getType()->getDimX();
+        mtls->fep.dimY = ain->getType()->getDimY();
+        mtls->fep.dimZ = ain->getType()->getDimZ();
+        //mtls->dimArray = ain->getType()->getDimArray();
+    } else if (aout) {
+        mtls->fep.dimX = aout->getType()->getDimX();
+        mtls->fep.dimY = aout->getType()->getDimY();
+        mtls->fep.dimZ = aout->getType()->getDimZ();
+        //mtls->dimArray = aout->getType()->getDimArray();
+    } else {
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+    // X range: the full dimX without call params (or with xEnd == 0), else clamped to dimX.
+    if (!sc || (sc->xEnd == 0)) {
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;
+    }
+    // Y range: same rule as X, applied to dimY / yStart / yEnd.
+    if (!sc || (sc->yEnd == 0)) {
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    }
+    // Every end bound is raised to at least 1.
+    mtls->xEnd = rsMax((uint32_t)1, mtls->xEnd);
+    mtls->yEnd = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+    // Asserts that an input allocation, if present, reports a Z dimension of 0.
+    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
+    // The context is received const but stored non-const in the launch struct.
+    Context *mrsc = (Context *)rsc;
+    mtls->rsc = mrsc;
+    mtls->ain = ain;
+    mtls->aout = aout;
+    mtls->fep.usr = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 10;
+    mtls->mSliceNum = 0;
+    // Input side: LOD 0 base pointer, element size and stride (NULL/0 without ain).
+    mtls->fep.ptrIn = NULL;
+    mtls->fep.eStrideIn = 0;
+    if (ain) {
+        DrvAllocation *aindrv = (DrvAllocation *)ain->mHal.drv;
+        mtls->fep.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
+        mtls->fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls->fep.yStrideIn = aindrv->lod[0].stride;
+    }
+    // Output side: the same three values, taken from aout.
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        DrvAllocation *aoutdrv = (DrvAllocation *)aout->mHal.drv;
+        mtls->fep.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aoutdrv->lod[0].stride;
+    }
+}
+
+void rsdScriptLaunchThreads(const Context *rsc,
+                            Script *s,
+                            uint32_t slot,
+                            const Allocation * ain,
+                            Allocation * aout,
+                            const void * usr,
+                            uint32_t usrLen,
+                            const RsScriptCall *sc,
+                            MTLaunchStruct *mtls) {
+    // Runs mtls->kernel over the range prepared by rsdScriptInvokeForEachMtlsSetup.
+    // That helper zeroes *mtls and assigns mtls->rsc only after all of its early
+    // returns, so a NULL rsc means setup bailed out (no allocations / empty range);
+    // launching then would dereference NULL below, so do nothing instead.
+    if (mtls->rsc == NULL) {
+        return;
+    }
+
+    Script * oldTLS = setTLS(s);
+    Context *mrsc = (Context *)rsc;
+    RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
+    if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
+        dc->mInForEach = true;
+        if (mtls->fep.dimY > 1) {
+            mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4);
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+            rsdLaunchThreads(mrsc, wc_xy, mtls);
+        } else {
+            mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4);
+            if(mtls->mSliceSize < 1) {
+                mtls->mSliceSize = 1;
+            }
+            rsdLaunchThreads(mrsc, wc_x, mtls);
+        }
+        dc->mInForEach = false;
+    } else {
+        // Serial path: iterate array/z/y here; each kernel call covers xStart..xEnd of one y.
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
+                                      mtls->fep.dimY * p.z + p.y;
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset);
+                    p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset);
+                    fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
+                }
+            }
+        }
+    }
+
+    setTLS(oldTLS);
+}
+
 void rsdScriptInvokeForEach(const Context *rsc,
                             Script *s,
                             uint32_t slot,
@@ -277,15 +375,13 @@
     RsdHal * dc = (RsdHal *)rsc->mHal.drv;
 
     MTLaunchStruct mtls;
-    memset(&mtls, 0, sizeof(mtls));
-
-    //ALOGE("for each script %p  in %p   out %p", s, ain, aout);
+    rsdScriptInvokeForEachMtlsSetup(rsc, ain, aout, usr, usrLen, sc, &mtls);
+    mtls.script = s;
 
     DrvScript *drv = (DrvScript *)s->mHal.drv;
-
     if (drv->mIntrinsicID) {
         mtls.kernel = (void (*)())drv->mIntrinsicFuncs.root;
-        usr = drv->mIntrinsicData;
+        mtls.fep.usr = drv->mIntrinsicData;
     } else {
         rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
         mtls.kernel = reinterpret_cast<ForEachFunc_t>(
@@ -294,122 +390,8 @@
         mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
     }
 
-    if (ain) {
-        mtls.fep.dimX = ain->getType()->getDimX();
-        mtls.fep.dimY = ain->getType()->getDimY();
-        mtls.fep.dimZ = ain->getType()->getDimZ();
-        //mtls.dimArray = ain->getType()->getDimArray();
-    } else if (aout) {
-        mtls.fep.dimX = aout->getType()->getDimX();
-        mtls.fep.dimY = aout->getType()->getDimY();
-        mtls.fep.dimZ = aout->getType()->getDimZ();
-        //mtls.dimArray = aout->getType()->getDimArray();
-    } else {
-        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
-        return;
-    }
 
-    if (!sc || (sc->xEnd == 0)) {
-        mtls.xEnd = mtls.fep.dimX;
-    } else {
-        rsAssert(sc->xStart < mtls.fep.dimX);
-        rsAssert(sc->xEnd <= mtls.fep.dimX);
-        rsAssert(sc->xStart < sc->xEnd);
-        mtls.xStart = rsMin(mtls.fep.dimX, sc->xStart);
-        mtls.xEnd = rsMin(mtls.fep.dimX, sc->xEnd);
-        if (mtls.xStart >= mtls.xEnd) return;
-    }
-
-    if (!sc || (sc->yEnd == 0)) {
-        mtls.yEnd = mtls.fep.dimY;
-    } else {
-        rsAssert(sc->yStart < mtls.fep.dimY);
-        rsAssert(sc->yEnd <= mtls.fep.dimY);
-        rsAssert(sc->yStart < sc->yEnd);
-        mtls.yStart = rsMin(mtls.fep.dimY, sc->yStart);
-        mtls.yEnd = rsMin(mtls.fep.dimY, sc->yEnd);
-        if (mtls.yStart >= mtls.yEnd) return;
-    }
-
-    mtls.xEnd = rsMax((uint32_t)1, mtls.xEnd);
-    mtls.yEnd = rsMax((uint32_t)1, mtls.yEnd);
-    mtls.zEnd = rsMax((uint32_t)1, mtls.zEnd);
-    mtls.arrayEnd = rsMax((uint32_t)1, mtls.arrayEnd);
-
-    rsAssert(!ain || (ain->getType()->getDimZ() == 0));
-
-    Context *mrsc = (Context *)rsc;
-    Script * oldTLS = setTLS(s);
-
-    mtls.rsc = mrsc;
-    mtls.ain = ain;
-    mtls.aout = aout;
-    mtls.script = s;
-    mtls.fep.usr = usr;
-    mtls.fep.usrLen = usrLen;
-    mtls.mSliceSize = 10;
-    mtls.mSliceNum = 0;
-
-    mtls.fep.ptrIn = NULL;
-    mtls.fep.eStrideIn = 0;
-    if (ain) {
-        DrvAllocation *aindrv = (DrvAllocation *)ain->mHal.drv;
-        mtls.fep.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
-        mtls.fep.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls.fep.yStrideIn = aindrv->lod[0].stride;
-    }
-
-    mtls.fep.ptrOut = NULL;
-    mtls.fep.eStrideOut = 0;
-    if (aout) {
-        DrvAllocation *aoutdrv = (DrvAllocation *)aout->mHal.drv;
-        mtls.fep.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
-        mtls.fep.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls.fep.yStrideOut = aoutdrv->lod[0].stride;
-    }
-
-
-    if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
-        dc->mInForEach = true;
-        if (mtls.fep.dimY > 1) {
-            mtls.mSliceSize = mtls.fep.dimY / (dc->mWorkers.mCount * 4);
-            if(mtls.mSliceSize < 1) {
-                mtls.mSliceSize = 1;
-            }
-
-            rsdLaunchThreads(mrsc, wc_xy, &mtls);
-        } else {
-            mtls.mSliceSize = mtls.fep.dimX / (dc->mWorkers.mCount * 4);
-            if(mtls.mSliceSize < 1) {
-                mtls.mSliceSize = 1;
-            }
-
-            rsdLaunchThreads(mrsc, wc_x, &mtls);
-        }
-        dc->mInForEach = false;
-
-        //ALOGE("launch 1");
-    } else {
-        RsForEachStubParamStruct p;
-        memcpy(&p, &mtls.fep, sizeof(p));
-        uint32_t sig = mtls.sig;
-
-        //ALOGE("launch 3");
-        outer_foreach_t fn = (outer_foreach_t) mtls.kernel;
-        for (p.ar[0] = mtls.arrayStart; p.ar[0] < mtls.arrayEnd; p.ar[0]++) {
-            for (p.z = mtls.zStart; p.z < mtls.zEnd; p.z++) {
-                for (p.y = mtls.yStart; p.y < mtls.yEnd; p.y++) {
-                    uint32_t offset = mtls.fep.dimY * mtls.fep.dimZ * p.ar[0] +
-                                      mtls.fep.dimY * p.z + p.y;
-                    p.out = mtls.fep.ptrOut + (mtls.fep.yStrideOut * offset);
-                    p.in = mtls.fep.ptrIn + (mtls.fep.yStrideIn * offset);
-                    fn(&p, mtls.xStart, mtls.xEnd, mtls.fep.eStrideIn, mtls.fep.eStrideOut);
-                }
-            }
-        }
-    }
-
-    setTLS(oldTLS);
+    rsdScriptLaunchThreads(rsc, s, slot, ain, aout, usr, usrLen, sc, &mtls);
 }
 
 
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index 114e6cf..a99a062 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -20,6 +20,11 @@
 #include <rs_hal.h>
 #include <rsRuntime.h>
 
+namespace bcc {
+    class BCCContext;
+    class RSCompilerDriver;
+    class RSExecutable;
+}
 
 bool rsdScriptInit(const android::renderscript::Context *, android::renderscript::ScriptC *,
                    char const *resName, char const *cacheDir,
@@ -86,4 +91,94 @@
                         const android::renderscript::Script *script,
                         const void *);
 
+
+typedef void (*outer_foreach_t)(
+    const android::renderscript::RsForEachStubParamStruct *,
+    uint32_t x1, uint32_t x2,
+    uint32_t instep, uint32_t outstep);
+
+typedef struct RsdIntriniscFuncs_rec {
+
+    void (*bind)(const android::renderscript::Context *dc,
+                 const android::renderscript::Script *script,
+                 void * intrinsicData,
+                 uint32_t slot, android::renderscript::Allocation *data);
+    void (*setVar)(const android::renderscript::Context *dc,
+                   const android::renderscript::Script *script,
+                   void * intrinsicData,
+                   uint32_t slot, void *data, size_t dataLength);
+    void (*root)(const android::renderscript::RsForEachStubParamStruct *,
+                 uint32_t x1, uint32_t x2, uint32_t instep, uint32_t outstep);
+
+    void (*destroy)(const android::renderscript::Context *dc,
+                    const android::renderscript::Script *script,
+                    void * intrinsicData);
+} RsdIntriniscFuncs_t;
+
+struct DrvScript {
+    RsScriptIntrinsicID mIntrinsicID;
+    int (*mRoot)();
+    int (*mRootExpand)();
+    void (*mInit)();
+    void (*mFreeChildren)();
+
+    bcc::BCCContext *mCompilerContext;
+    bcc::RSCompilerDriver *mCompilerDriver;
+    bcc::RSExecutable *mExecutable;
+
+    android::renderscript::Allocation **mBoundAllocs;
+    RsdIntriniscFuncs_t mIntrinsicFuncs;
+    void * mIntrinsicData;
+};
+
+typedef struct {
+    android::renderscript::RsForEachStubParamStruct fep;
+    uint32_t cpuIdx;
+
+} MTThreadStuct;
+
+typedef struct {
+    android::renderscript::RsForEachStubParamStruct fep;
+
+    android::renderscript::Context *rsc;
+    android::renderscript::Script *script;
+    ForEachFunc_t kernel;
+    uint32_t sig;
+    const android::renderscript::Allocation * ain;
+    android::renderscript::Allocation * aout;
+
+    uint32_t mSliceSize;
+    volatile int mSliceNum;
+
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+} MTLaunchStruct;
+
+void rsdScriptLaunchThreads(const android::renderscript::Context *rsc,
+                            android::renderscript::Script *s,
+                            uint32_t slot,
+                            const android::renderscript::Allocation * ain,
+                            android::renderscript::Allocation * aout,
+                            const void * usr,
+                            uint32_t usrLen,
+                            const RsScriptCall *sc,
+                            MTLaunchStruct *mtls);
+
+void rsdScriptInvokeForEachMtlsSetup(const android::renderscript::Context *rsc,
+                                     const android::renderscript::Allocation * ain,
+                                     android::renderscript::Allocation * aout,
+                                     const void * usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall *sc,
+                                     MTLaunchStruct *mtls);
+
+
+
+
 #endif
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index b2e2b08..cdfc600 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -25,6 +25,7 @@
 #include "rsdProgramFragment.h"
 #include "rsdMesh.h"
 #include "rsdSampler.h"
+#include "rsdScriptGroup.h"
 #include "rsdFrameBuffer.h"
 
 #include <malloc.h>
@@ -142,6 +143,15 @@
         rsdFrameBufferDestroy
     },
 
+    {
+        rsdScriptGroupInit,
+        rsdScriptGroupSetInput,
+        rsdScriptGroupSetOutput,
+        rsdScriptGroupExecute,
+        rsdScriptGroupDestroy
+    }
+
+
 };
 
 pthread_key_t rsdgThreadTLSKey = 0;
diff --git a/driver/rsdIntrinsics.h b/driver/rsdIntrinsics.h
index 4a1a4a2..a494d76 100644
--- a/driver/rsdIntrinsics.h
+++ b/driver/rsdIntrinsics.h
@@ -18,24 +18,7 @@
 #define RSD_INTRINSICS_H
 
 #include <rs_hal.h>
-
-typedef struct RsdIntriniscFuncs_rec {
-
-    void (*bind)(const android::renderscript::Context *dc,
-                 const android::renderscript::Script *script,
-                 void * intrinsicData,
-                 uint32_t slot, android::renderscript::Allocation *data);
-    void (*setVar)(const android::renderscript::Context *dc,
-                   const android::renderscript::Script *script,
-                   void * intrinsicData,
-                   uint32_t slot, void *data, size_t dataLength);
-    void (*root)(const android::renderscript::RsForEachStubParamStruct *,
-                 uint32_t x1, uint32_t x2, uint32_t instep, uint32_t outstep);
-
-    void (*destroy)(const android::renderscript::Context *dc,
-                    const android::renderscript::Script *script,
-                    void * intrinsicData);
-} RsdIntriniscFuncs_t;
+#include "rsdBcc.h"
 
 void * rsdIntrinsic_Init(const android::renderscript::Context *dc,
                          android::renderscript::Script *script,
diff --git a/driver/rsdScriptGroup.cpp b/driver/rsdScriptGroup.cpp
new file mode 100644
index 0000000..0ef41a4
--- /dev/null
+++ b/driver/rsdScriptGroup.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsdCore.h"
+
+#include <bcc/BCCContext.h>
+#include <bcc/Renderscript/RSCompilerDriver.h>
+#include <bcc/Renderscript/RSExecutable.h>
+#include <bcc/Renderscript/RSInfo.h>
+
+#include "rsScript.h"
+#include "rsScriptGroup.h"
+#include "rsdScriptGroup.h"
+#include "rsdBcc.h"
+
+using namespace android;
+using namespace android::renderscript;
+
+
+bool rsdScriptGroupInit(const android::renderscript::Context *rsc,
+                        const android::renderscript::ScriptGroup *sg) {
+    return true;
+}
+
+void rsdScriptGroupSetInput(const android::renderscript::Context *rsc,
+                            const android::renderscript::ScriptGroup *sg,
+                            const android::renderscript::ScriptKernelID *kid,
+                            android::renderscript::Allocation *) {
+}
+
+void rsdScriptGroupSetOutput(const android::renderscript::Context *rsc,
+                             const android::renderscript::ScriptGroup *sg,
+                             const android::renderscript::ScriptKernelID *kid,
+                             android::renderscript::Allocation *) {
+}
+
+void rsdScriptGroupExecute(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg) {
+    // Executes every kernel of the group once, in the order the nodes are stored.
+    //
+    // Pass 1 records, per kernel, the allocations it runs with: a link allocation
+    // from the node, replaced by a group-level input/output bound to that kernel.
+    // Pass 2 sets up and launches each kernel with the recorded allocations.
+
+    Vector<Allocation *> ins;
+    Vector<Allocation *> outs;
+    Vector<const ScriptKernelID *> kernels;
+
+    for (size_t ct=0; ct < sg->mNodes.size(); ct++) {
+        ScriptGroup::Node *n = sg->mNodes[ct];
+
+        for (size_t ct2=0; ct2 < n->mKernels.size(); ct2++) {
+            const ScriptKernelID *k = n->mKernels[ct2];
+            Allocation *ain = NULL;
+            Allocation *aout = NULL;
+
+            // Input: a link whose destination is this kernel, then any group-level input.
+            for (size_t ct3=0; ct3 < n->mInputs.size(); ct3++) {
+                if (n->mInputs[ct3]->mDstKernel.get() == k) {
+                    ain = n->mInputs[ct3]->mAlloc.get();
+                }
+            }
+            for (size_t ct3=0; ct3 < sg->mInputs.size(); ct3++) {
+                if (sg->mInputs[ct3]->mKernel == k) {
+                    ain = sg->mInputs[ct3]->mAlloc.get();
+                }
+            }
+
+            // Output: a link whose source is this kernel, then any group-level output.
+            for (size_t ct3=0; ct3 < n->mOutputs.size(); ct3++) {
+                if (n->mOutputs[ct3]->mSource.get() == k) {
+                    aout = n->mOutputs[ct3]->mAlloc.get();
+                }
+            }
+            for (size_t ct3=0; ct3 < sg->mOutputs.size(); ct3++) {
+                if (sg->mOutputs[ct3]->mKernel == k) {
+                    aout = sg->mOutputs[ct3]->mAlloc.get();
+                }
+            }
+
+            ins.add(ain);
+            outs.add(aout);
+            kernels.add(k);
+        }
+    }
+
+    // mtls is reused: the setup call clears it completely at the start of every iteration.
+    MTLaunchStruct mtls;
+    for (size_t ct=0; ct < ins.size(); ct++) {
+        Script *s = kernels[ct]->mScript;
+        DrvScript *drv = (DrvScript *)s->mHal.drv;
+        uint32_t slot = kernels[ct]->mSlot;
+
+        rsdScriptInvokeForEachMtlsSetup(rsc, ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+        if (!ins[ct] && !outs[ct]) {
+            // Setup has already reported the error and left mtls zeroed (rsc == NULL);
+            // launching would dereference that NULL context, so stop here.
+            return;
+        }
+        mtls.script = s;
+
+        if (drv->mIntrinsicID) {
+            mtls.kernel = (void (*)())drv->mIntrinsicFuncs.root;
+            mtls.fep.usr = drv->mIntrinsicData;
+        } else {
+            // Same bounds check rsdScriptInvokeForEach performs before indexing by slot.
+            rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
+            mtls.kernel = reinterpret_cast<ForEachFunc_t>(
+                              drv->mExecutable->getExportForeachFuncAddrs()[slot]);
+            rsAssert(mtls.kernel != NULL);
+            mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
+        }
+
+        rsdScriptLaunchThreads(rsc, s, slot, ins[ct], outs[ct], NULL, 0, NULL, &mtls);
+    }
+}
+
+void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg) {
+}
+
+
diff --git a/driver/rsdScriptGroup.h b/driver/rsdScriptGroup.h
new file mode 100644
index 0000000..a817aef
--- /dev/null
+++ b/driver/rsdScriptGroup.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_SCRIPT_GROUP_H
+#define RSD_SCRIPT_GROUP_H
+
+#include <rs_hal.h>
+
+bool rsdScriptGroupInit(const android::renderscript::Context *rsc,
+                        const android::renderscript::ScriptGroup *sg);
+void rsdScriptGroupSetInput(const android::renderscript::Context *rsc,
+                            const android::renderscript::ScriptGroup *sg,
+                            const android::renderscript::ScriptKernelID *kid,
+                            android::renderscript::Allocation *);
+void rsdScriptGroupSetOutput(const android::renderscript::Context *rsc,
+                             const android::renderscript::ScriptGroup *sg,
+                             const android::renderscript::ScriptKernelID *kid,
+                             android::renderscript::Allocation *);
+void rsdScriptGroupExecute(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg);
+void rsdScriptGroupDestroy(const android::renderscript::Context *rsc,
+                           const android::renderscript::ScriptGroup *sg);
+
+
+#endif // RSD_SCRIPT_GROUP_H
diff --git a/rs.spec b/rs.spec
index 607f7dc..a9ec6fb 100644
--- a/rs.spec
+++ b/rs.spec
@@ -352,6 +352,49 @@
     ret RsScript
     }
 
+ScriptKernelIDCreate {
+    direct
+    param RsScript sid
+    param int slot
+    param int sig
+    ret RsScriptKernelID
+    }
+
+ScriptFieldIDCreate {
+    direct
+    param RsScript sid
+    param int slot
+    ret RsScriptFieldID
+    }
+
+ScriptGroupCreate {
+    direct
+    param RsScriptKernelID * kernels
+    param RsScriptKernelID * src
+    param RsScriptKernelID * dstK
+    param RsScriptFieldID * dstF
+    param const RsType * type
+    ret RsScriptGroup
+}
+
+ScriptGroupSetOutput {
+    param RsScriptGroup group
+    param RsScriptKernelID kernel
+    param RsAllocation alloc
+}
+
+ScriptGroupSetInput {
+    param RsScriptGroup group
+    param RsScriptKernelID kernel
+    param RsAllocation alloc
+}
+
+ScriptGroupExecute {
+    param RsScriptGroup group
+}
+
+
+
 ProgramStoreCreate {
     direct
     param bool colorMaskR
diff --git a/rsContext.h b/rsContext.h
index b071dc0..28ac52c 100644
--- a/rsContext.h
+++ b/rsContext.h
@@ -22,6 +22,7 @@
 
 #include "rsThreadIO.h"
 #include "rsScriptC.h"
+#include "rsScriptGroup.h"
 #include "rsSampler.h"
 #include "rsFont.h"
 #include "rsPath.h"
diff --git a/rsDefines.h b/rsDefines.h
index 6d0c90b..ae96784 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -40,6 +40,10 @@
 typedef void * RsFont;
 typedef void * RsSampler;
 typedef void * RsScript;
+typedef void * RsScriptKernelID;
+typedef void * RsScriptFieldID;
+typedef void * RsScriptMethodID;
+typedef void * RsScriptGroup;
 typedef void * RsMesh;
 typedef void * RsPath;
 typedef void * RsType;
@@ -323,7 +327,11 @@
     RS_A3D_CLASS_ID_ANIMATION,
     RS_A3D_CLASS_ID_ADAPTER_1D,
     RS_A3D_CLASS_ID_ADAPTER_2D,
-    RS_A3D_CLASS_ID_SCRIPT_C
+    RS_A3D_CLASS_ID_SCRIPT_C,
+    RS_A3D_CLASS_ID_SCRIPT_KERNEL_ID,
+    RS_A3D_CLASS_ID_SCRIPT_FIELD_ID,
+    RS_A3D_CLASS_ID_SCRIPT_METHOD_ID,
+    RS_A3D_CLASS_ID_SCRIPT_GROUP
 };
 
 enum RsCullMode {
diff --git a/rsFileA3D.cpp b/rsFileA3D.cpp
index 07c413f..c79d008 100644
--- a/rsFileA3D.cpp
+++ b/rsFileA3D.cpp
@@ -279,6 +279,14 @@
             break;
         case RS_A3D_CLASS_ID_SCRIPT_C:
             break;
+        case RS_A3D_CLASS_ID_SCRIPT_KERNEL_ID:
+            break;
+        case RS_A3D_CLASS_ID_SCRIPT_FIELD_ID:
+            break;
+        case RS_A3D_CLASS_ID_SCRIPT_METHOD_ID:
+            break;
+        case RS_A3D_CLASS_ID_SCRIPT_GROUP:
+            break;
     }
     if (entry->mRsObj) {
         entry->mRsObj->incUserRef();
diff --git a/rsScript.cpp b/rsScript.cpp
index 25ee1a0..9bbc2ba 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -86,9 +86,56 @@
     return decSysRef();
 }
 
+ScriptKernelID::ScriptKernelID(Context *rsc, Script *s, int slot, int sig)
+        : ObjectBase(rsc) {
+    // Stores the script and slot; sig is read as a bitmask (bit 0: input, bit 1: output).
+    mScript = s;
+    mSlot = slot;
+    mHasKernelInput = (sig & 1) != 0;
+    mHasKernelOutput = (sig & 2) != 0;
+}
+
+ScriptKernelID::~ScriptKernelID() {
+
+}
+
+void ScriptKernelID::serialize(Context *rsc, OStream *stream) const {
+
+}
+
+RsA3DClassID ScriptKernelID::getClassId() const {
+    return RS_A3D_CLASS_ID_SCRIPT_KERNEL_ID;
+}
+
+ScriptFieldID::ScriptFieldID(Context *rsc, Script *s, int slot) : ObjectBase(rsc) {
+    mScript = s;
+    mSlot = slot;
+}
+
+ScriptFieldID::~ScriptFieldID() {
+
+}
+
+void ScriptFieldID::serialize(Context *rsc, OStream *stream) const {
+
+}
+
+RsA3DClassID ScriptFieldID::getClassId() const {
+    return RS_A3D_CLASS_ID_SCRIPT_FIELD_ID;
+}
+
+
 namespace android {
 namespace renderscript {
 
+RsScriptKernelID rsi_ScriptKernelIDCreate(Context *rsc, RsScript vs, int slot, int sig) {
+    return new ScriptKernelID(rsc, (Script *)vs, slot, sig);
+}
+
+RsScriptFieldID rsi_ScriptFieldIDCreate(Context *rsc, RsScript vs, int slot) {
+    return new ScriptFieldID(rsc, (Script *)vs, slot);
+}
+
 void rsi_ScriptBindAllocation(Context * rsc, RsScript vs, RsAllocation va, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     Allocation *a = static_cast<Allocation *>(va);
diff --git a/rsScript.h b/rsScript.h
index acd6afb..6339f49 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -29,8 +29,35 @@
 class ProgramRaster;
 class ProgramStore;
 
+class ScriptKernelID : public ObjectBase {
+public:
+    ScriptKernelID(Context *rsc, Script *s, int slot, int sig);
+    virtual ~ScriptKernelID();
+
+    virtual void serialize(Context *rsc, OStream *stream) const;
+    virtual RsA3DClassID getClassId() const;
+
+    Script *mScript;
+    int mSlot;
+    bool mHasKernelInput;
+    bool mHasKernelOutput;
+};
+
+class ScriptFieldID : public ObjectBase {
+public:
+    ScriptFieldID(Context *rsc, Script *s, int slot);
+    virtual ~ScriptFieldID();
+
+    virtual void serialize(Context *rsc, OStream *stream) const;
+    virtual RsA3DClassID getClassId() const;
+
+    Script *mScript;
+    int mSlot;
+};
+
 class Script : public ObjectBase {
 public:
+
     struct Hal {
         void * drv;
 
diff --git a/rsScriptGroup.cpp b/rsScriptGroup.cpp
new file mode 100644
index 0000000..9230485
--- /dev/null
+++ b/rsScriptGroup.cpp
@@ -0,0 +1,372 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsContext.h"
+#include <time.h>
+
+using namespace android;
+using namespace android::renderscript;
+
+ScriptGroup::ScriptGroup(Context *rsc)  // zero mHal and clear mInitialized so neither is indeterminate
+    : ObjectBase(rsc), mHal(), mInitialized(false) {}
+
+ScriptGroup::~ScriptGroup() {
+    if (mRSC->mHal.funcs.scriptgroup.destroy) {  // optional driver hook, run before the graph is freed
+        mRSC->mHal.funcs.scriptgroup.destroy(mRSC, this);
+    }
+    for (size_t ct=0; ct < mLinks.size(); ct++) delete mLinks[ct];      // allocated in create()
+    for (size_t ct=0; ct < mNodes.size(); ct++) delete mNodes[ct];      // allocated in calcOrder()
+    for (size_t ct=0; ct < mInputs.size(); ct++) delete mInputs[ct];    // allocated in calcOrder()
+    for (size_t ct=0; ct < mOutputs.size(); ct++) delete mOutputs[ct];  // allocated in calcOrder()
+}
+
+ScriptGroup::IO::IO(const ScriptKernelID *kid) : mKernel(kid) {
+    // mAlloc is left default-constructed; setInput()/setOutput() assign it.
+}
+
+ScriptGroup::Node::Node(Script *s)
+    : mSeen(false),   // visit marker used by calcOrderRecurse()
+      mOrder(0),      // depth; only ever raised by calcOrderRecurse()
+      mScript(s) {
+}
+
+ScriptGroup::Node * ScriptGroup::findNode(Script *s) const {
+    // Linear search: a node matches when any kernel it holds belongs to s.
+    for (size_t nodeIdx = 0; nodeIdx < mNodes.size(); nodeIdx++) {
+        Node *candidate = mNodes[nodeIdx];
+        for (size_t kIdx = 0; kIdx < candidate->mKernels.size(); kIdx++) {
+            if (candidate->mKernels[kIdx]->mScript == s) {
+                return candidate;
+            }
+        }
+    }
+    return NULL;  // no node holds a kernel of this script
+}
+
+// Depth-first walk from n: raises mOrder to the longest path depth; false on a cycle or unknown target.
+bool ScriptGroup::calcOrderRecurse(Node *n, int depth) {
+    n->mSeen = true;  // n is on the path currently being walked
+    if (n->mOrder < depth) {
+        n->mOrder = depth;
+    }
+    bool ret = true;
+    for (size_t ct=0; ct < n->mOutputs.size(); ct++) {
+        const Link *l = n->mOutputs[ct];
+        Script *dst = l->mDstField.get() ? l->mDstField->mScript : l->mDstKernel->mScript;
+        Node *nt = findNode(dst);
+        if (nt == NULL || nt->mSeen) {
+            // No node for the destination, or the link leads back onto the current path.
+            ret = false;
+            continue;
+        }
+        ret &= calcOrderRecurse(nt, n->mOrder + 1);
+    }
+    n->mSeen = false;  // leave the path so a node reachable by two routes is not taken for a cycle
+    return ret;
+}
+
+// Three-way comparison of two nodes by mOrder, used to sort mNodes by ascending depth.
+static int CompareNodeForSort(ScriptGroup::Node *const* lhs,
+                              ScriptGroup::Node *const* rhs) {
+    const int a = lhs[0]->mOrder;
+    const int b = rhs[0]->mOrder;
+    return (a > b) - (a < b);  // -1, 0 or 1; avoids the overflow risk of a - b
+}
+
+
+bool ScriptGroup::calcOrder() {
+    // Builds mNodes, mInputs and mOutputs from mKernels/mLinks, then sorts mNodes by depth.
+    for (size_t ct=0; ct < mKernels.size(); ct++) {
+        const ScriptKernelID *k = mKernels[ct].get();
+        // Make nodes: kernels that share a script are collected into one Node.
+        Node *n = findNode(k->mScript);
+        // findNode() returns NULL until some node holds a kernel of this script.
+        if (n == NULL) {
+            n = new Node(k->mScript);
+            mNodes.add(n);
+        }
+        n->mKernels.add(k);
+    }
+
+    // add links: attach every link to its source node and to its destination node.
+    // NOTE(review): the findNode() results in this loop are dereferenced unchecked.
+    for (size_t ct=0; ct < mLinks.size(); ct++) {
+        Link *l = mLinks[ct];
+        // Source side: the link is an output of the node holding mSource's script.
+        Node *n = findNode(l->mSource->mScript);
+        //ALOGE("link n %p", n);
+        n->mOutputs.add(l);
+
+        if (l->mDstKernel.get()) {
+            // Destination is a kernel when mDstKernel is set, otherwise a field.
+            n = findNode(l->mDstKernel->mScript);
+            //ALOGE("  n1 %p", n);
+            n->mInputs.add(l);
+        } else {
+            n = findNode(l->mDstField->mScript);
+            //ALOGE("  n2 %p", n);
+            n->mInputs.add(l);
+        }
+    }
+
+    // Order nodes: every node without incoming links is the root of a depth walk;
+    // mSeen is cleared on all nodes before each root is walked.
+    bool ret = true;
+    for (size_t ct=0; ct < mNodes.size(); ct++) {
+        Node *n = mNodes[ct];
+        if (n->mInputs.size() == 0) {
+            for (size_t ct2=0; ct2 < mNodes.size(); ct2++) {
+                mNodes[ct2]->mSeen = false;
+            }
+            ret &= calcOrderRecurse(n, 0);
+        }
+    }
+
+    for (size_t ct=0; ct < mKernels.size(); ct++) {
+        const ScriptKernelID *k = mKernels[ct].get();
+        const Node *n = findNode(k->mScript);
+        // A kernel side that no link is attached to becomes a group-level IO entry.
+        if (k->mHasKernelOutput) {
+            bool found = false;
+            for (size_t ct2=0; ct2 < n->mOutputs.size(); ct2++) {
+                if (n->mOutputs[ct2]->mSource.get() == k) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                // No link is sourced from k: record an output to be bound via setOutput().
+                mOutputs.add(new IO(k));
+            }
+        }
+
+        if (k->mHasKernelInput) {
+            bool found = false;
+            for (size_t ct2=0; ct2 < n->mInputs.size(); ct2++) {
+                if (n->mInputs[ct2]->mDstKernel.get() == k) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                // No link targets k: record an input to be bound via setInput().
+                mInputs.add(new IO(k));
+            }
+        }
+    }
+
+    // sort nodes by the depth assigned above (CompareNodeForSort compares mOrder)
+    mNodes.sort(&CompareNodeForSort);
+
+    return ret;
+}
+
+ScriptGroup * ScriptGroup::create(Context *rsc,
+                           ScriptKernelID ** kernels, size_t kernelsSize,
+                           ScriptKernelID ** src, size_t srcSize,
+                           ScriptKernelID ** dstK, size_t dstKSize,
+                           ScriptFieldID  ** dstF, size_t dstFSize,
+                           const Type ** type, size_t typeSize) {
+    // Builds a group from parallel arrays: link i runs from src[i] to dstK[i] or
+    // dstF[i] and carries data of type[i]. kernelsSize and typeSize are byte counts.
+    size_t kernelCount = kernelsSize / sizeof(ScriptKernelID *);
+    // The link count comes from typeSize alone; src, dstK and dstF are indexed up
+    // to that count without consulting srcSize, dstKSize or dstFSize.
+    size_t linkCount = typeSize / sizeof(Type *);
+
+    ScriptGroup *sg = new ScriptGroup(rsc);
+
+    sg->mKernels.reserve(kernelCount);
+    for (size_t ct=0; ct < kernelCount; ct++) {
+        sg->mKernels.add(kernels[ct]);
+    }
+
+    sg->mLinks.reserve(linkCount);
+    for (size_t ct=0; ct < linkCount; ct++) {
+        Link *l = new Link();
+        l->mType = type[ct];
+        l->mSource = src[ct];
+        l->mDstField = dstF[ct];
+        l->mDstKernel = dstK[ct];
+        sg->mLinks.add(l);
+    }
+
+    // Build the nodes and their execution order. The result is not checked here.
+    sg->calcOrder();
+
+    // allocate links: one intermediate buffer per source kernel; later links of
+    // the same node that share that source reuse the allocation.
+    for (size_t ct=0; ct < sg->mNodes.size(); ct++) {
+        const Node *n = sg->mNodes[ct];
+        for (size_t ct2=0; ct2 < n->mOutputs.size(); ct2++) {
+            Link *l = n->mOutputs[ct2];
+            if (l->mAlloc.get()) {
+                continue;
+            }
+            Allocation * alloc = Allocation::createAllocation(rsc,
+                    l->mType.get(), RS_ALLOCATION_USAGE_SCRIPT);
+            l->mAlloc = alloc;
+
+            for (size_t ct3=ct2+1; ct3 < n->mOutputs.size(); ct3++) {
+                if (n->mOutputs[ct3]->mSource.get() == l->mSource.get()) {
+                    n->mOutputs[ct3]->mAlloc = alloc;
+                }
+            }
+        }
+    }
+
+    if (rsc->mHal.funcs.scriptgroup.init) {
+        rsc->mHal.funcs.scriptgroup.init(rsc, sg);
+    }
+    // The pointer is handed back to the client as its handle, so take the user reference.
+    sg->incUserRef();
+    return sg;
+}
+
+void ScriptGroup::setInput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
+    for (size_t idx = 0; idx < mInputs.size(); idx++) {
+        IO *io = mInputs[idx];
+        if (io->mKernel != kid) continue;  // not the requested kernel
+        io->mAlloc = a;
+        if (rsc->mHal.funcs.scriptgroup.setInput) {
+            rsc->mHal.funcs.scriptgroup.setInput(rsc, this, kid, a);
+        }
+        return;  // only the first matching input is bound
+    }
+    // Reached only when kid is not one of the group's inputs.
+    rsAssert(!"ScriptGroup:setInput kid not found");
+}
+
+void ScriptGroup::setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a) {
+    for (size_t idx = 0; idx < mOutputs.size(); idx++) {
+        IO *io = mOutputs[idx];
+        if (io->mKernel != kid) continue;  // not the requested kernel
+        io->mAlloc = a;
+        if (rsc->mHal.funcs.scriptgroup.setOutput) {
+            rsc->mHal.funcs.scriptgroup.setOutput(rsc, this, kid, a);
+        }
+        return;  // only the first matching output is bound
+    }
+    // Reached only when kid is not one of the group's outputs.
+    rsAssert(!"ScriptGroup:setOutput kid not found");
+}
+
+void ScriptGroup::execute(Context *rsc) {
+    // When the driver supplies an execute hook it runs the whole group and the
+    // reference loop below is skipped.
+    if (rsc->mHal.funcs.scriptgroup.execute) {
+        rsc->mHal.funcs.scriptgroup.execute(rsc, this);
+        return;
+    }
+
+    // Reference path: visit nodes in mNodes order and launch every kernel of
+    // each node once through Script::runForEach().
+    for (size_t nodeIdx = 0; nodeIdx < mNodes.size(); nodeIdx++) {
+        Node *node = mNodes[nodeIdx];
+        for (size_t kIdx = 0; kIdx < node->mKernels.size(); kIdx++) {
+            const ScriptKernelID *kernel = node->mKernels[kIdx];
+            Allocation *src = NULL;
+            Allocation *dst = NULL;
+
+            // Input: a link feeding this kernel, overridden by a group-level input.
+            for (size_t i = 0; i < node->mInputs.size(); i++) {
+                Link *in = node->mInputs[i];
+                if (in->mDstKernel.get() == kernel) {
+                    src = in->mAlloc.get();
+                }
+            }
+            for (size_t i = 0; i < mInputs.size(); i++) {
+                if (mInputs[i]->mKernel == kernel) {
+                    src = mInputs[i]->mAlloc.get();
+                }
+            }
+
+            // Output: a link sourced from this kernel, overridden by a group-level output.
+            for (size_t i = 0; i < node->mOutputs.size(); i++) {
+                Link *out = node->mOutputs[i];
+                if (out->mSource.get() == kernel) {
+                    dst = out->mAlloc.get();
+                }
+            }
+            for (size_t i = 0; i < mOutputs.size(); i++) {
+                if (mOutputs[i]->mKernel == kernel) {
+                    dst = mOutputs[i]->mAlloc.get();
+                }
+            }
+
+            // Either allocation may still be NULL here; it is passed through as-is.
+            node->mScript->runForEach(rsc, kernel->mSlot, src, dst, NULL, 0);
+        }
+    }
+}
+
+void ScriptGroup::serialize(Context *rsc, OStream *stream) const {  // no-op: nothing is written
+}
+
+RsA3DClassID ScriptGroup::getClassId() const {
+    return RS_A3D_CLASS_ID_SCRIPT_GROUP;  // fixed class tag for script-group objects
+}
+
+ScriptGroup::Link::Link() {  // every member is left default-constructed; create() fills them in
+}
+
+ScriptGroup::Link::~Link() {  // no explicit cleanup; the members' own destructors run
+}
+
+namespace android {
+namespace renderscript {
+
+
+RsScriptGroup rsi_ScriptGroupCreate(Context *rsc,
+                           RsScriptKernelID * kernels, size_t kernelsSize,
+                           RsScriptKernelID * src, size_t srcSize,
+                           RsScriptKernelID * dstK, size_t dstKSize,
+                           RsScriptFieldID * dstF, size_t dstFSize,
+                           const RsType * type, size_t typeSize) {
+    // Thin entry point: the handle arrays are cast to object-pointer arrays and
+    // forwarded, together with every size argument, to ScriptGroup::create().
+    return ScriptGroup::create(rsc,
+                               (ScriptKernelID **) kernels, kernelsSize,
+                               (ScriptKernelID **) src, srcSize,
+                               (ScriptKernelID **) dstK, dstKSize,
+                               (ScriptFieldID  **) dstF, dstFSize,
+                               (const Type **) type, typeSize);
+}
+
+
+void rsi_ScriptGroupSetInput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
+        RsAllocation alloc) {
+    // Recover the typed objects from their handles, then delegate to the group.
+    static_cast<ScriptGroup *>(sg)->setInput(rsc, static_cast<ScriptKernelID *>(kid),
+                                             static_cast<Allocation *>(alloc));
+}
+
+void rsi_ScriptGroupSetOutput(Context *rsc, RsScriptGroup sg, RsScriptKernelID kid,
+        RsAllocation alloc) {
+    // Recover the typed objects from their handles, then delegate to the group.
+    static_cast<ScriptGroup *>(sg)->setOutput(rsc, static_cast<ScriptKernelID *>(kid),
+                                              static_cast<Allocation *>(alloc));
+}
+
+void rsi_ScriptGroupExecute(Context *rsc, RsScriptGroup sg) {
+    // The handle is the ScriptGroup pointer itself.
+    ScriptGroup *group = static_cast<ScriptGroup *>(sg);
+    group->execute(rsc);
+}
+
+}
+}
+
diff --git a/rsScriptGroup.h b/rsScriptGroup.h
new file mode 100644
index 0000000..d51b1db
--- /dev/null
+++ b/rsScriptGroup.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RS_SCRIPT_GROUP_H
+#define ANDROID_RS_SCRIPT_GROUP_H
+
+#include "rsAllocation.h"
+#include "rsScript.h"
+
+
+// ---------------------------------------------------------------------------
+namespace android {
+namespace renderscript {
+
+class ProgramVertex;
+class ProgramFragment;
+class ProgramRaster;
+class ProgramStore;
+
+class ScriptGroup : public ObjectBase {  // a set of script kernels plus the links between them
+public:
+    Vector<ObjectBaseRef<ScriptKernelID> > mKernels;  // every kernel passed to create()
+    // One edge of the graph: data produced by mSource goes to a kernel or to a script field.
+    class Link {
+    public:
+        ObjectBaseRef<const ScriptKernelID> mSource;
+        ObjectBaseRef<const ScriptKernelID> mDstKernel;  // destination when the target is a kernel
+        ObjectBaseRef<const ScriptFieldID> mDstField;    // destination when the target is a field
+        ObjectBaseRef<const Type> mType;                 // type used to create mAlloc
+        ObjectBaseRef<Allocation> mAlloc;                // intermediate buffer, assigned in create()
+        Link();
+        ~Link();
+    };
+    // All kernels of one script, with the links entering and leaving them.
+    class Node {
+    public:
+        Node(Script *);
+
+        Vector<const ScriptKernelID *> mKernels;
+        Vector<Link *> mOutputs;  // links whose source kernel belongs to this node
+        Vector<Link *> mInputs;   // links whose destination belongs to this node
+        bool mSeen;               // visit marker used by calcOrderRecurse()
+        int mOrder;               // depth set by calcOrderRecurse(); mNodes is sorted by it
+        Script *mScript;
+    };
+    // A kernel input or output that no link satisfies; bound via setInput()/setOutput().
+    class IO {
+    public:
+        IO(const ScriptKernelID *);
+
+        const ScriptKernelID *mKernel;
+        ObjectBaseRef<Allocation> mAlloc;
+    };
+
+    Vector<Link *> mLinks;    // created in create(), deleted in ~ScriptGroup()
+    Vector<Node *> mNodes;    // built and sorted by calcOrder()
+    Vector<IO *> mInputs;     // built by calcOrder()
+    Vector<IO *> mOutputs;    // built by calcOrder()
+
+    struct Hal {
+        void * drv;  // opaque pointer, presumably for driver-private state -- TODO confirm
+
+        struct DriverInfo {
+        };
+        DriverInfo info;
+    };
+    Hal mHal;
+    // Builds a group from parallel kernel/link arrays; kernelsSize and typeSize are byte counts.
+    static ScriptGroup * create(Context *rsc,
+                           ScriptKernelID ** kernels, size_t kernelsSize,
+                           ScriptKernelID ** src, size_t srcSize,
+                           ScriptKernelID ** dstK, size_t dstKSize,
+                           ScriptFieldID ** dstF, size_t dstFSize,
+                           const Type ** type, size_t typeSize);
+
+    virtual void serialize(Context *rsc, OStream *stream) const;  // currently a no-op
+    virtual RsA3DClassID getClassId() const;  // RS_A3D_CLASS_ID_SCRIPT_GROUP
+
+    void execute(Context *rsc);  // driver execute hook if set, else runs the nodes in mNodes order
+    void setInput(Context *rsc, ScriptKernelID *kid, Allocation *a);   // kid must be in mInputs
+    void setOutput(Context *rsc, ScriptKernelID *kid, Allocation *a);  // kid must be in mOutputs
+
+
+protected:
+    virtual ~ScriptGroup();
+    bool mInitialized;  // NOTE(review): no reader of this flag is visible in rsScriptGroup.cpp
+
+
+private:
+    bool calcOrderRecurse(Node *n, int depth);  // depth-first depth assignment from one root
+    bool calcOrder();                           // builds nodes/IO lists and sorts mNodes
+    Node * findNode(Script *s) const;           // node holding a kernel of s, or NULL
+
+    ScriptGroup(Context *);  // private: instances come from create()
+};
+
+
+}
+}
+#endif
+
diff --git a/rs_hal.h b/rs_hal.h
index 51f6327..6f0e530 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -30,7 +30,11 @@
 class Type;
 class Allocation;
 class Script;
+class ScriptKernelID;
+class ScriptFieldID;
+class ScriptMethodID;
 class ScriptC;
+class ScriptGroup;
 class Path;
 class Program;
 class ProgramStore;
@@ -258,6 +262,16 @@
         void (*destroy)(const Context *rsc, const FBOCache *fb);
     } framebuffer;
 
+    struct {  // ScriptGroup driver hooks; ScriptGroup checks each pointer for NULL before calling it
+        bool (*init)(const Context *rsc, const ScriptGroup *sg);  // called at the end of ScriptGroup::create(); result unused there
+        void (*setInput)(const Context *rsc, const ScriptGroup *sg,
+                         const ScriptKernelID *kid, Allocation *);  // called after ScriptGroup::setInput() binds the allocation
+        void (*setOutput)(const Context *rsc, const ScriptGroup *sg,
+                          const ScriptKernelID *kid, Allocation *);  // called after ScriptGroup::setOutput() binds the allocation
+        void (*execute)(const Context *rsc, const ScriptGroup *sg);  // when set, used instead of ScriptGroup's own node loop
+        void (*destroy)(const Context *rsc, const ScriptGroup *sg);  // called first in ~ScriptGroup()
+    } scriptgroup;
+
 } RsdHalFunctions;