Intrinsics

Change-Id: I1ce02ecd853382a2c92823b021750b93f1786ccf
diff --git a/Android.mk b/Android.mk
index 08fa6dc..e2eff95 100644
--- a/Android.mk
+++ b/Android.mk
@@ -17,6 +17,7 @@
 	driver/rsdFrameBuffer.cpp \
 	driver/rsdFrameBufferObj.cpp \
 	driver/rsdGL.cpp \
+	driver/rsdIntrinsics.cpp \
 	driver/rsdMesh.cpp \
 	driver/rsdMeshObj.cpp \
 	driver/rsdPath.cpp \
@@ -134,6 +135,7 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
 	rsThreadIO.cpp \
@@ -225,6 +227,7 @@
 	rsScriptC.cpp \
 	rsScriptC_Lib.cpp \
 	rsScriptC_LibGL.cpp \
+	rsScriptIntrinsic.cpp \
 	rsSignal.cpp \
 	rsStream.cpp \
 	rsThreadIO.cpp \
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index a36f728..9225e06 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -18,6 +18,7 @@
 #include "rsdBcc.h"
 #include "rsdRuntime.h"
 #include "rsdAllocation.h"
+#include "rsdIntrinsics.h"
 
 #include <bcc/BCCContext.h>
 #include <bcc/Renderscript/RSCompilerDriver.h>
@@ -36,6 +37,7 @@
 using namespace android::renderscript;
 
 struct DrvScript {
+    RsScriptIntrisicID mIntrinsicID;
     int (*mRoot)();
     int (*mRootExpand)();
     void (*mInit)();
@@ -167,27 +169,42 @@
 
 }
 
+bool rsdInitIntrinsic(const Context *rsc, Script *s, RsScriptIntrisicID iid, Element *e) {
+    pthread_mutex_lock(&rsdgInitMutex);
+    // Driver-side setup for an intrinsic script: allocate zero-filled DrvScript state and record the id.
+    DrvScript *drv = (DrvScript *)calloc(1, sizeof(DrvScript));
+    if (drv == NULL) {
+        goto error;
+    }
+    s->mHal.drv = drv;
+    drv->mIntrinsicID = iid;
+    // A non-zero mIntrinsicID is what the forEach and setGlobalVar paths test before touching mExecutable.
+    s->mHal.info.exportedVariableCount = 1;
+    // rsc and e are not read by this function.
+
+
+
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return true;
+
+error:
+    pthread_mutex_unlock(&rsdgInitMutex);
+    return false;
+}
+
 typedef struct {
+    RsForEachStubParamStruct fep;
+
     Context *rsc;
     Script *script;
     ForEachFunc_t kernel;
     uint32_t sig;
     const Allocation * ain;
     Allocation * aout;
-    const void * usr;
-    size_t usrLen;
 
     uint32_t mSliceSize;
     volatile int mSliceNum;
 
-    const uint8_t *ptrIn;
-    uint32_t eStrideIn;
-    uint8_t *ptrOut;
-    uint32_t eStrideOut;
-
-    uint32_t yStrideIn;
-    uint32_t yStrideOut;
-
     uint32_t xStart;
     uint32_t xEnd;
     uint32_t yStart;
@@ -196,20 +213,13 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
-
-    uint32_t dimX;
-    uint32_t dimY;
-    uint32_t dimZ;
-    uint32_t dimArray;
 } MTLaunchStruct;
 typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
 static void wc_xy(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
-    memset(&p, 0, sizeof(p));
-    p.usr = mtls->usr;
-    p.usr_len = mtls->usrLen;
+    memcpy(&p, &mtls->fep, sizeof(p));
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -226,9 +236,9 @@
         //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
         for (p.y = yStart; p.y < yEnd; p.y++) {
-            p.out = mtls->ptrOut + (mtls->yStrideOut * p.y);
-            p.in = mtls->ptrIn + (mtls->yStrideIn * p.y);
-            fn(&p, mtls->xStart, mtls->xEnd, mtls->eStrideIn, mtls->eStrideOut);
+            p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y);
+            p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y);
+            fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
         }
     }
 }
@@ -236,9 +246,7 @@
 static void wc_x(void *usr, uint32_t idx) {
     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
     RsForEachStubParamStruct p;
-    memset(&p, 0, sizeof(p));
-    p.usr = mtls->usr;
-    p.usr_len = mtls->usrLen;
+    memcpy(&p, &mtls->fep, sizeof(p));
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
@@ -255,9 +263,9 @@
         //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
         //ALOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
 
-        p.out = mtls->ptrOut + (mtls->eStrideOut * xStart);
-        p.in = mtls->ptrIn + (mtls->eStrideIn * xStart);
-        fn(&p, xStart, xEnd, mtls->eStrideIn, mtls->eStrideOut);
+        p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
+        p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
+        fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
     }
 }
 
@@ -275,24 +283,29 @@
     MTLaunchStruct mtls;
     memset(&mtls, 0, sizeof(mtls));
 
-    //ALOGE("for each script %p  in %p   out %p", s, ain, aout);
+    ALOGE("for each script %p  in %p   out %p", s, ain, aout);
 
     DrvScript *drv = (DrvScript *)s->mHal.drv;
-    rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
-    mtls.kernel = reinterpret_cast<ForEachFunc_t>(
-                      drv->mExecutable->getExportForeachFuncAddrs()[slot]);
-    rsAssert(mtls.kernel != NULL);
-    mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
+
+    if (drv->mIntrinsicID) {
+        mtls.kernel = (void (*)())&rsdIntrinsic_Convolve3x3_uchar4;
+    } else {
+        rsAssert(slot < drv->mExecutable->getExportForeachFuncAddrs().size());
+        mtls.kernel = reinterpret_cast<ForEachFunc_t>(
+                          drv->mExecutable->getExportForeachFuncAddrs()[slot]);
+        rsAssert(mtls.kernel != NULL);
+        mtls.sig = drv->mExecutable->getInfo().getExportForeachFuncs()[slot].second;
+    }
 
     if (ain) {
-        mtls.dimX = ain->getType()->getDimX();
-        mtls.dimY = ain->getType()->getDimY();
-        mtls.dimZ = ain->getType()->getDimZ();
+        mtls.fep.dimX = ain->getType()->getDimX();
+        mtls.fep.dimY = ain->getType()->getDimY();
+        mtls.fep.dimZ = ain->getType()->getDimZ();
         //mtls.dimArray = ain->getType()->getDimArray();
     } else if (aout) {
-        mtls.dimX = aout->getType()->getDimX();
-        mtls.dimY = aout->getType()->getDimY();
-        mtls.dimZ = aout->getType()->getDimZ();
+        mtls.fep.dimX = aout->getType()->getDimX();
+        mtls.fep.dimY = aout->getType()->getDimY();
+        mtls.fep.dimZ = aout->getType()->getDimZ();
         //mtls.dimArray = aout->getType()->getDimArray();
     } else {
         rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
@@ -300,24 +313,24 @@
     }
 
     if (!sc || (sc->xEnd == 0)) {
-        mtls.xEnd = mtls.dimX;
+        mtls.xEnd = mtls.fep.dimX;
     } else {
-        rsAssert(sc->xStart < mtls.dimX);
-        rsAssert(sc->xEnd <= mtls.dimX);
+        rsAssert(sc->xStart < mtls.fep.dimX);
+        rsAssert(sc->xEnd <= mtls.fep.dimX);
         rsAssert(sc->xStart < sc->xEnd);
-        mtls.xStart = rsMin(mtls.dimX, sc->xStart);
-        mtls.xEnd = rsMin(mtls.dimX, sc->xEnd);
+        mtls.xStart = rsMin(mtls.fep.dimX, sc->xStart);
+        mtls.xEnd = rsMin(mtls.fep.dimX, sc->xEnd);
         if (mtls.xStart >= mtls.xEnd) return;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        mtls.yEnd = mtls.dimY;
+        mtls.yEnd = mtls.fep.dimY;
     } else {
-        rsAssert(sc->yStart < mtls.dimY);
-        rsAssert(sc->yEnd <= mtls.dimY);
+        rsAssert(sc->yStart < mtls.fep.dimY);
+        rsAssert(sc->yEnd <= mtls.fep.dimY);
         rsAssert(sc->yStart < sc->yEnd);
-        mtls.yStart = rsMin(mtls.dimY, sc->yStart);
-        mtls.yEnd = rsMin(mtls.dimY, sc->yEnd);
+        mtls.yStart = rsMin(mtls.fep.dimY, sc->yStart);
+        mtls.yEnd = rsMin(mtls.fep.dimY, sc->yEnd);
         if (mtls.yStart >= mtls.yEnd) return;
     }
 
@@ -335,40 +348,41 @@
     mtls.ain = ain;
     mtls.aout = aout;
     mtls.script = s;
-    mtls.usr = usr;
-    mtls.usrLen = usrLen;
+    mtls.fep.usr = usr;
+    mtls.fep.usrLen = usrLen;
     mtls.mSliceSize = 10;
     mtls.mSliceNum = 0;
 
-    mtls.ptrIn = NULL;
-    mtls.eStrideIn = 0;
+    mtls.fep.ptrIn = NULL;
+    mtls.fep.eStrideIn = 0;
     if (ain) {
         DrvAllocation *aindrv = (DrvAllocation *)ain->mHal.drv;
-        mtls.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
-        mtls.eStrideIn = ain->getType()->getElementSizeBytes();
-        mtls.yStrideIn = aindrv->lod[0].stride;
+        mtls.fep.ptrIn = (const uint8_t *)aindrv->lod[0].mallocPtr;
+        mtls.fep.eStrideIn = ain->getType()->getElementSizeBytes();
+        mtls.fep.yStrideIn = aindrv->lod[0].stride;
     }
 
-    mtls.ptrOut = NULL;
-    mtls.eStrideOut = 0;
+    mtls.fep.ptrOut = NULL;
+    mtls.fep.eStrideOut = 0;
     if (aout) {
         DrvAllocation *aoutdrv = (DrvAllocation *)aout->mHal.drv;
-        mtls.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
-        mtls.eStrideOut = aout->getType()->getElementSizeBytes();
-        mtls.yStrideOut = aoutdrv->lod[0].stride;
+        mtls.fep.ptrOut = (uint8_t *)aoutdrv->lod[0].mallocPtr;
+        mtls.fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls.fep.yStrideOut = aoutdrv->lod[0].stride;
     }
 
+
     if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) {
         dc->mInForEach = true;
-        if (mtls.dimY > 1) {
-            mtls.mSliceSize = mtls.dimY / (dc->mWorkers.mCount * 4);
+        if (mtls.fep.dimY > 1) {
+            mtls.mSliceSize = mtls.fep.dimY / (dc->mWorkers.mCount * 4);
             if(mtls.mSliceSize < 1) {
                 mtls.mSliceSize = 1;
             }
 
             rsdLaunchThreads(mrsc, wc_xy, &mtls);
         } else {
-            mtls.mSliceSize = mtls.dimX / (dc->mWorkers.mCount * 4);
+            mtls.mSliceSize = mtls.fep.dimX / (dc->mWorkers.mCount * 4);
             if(mtls.mSliceSize < 1) {
                 mtls.mSliceSize = 1;
             }
@@ -380,9 +394,7 @@
         //ALOGE("launch 1");
     } else {
         RsForEachStubParamStruct p;
-        memset(&p, 0, sizeof(p));
-        p.usr = mtls.usr;
-        p.usr_len = mtls.usrLen;
+        memcpy(&p, &mtls.fep, sizeof(p));
         uint32_t sig = mtls.sig;
 
         //ALOGE("launch 3");
@@ -390,13 +402,11 @@
         for (p.ar[0] = mtls.arrayStart; p.ar[0] < mtls.arrayEnd; p.ar[0]++) {
             for (p.z = mtls.zStart; p.z < mtls.zEnd; p.z++) {
                 for (p.y = mtls.yStart; p.y < mtls.yEnd; p.y++) {
-                    uint32_t offset = mtls.dimX * mtls.dimY * mtls.dimZ * p.ar[0] +
-                                      mtls.dimX * mtls.dimY * p.z +
-                                      mtls.dimX * p.y;
-                    p.out = mtls.ptrOut + (mtls.eStrideOut * offset);
-                    p.in = mtls.ptrIn + (mtls.eStrideIn * offset);
-                    fn(&p, mtls.xStart, mtls.xEnd, mtls.eStrideIn,
-                       mtls.eStrideOut);
+                    uint32_t offset = mtls.fep.dimY * mtls.fep.dimZ * p.ar[0] +
+                                      mtls.fep.dimY * p.z + p.y;
+                    p.out = mtls.fep.ptrOut + (mtls.fep.yStrideOut * offset);
+                    p.in = mtls.fep.ptrIn + (mtls.fep.yStrideIn * offset);
+                    fn(&p, mtls.xStart, mtls.xEnd, mtls.fep.eStrideIn, mtls.fep.eStrideOut);
                 }
             }
         }
@@ -451,6 +461,11 @@
     //rsAssert(!script->mFieldIsObject[slot]);
     //ALOGE("setGlobalVar %p %p %i %p %i", dc, script, slot, data, dataLength);
 
+    if (drv->mIntrinsicID) {
+        rsdIntrinsic_Convolve3x3_SetVar(dc, script, slot, data, dataLength);
+        return;
+    }
+
     int32_t *destPtr = reinterpret_cast<int32_t *>(
                           drv->mExecutable->getExportVarAddrs()[slot]);
     if (!destPtr) {
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index ead6701..8b91e93 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -24,6 +24,11 @@
 bool rsdScriptInit(const android::renderscript::Context *, android::renderscript::ScriptC *,
                    char const *resName, char const *cacheDir,
                    uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags);
+bool rsdInitIntrinsic(const android::renderscript::Context *rsc,  // returns false if driver state cannot be allocated
+                      android::renderscript::Script *s,           // script whose mHal.drv is set
+                      RsScriptIntrisicID iid,                     // intrinsic id recorded in the driver state
+                      android::renderscript::Element *e);         // not read by rsdInitIntrinsic in this change
+
 void rsdScriptInvokeFunction(const android::renderscript::Context *dc,
                              android::renderscript::Script *script,
                              uint32_t slot,
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index 33ab49a..d580a3d 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -54,6 +54,7 @@
     SetPriority,
     {
         rsdScriptInit,
+        rsdInitIntrinsic,
         rsdScriptInvokeFunction,
         rsdScriptInvokeRoot,
         rsdScriptInvokeForEach,
diff --git a/driver/rsdIntrinsics.cpp b/driver/rsdIntrinsics.cpp
new file mode 100644
index 0000000..b235669
--- /dev/null
+++ b/driver/rsdIntrinsics.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rsdCore.h"
+#include "rsdIntrinsics.h"
+
+// Scalar shorthands and 2/3/4-lane vector aliases declared with the ext_vector_type attribute.
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+
+
+
+using namespace android;
+using namespace android::renderscript;
+
+/*
+Layout of RsForEachStubParamStruct (see rs_hal.h), kept here for reference:
+typedef struct {
+    const void *in;
+    void *out;
+    const void *usr;
+    size_t usrLen;
+    uint32_t x;
+    uint32_t y;
+    uint32_t z;
+    uint32_t lod;
+    RsAllocationCubemapFace face;
+    uint32_t ar[16];
+    uint32_t dimX, dimY, dimZ;
+    uint32_t dimArray;
+    const uint8_t *ptrIn;
+    uint8_t *ptrOut;
+    uint32_t eStrideIn, eStrideOut;
+    uint32_t yStrideIn, yStrideOut;
+} RsForEachStubParamStruct;
+*/
+
+float4 convert_float4(uchar4 i) {  // Widens each of the four 8-bit lanes to float.
+    float4 f4 = {i.x, i.y, i.z, i.w};
+    return f4;
+}
+
+uchar4 convert_uchar4(float4 i) {  // Plain (uchar) cast of each lane; no clamping is done here.
+    uchar4 f4 = {(uchar)i.x, (uchar)i.y, (uchar)i.z, (uchar)i.w};
+    return f4;
+}
+
+float4 clamp(float4 amount, float low, float high) {  // Limits each lane of amount to [low, high].
+    float4 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
+static float params[9] = { 0.f,  -1.f,  0.f,   // Kernel coefficients read by the convolve function and
+                          -1.f,   5.f, -1.f,   // overwritten by rsdIntrinsic_Convolve3x3_SetVar().
+                           0.f,  -1.f,  0.f }; // File-scope storage: one table for the whole process.
+
+void rsdIntrinsic_Convolve3x3_SetVar(const Context *dc, const Script *script,  // Overwrites the kernel table.
+                           uint32_t slot, void *data, size_t dataLength) {  // dc, script and slot are unused
+    memcpy (params, data, dataLength < sizeof(params) ? dataLength : sizeof(params));  // never write past params[8]
+}
+
+void rsdIntrinsic_Convolve3x3_uchar4(const RsForEachStubParamStruct *p,
+                                     uint32_t xstart, uint32_t xend,
+                                     uint32_t instep, uint32_t outstep) {
+    // Convolves row p->y, columns [xstart, xend), with the 3x3 `params` table; instep/outstep are unused.
+    uint32_t y1 = rsMax(rsMin((int32_t)p->y + 1, (int32_t)p->dimY - 1), 0);
+    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
+    const uint8_t *bp = (const uint8_t *)p->ptrIn;
+    const uchar4 *py0 = (const uchar4 *)(bp + p->yStrideIn * y2);
+    const uchar4 *py1 = (const uchar4 *)(bp + p->yStrideIn * p->y);
+    const uchar4 *py2 = (const uchar4 *)(bp + p->yStrideIn * y1);
+    // Neighbour rows/columns are clamped to [0, dim - 1]: index `dim` itself is one past the last valid one.
+    uchar4 *out = (uchar4 *)p->out;
+    // Results are written consecutively starting at p->out, one uchar4 per x.
+    for(uint32_t x = xstart; x < xend; x++) {
+        uint32_t x1 = rsMax(rsMin((int32_t)x+1, (int32_t)p->dimX - 1), 0);
+        uint32_t x2 = rsMax((int32_t)x-1, 0);
+        // Weight the 3x3 neighbourhood; x1 is the column to the right, x2 the column to the left.
+        float4 p00 = convert_float4(py0[x1]) * params[0];
+        float4 p01 = convert_float4(py0[x])  * params[1];
+        float4 p02 = convert_float4(py0[x2]) * params[2];
+        float4 p10 = convert_float4(py1[x1]) * params[3];
+        float4 p11 = convert_float4(py1[x])  * params[4];
+        float4 p12 = convert_float4(py1[x2]) * params[5];
+        float4 p20 = convert_float4(py2[x1]) * params[6];
+        float4 p21 = convert_float4(py2[x])  * params[7];
+        float4 p22 = convert_float4(py2[x2]) * params[8];
+        // Pairwise accumulation of the nine weighted taps; the total ends up in p20.
+        p00 += p01;
+        p02 += p10;
+        p11 += p12;
+        p20 += p21;
+
+        p22 += p00;
+        p02 += p11;
+
+        p20 += p22;
+        p20 += p02;
+        // Saturate to the 8-bit range before the narrowing cast.
+        p20 = clamp(p20, 0.f, 255.f);
+        *out = convert_uchar4(p20);
+        out++;
+    }
+
+}
+
diff --git a/driver/rsdIntrinsics.h b/driver/rsdIntrinsics.h
new file mode 100644
index 0000000..5aecd34
--- /dev/null
+++ b/driver/rsdIntrinsics.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSD_INTRINSICS_H
+#define RSD_INTRINSICS_H
+
+#include <rs_hal.h>
+
+void rsdIntrinsic_Convolve3x3_SetVar(const android::renderscript::Context *dc,  // copies the caller's data into the 3x3 kernel table
+                                     const android::renderscript::Script *script,
+                                     uint32_t slot, void *data, size_t dataLength);
+void rsdIntrinsic_Convolve3x3_uchar4(const android::renderscript::RsForEachStubParamStruct *,  // filters one row of uchar4 pixels
+                                     uint32_t xstart, uint32_t xend,  // column range [xstart, xend), named as in the definition
+                                     uint32_t instep, uint32_t outstep);
+
+
+
+#endif // RSD_INTRINSICS_H
+
diff --git a/rs.spec b/rs.spec
index f32443f..607f7dc 100644
--- a/rs.spec
+++ b/rs.spec
@@ -346,6 +346,11 @@
     ret RsScript
     }
 
+ScriptIntrinsicCreate {
+    param uint32_t id
+    param RsElement eid
+    ret RsScript
+    }
 
 ProgramStoreCreate {
     direct
diff --git a/rsDefines.h b/rsDefines.h
index 854df08..466469f 100644
--- a/rsDefines.h
+++ b/rsDefines.h
@@ -333,6 +333,12 @@
     RS_CULL_INVALID = 100,
 };
 
+enum RsScriptIntrisicID {                     // Ids passed to ScriptIntrinsicCreate and the driver's initIntrinsic hook.
+    RS_SCRIPT_INTRINSIC_ID_UNDEFINED = 0,     // zero: the driver's `if (drv->mIntrinsicID)` checks treat this as non-intrinsic
+    RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3 = 1,
+    RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5 = 2   // NOTE(review): driver code in this change only dispatches to the 3x3 kernel
+};
+
 typedef struct {
     RsA3DClassID classID;
     const char* objectName;
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
new file mode 100644
index 0000000..8e48d41
--- /dev/null
+++ b/rsScriptIntrinsic.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "rsContext.h"
+#include "rsScriptIntrinsic.h"
+#include <time.h>
+
+using namespace android;
+using namespace android::renderscript;
+
+ScriptIntrinsic::ScriptIntrinsic(Context *rsc) : Script(rsc) {  // Forwards rsc to Script; mIntrinsicID and mElement are set later by init().
+}
+
+ScriptIntrinsic::~ScriptIntrinsic() {  // Empty body: this destructor performs no explicit cleanup.
+}
+
+bool ScriptIntrinsic::init(Context *rsc, RsScriptIntrisicID iid, Element *e) {
+    mIntrinsicID = iid;
+    mElement.set(e);
+    // Report driver failure to the caller, which deletes the script when init() returns false.
+    // initIntrinsic() is declared to return bool in the HAL function table.
+    return rsc->mHal.funcs.script.initIntrinsic(rsc, this, iid, e);
+}
+
+bool ScriptIntrinsic::freeChildren() {  // Always returns false; nothing is released here.
+    return false;
+}
+
+void ScriptIntrinsic::setupScript(Context *rsc) {  // Empty override: performs no work.
+}
+
+uint32_t ScriptIntrinsic::run(Context *rsc) {  // Not a supported entry point: asserts, then returns 0.
+    rsAssert(!"ScriptIntrinsic::run - should not happen");
+    return 0;
+}
+
+
+void ScriptIntrinsic::runForEach(Context *rsc,
+                         uint32_t slot,
+                         const Allocation * ain,
+                         Allocation * aout,
+                         const void * usr,
+                         size_t usrBytes,
+                         const RsScriptCall *sc) {
+    // Pure pass-through: every argument goes unchanged to the driver's invokeForEach hook.
+    rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
+}
+
+void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {  // Empty override: all arguments are ignored.
+}
+
+void ScriptIntrinsic::serialize(Context *rsc, OStream *stream) const {  // Empty override: nothing is written to stream.
+}
+
+RsA3DClassID ScriptIntrinsic::getClassId() const {  // Returns the value 0 cast to RsA3DClassID.
+    return (RsA3DClassID)0;
+}
+
+
+
+namespace android {
+namespace renderscript {
+
+
+RsScript rsi_ScriptIntrinsicCreate(Context *rsc, uint32_t id, RsElement ve) {  // Allocates a ScriptIntrinsic and runs init() on it.
+    ScriptIntrinsic *si = new ScriptIntrinsic(rsc);
+    ALOGE("rsi_ScriptIntrinsicCreate %i", id);  // NOTE(review): logged on every call, not only on failure - confirm the severity is intended
+    if (!si->init(rsc, (RsScriptIntrisicID)id, (Element *)ve)) {  // id and ve are cast without validation
+        delete si;
+        return NULL;
+    }
+    return si;
+}
+
+}
+}
+
+
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
new file mode 100644
index 0000000..76d8f2b
--- /dev/null
+++ b/rsScriptIntrinsic.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_RS_SCRIPT_INTRINSIC_H
+#define ANDROID_RS_SCRIPT_INTRINSIC_H
+
+#include "rsScript.h"
+
+
+// ---------------------------------------------------------------------------
+namespace android {
+namespace renderscript {
+
+
+class ScriptIntrinsic : public Script {
+public:
+    // Element handed to init(); held through an ObjectBaseRef.
+    ObjectBaseRef<const Element> mElement;
+
+    ScriptIntrinsic(Context *);
+    virtual ~ScriptIntrinsic();
+    // Records iid and e, then calls the driver's initIntrinsic hook.
+    bool init(Context *rsc, RsScriptIntrisicID iid, Element *e);
+
+    // The remaining virtuals are implemented in rsScriptIntrinsic.cpp.
+    virtual void serialize(Context *rsc, OStream *stream) const;
+    virtual RsA3DClassID getClassId() const;
+    virtual bool freeChildren();
+    // Forwards to the driver's invokeForEach hook.
+    virtual void runForEach(Context *rsc,
+                            uint32_t slot,
+                            const Allocation * ain,
+                            Allocation * aout,
+                            const void * usr,
+                            size_t usrBytes,
+                            const RsScriptCall *sc = NULL);
+    // Invoke() and setupScript() have empty bodies; run() asserts.
+    virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
+    virtual void setupScript(Context *rsc);
+    virtual uint32_t run(Context *);
+protected:
+    uint32_t mIntrinsicID;  // stores the RsScriptIntrisicID passed to init()
+    float mParams[9];  // NOTE(review): not referenced elsewhere in this change; the driver keeps its own static table
+
+};
+
+
+}
+}
+#endif
+
+
diff --git a/rs_hal.h b/rs_hal.h
index 08938a1..16e3309 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -47,13 +47,25 @@
     const void *in;
     void *out;
     const void *usr;
-    size_t usr_len;
+    size_t usrLen;
     uint32_t x;
     uint32_t y;
     uint32_t z;
     uint32_t lod;
     RsAllocationCubemapFace face;
     uint32_t ar[16];
+
+    uint32_t dimX;
+    uint32_t dimY;
+    uint32_t dimZ;
+    uint32_t dimArray;
+
+    const uint8_t *ptrIn;
+    uint8_t *ptrOut;
+    uint32_t eStrideIn;
+    uint32_t eStrideOut;
+    uint32_t yStrideIn;
+    uint32_t yStrideOut;
 } RsForEachStubParamStruct;
 
 /**
@@ -78,6 +90,9 @@
                      uint8_t const *bitcode,
                      size_t bitcodeSize,
                      uint32_t flags);
+        bool (*initIntrinsic)(const Context *rsc, Script *s,
+                              RsScriptIntrisicID iid,
+                              Element *e);
 
         void (*invokeFunction)(const Context *rsc, Script *s,
                                uint32_t slot,