Reduce rsForEach overhead.

Change-Id: Iaabef7bb573233ef7c5756077f840ee933ee0c39

Fix spacing and reduce rsForEach overhead by about 50% by calling the launch stub once per row instead of once per element.
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 5fd5c35..269703c 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -226,6 +226,7 @@
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
+    outer_foreach_t fn = dc->mForEachLaunch[sig];
     while (1) {
         uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
         uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
@@ -239,16 +240,10 @@
         //LOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
         for (p.y = yStart; p.y < yEnd; p.y++) {
             uint32_t offset = mtls->dimX * p.y;
-            uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * offset);
-            const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * offset);
-
-            for (p.x = mtls->xStart; p.x < mtls->xEnd; p.x++) {
-                p.in = xPtrIn;
-                p.out = xPtrOut;
-                dc->mForEachLaunch[sig](&mtls->script->mHal.info.root, &p);
-                xPtrIn += mtls->eStrideIn;
-                xPtrOut += mtls->eStrideOut;
-            }
+            p.out = mtls->ptrOut + (mtls->eStrideOut * offset);
+            p.in = mtls->ptrIn + (mtls->eStrideIn * offset);
+            fn(&mtls->script->mHal.info.root, &p, mtls->xStart, mtls->xEnd,
+               mtls->eStrideIn, mtls->eStrideOut);
         }
     }
 }
@@ -262,6 +257,7 @@
     RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv;
     uint32_t sig = mtls->sig;
 
+    outer_foreach_t fn = dc->mForEachLaunch[sig];
     while (1) {
         uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
         uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
@@ -273,15 +269,10 @@
 
         //LOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
         //LOGE("usr ptr in %p,  out %p", mtls->ptrIn, mtls->ptrOut);
-        uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * xStart);
-        const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * xStart);
-        for (p.x = xStart; p.x < xEnd; p.x++) {
-            p.in = xPtrIn;
-            p.out = xPtrOut;
-            dc->mForEachLaunch[sig](&mtls->script->mHal.info.root, &p);
-            xPtrIn += mtls->eStrideIn;
-            xPtrOut += mtls->eStrideOut;
-        }
+        p.out = mtls->ptrOut + (mtls->eStrideOut * xStart);
+        p.in = mtls->ptrIn + (mtls->eStrideIn * xStart);
+        fn(&mtls->script->mHal.info.root, &p, xStart, xEnd,  // slice-local bounds, not the full mtls range
+           mtls->eStrideIn, mtls->eStrideOut);
     }
 }
 
@@ -392,22 +383,17 @@
         uint32_t sig = mtls.sig;
 
         //LOGE("launch 3");
+        outer_foreach_t fn = dc->mForEachLaunch[sig];
         for (p.ar[0] = mtls.arrayStart; p.ar[0] < mtls.arrayEnd; p.ar[0]++) {
             for (p.z = mtls.zStart; p.z < mtls.zEnd; p.z++) {
                 for (p.y = mtls.yStart; p.y < mtls.yEnd; p.y++) {
                     uint32_t offset = mtls.dimX * mtls.dimY * mtls.dimZ * p.ar[0] +
                                       mtls.dimX * mtls.dimY * p.z +
                                       mtls.dimX * p.y;
-                    uint8_t *xPtrOut = mtls.ptrOut + (mtls.eStrideOut * offset);
-                    const uint8_t *xPtrIn = mtls.ptrIn + (mtls.eStrideIn * offset);
-
-                    for (p.x = mtls.xStart; p.x < mtls.xEnd; p.x++) {
-                        p.in = xPtrIn;
-                        p.out = xPtrOut;
-                        dc->mForEachLaunch[sig](&s->mHal.info.root, &p);
-                        xPtrIn += mtls.eStrideIn;
-                        xPtrOut += mtls.eStrideOut;
-                    }
+                    p.out = mtls.ptrOut + (mtls.eStrideOut * offset);
+                    p.in = mtls.ptrIn + (mtls.eStrideIn * offset);
+                    fn(&mtls.script->mHal.info.root, &p, mtls.xStart, mtls.xEnd,
+                       mtls.eStrideIn, mtls.eStrideOut);
                 }
             }
         }
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index f8107d9..247f4dc 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -292,75 +292,175 @@
 }
 
+// rsdForEach17..31: each stub processes one row, invoking the script root once
+// per x in [x1, x2) and advancing the in/out pointers by instep/outstep bytes.
 static void rsdForEach17(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, uint32_t);
-    (*(fe*)vRoot)(p->in, p->y);
+    const uint8_t *pin = (const uint8_t *)p->in;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, y);
+        pin += instep;
+    }
 }
 
 static void rsdForEach18(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(void *, uint32_t);
-    (*(fe*)vRoot)(p->out, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pout, y);
+        pout += outstep;
+    }
 }
 
 static void rsdForEach19(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, void *, uint32_t);
-    (*(fe*)vRoot)(p->in, p->out, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const uint8_t *pin = (const uint8_t *)p->in;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, pout, y);
+        pin += instep;
+        pout += outstep;
+    }
 }
 
 static void rsdForEach21(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, const void *, uint32_t);
-    (*(fe*)vRoot)(p->in, p->usr, p->y);
+    const uint8_t *pin = (const uint8_t *)p->in;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, usr, y);
+        pin += instep;
+    }
 }
 
 static void rsdForEach22(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(void *, const void *, uint32_t);
-    (*(fe*)vRoot)(p->out, p->usr, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pout, usr, y);
+        pout += outstep;
+    }
 }
 
 static void rsdForEach23(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, void *, const void *, uint32_t);
-    (*(fe*)vRoot)(p->in, p->out, p->usr, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const uint8_t *pin = (const uint8_t *)p->in;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, pout, usr, y);
+        pin += instep;
+        pout += outstep;
+    }
 }
 
 static void rsdForEach25(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->in, p->x, p->y);
+    const uint8_t *pin = (const uint8_t *)p->in;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, x, y);
+        pin += instep;
+    }
 }
 
 static void rsdForEach26(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->out, p->x, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pout, x, y);
+        pout += outstep;
+    }
 }
 
 static void rsdForEach27(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->in, p->out, p->x, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const uint8_t *pin = (const uint8_t *)p->in;
+    uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, pout, x, y);
+        pin += instep;
+        pout += outstep;
+    }
 }
 
 static void rsdForEach29(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, const void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->in, p->usr, p->x, p->y);
+    const uint8_t *pin = (const uint8_t *)p->in;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, usr, x, y);
+        pin += instep;
+    }
 }
 
 static void rsdForEach30(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(void *, const void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->out, p->usr, p->x, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pout, usr, x, y);
+        pout += outstep;
+    }
 }
 
 static void rsdForEach31(const void *vRoot,
-        const android::renderscript::RsForEachStubParamStruct *p) {
+        const android::renderscript::RsForEachStubParamStruct *p,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep) {
     typedef void (*fe)(const void *, void *, const void *, uint32_t, uint32_t);
-    (*(fe*)vRoot)(p->in, p->out, p->usr, p->x, p->y);
+    uint8_t *pout = (uint8_t *)p->out;
+    const uint8_t *pin = (const uint8_t *)p->in;
+    const void *usr = p->usr;
+    const uint32_t y = p->y;
+    for (uint32_t x = x1; x < x2; x++) {
+        (*(fe*)vRoot)(pin, pout, usr, x, y);
+        pin += instep;
+        pout += outstep;
+    }
 }
 
 
diff --git a/driver/rsdCore.h b/driver/rsdCore.h
index 159b72a..ce86d11 100644
--- a/driver/rsdCore.h
+++ b/driver/rsdCore.h
@@ -28,7 +28,9 @@
 typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
 
 typedef void (*outer_foreach_t)(const void *,
-    const android::renderscript::RsForEachStubParamStruct *);
+    const android::renderscript::RsForEachStubParamStruct *,
+                                uint32_t x1, uint32_t x2,
+                                uint32_t instep, uint32_t outstep);
 
 typedef struct RsdSymbolTableRec {
     const char * mName;