Merge tag android-5.1.0_r1 into AOSP_5.1_MERGE

Change-Id: I92de070a286fc79da057be139cb5287dd8f40883
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bd276bf..f041ad9 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -42,6 +42,10 @@
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
 
+ifeq ($(RS_DISABLE_A53_WORKAROUND),true)
+LOCAL_CFLAGS_arm64 += -DDISABLE_A53_WORKAROUND
+endif
+
 LOCAL_SRC_FILES_arm64 += \
     rsCpuIntrinsics_advsimd_3DLUT.S \
     rsCpuIntrinsics_advsimd_Convolve.S \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index f09e334..4285dae 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -266,7 +266,7 @@
 
     GetCpuInfo();
 
-    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    int cpu = sysconf(_SC_NPROCESSORS_CONF);
     if(mRSC->props.mDebugMaxThreads) {
         cpu = mRSC->props.mDebugMaxThreads;
     }
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 123cc9f..9dccd80 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -297,7 +297,7 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+    if (gArchUseSIMD) {
         rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
@@ -367,9 +367,9 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+    if (gArchUseSIMD) {
         rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
-                 stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius);
+                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8c85277..6a7808e 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -126,7 +126,7 @@
 } Key_t;
 
 //Re-enable when intrinsic is fixed
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
 typedef struct {
     void (*column[4])(void);
     void (*store)(void);
@@ -184,7 +184,7 @@
     int ipa[4];
     float tmpFp[16];
     float tmpFpa[4];
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
     FunctionTab_t mFnTab;
 #endif
 
@@ -910,16 +910,20 @@
                 out += outstep * len;
                 in += instep * len;
             }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
             else {
                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
-                    rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    // Currently this generates off by one errors.
+                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    //x1 += len;
+                    //out += outstep * len;
+                    //in += instep * len;
                 } else {
                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                    x1 += len;
+                    out += outstep * len;
+                    in += instep * len;
                 }
-                x1 += len;
-                out += outstep * len;
-                in += instep * len;
             }
 #endif
         }
@@ -971,7 +975,7 @@
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
         }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
         else {
             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a835..e5953cf 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -105,7 +105,7 @@
                 convert_float4(py2[x]) * coeff[7] +
                 convert_float4(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
     *out = o;
 }
@@ -127,7 +127,7 @@
                 convert_float2(py2[x]) * coeff[7] +
                 convert_float2(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -147,7 +147,7 @@
                ((float)py2[x1]) * coeff[6] +
                ((float)py2[x]) * coeff[7] +
                ((float)py2[x2]) * coeff[8];
-    *out = clamp(px, 0.f, 255.f);
+    *out = clamp(px + 0.5f, 0.f, 255.f);
 }
 
 static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e2a6b8b..a2c29fd 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -125,7 +125,7 @@
                 convert_float4(py4[x2]) * coeff[22] +
                 convert_float4(py4[x3]) * coeff[23] +
                 convert_float4(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar4(px);
 }
 
@@ -168,7 +168,7 @@
                 convert_float2(py4[x2]) * coeff[22] +
                 convert_float2(py4[x3]) * coeff[23] +
                 convert_float2(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -211,7 +211,7 @@
                (float)(py4[x2]) * coeff[22] +
                (float)(py4[x3]) * coeff[23] +
                (float)(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = px;
 }
 
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 474f82d..19607c9 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -83,7 +83,7 @@
 
 static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                          float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -112,13 +112,13 @@
                                   convert_float4(yp3[xs3]), xf);
 
     float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return convert_uchar4(p);
 }
 
 static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                          float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -147,13 +147,13 @@
                                   convert_float2(yp3[xs3]), xf);
 
     float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return convert_uchar2(p);
 }
 
 static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                         float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -171,7 +171,7 @@
                                  (float)yp3[xs2], (float)yp3[xs3], xf);
 
     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return (uchar)p;
 }
 
@@ -189,8 +189,8 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -208,7 +208,7 @@
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
@@ -229,8 +229,8 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -248,7 +248,7 @@
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
@@ -269,8 +269,8 @@
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -288,7 +288,7 @@
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index c53ef31..e191e25 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -161,8 +161,8 @@
         out++;
         x1++;
     }
-// reenable for ARM64 when intrinsic is fixed
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if((x2 > x1) && gArchUseSIMD) {
         int32_t len = x2 - x1;
         if (cstep == 1) {
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 929f76f..fc1eefe 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -52,17 +52,17 @@
  *      x6 -- rup
  *      x7 -- rdn
  *      x12 -- switch index
- *      q0-q3 -- coefficient table
+ *      v0-v3 -- coefficient table
  *      x13 = -pitch
  *      x15 = top-row in
  *      x19 = bottom-row in
  * Output:
  *      x1 += 16
- *      q10,q11 -- 16 convolved columns
+ *      v10,v11 -- 16 convolved columns
  * Modifies:
  *      x10 = upper row pointer
  *      x11 = lower row pointer
- *      q12-q15 = temporary sums
+ *      v12-v15 = temporary sums
  */
 .macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
   .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
@@ -146,15 +146,15 @@
  * When the buffer gets too big the buffer at [x9] is used.
  *
  * Input:
- *      q4-q11 -- convoltion window
+ *      v16-v31,v4-v11 -- convolution window
  *      x9 -- pointer to additional convolution window data
  * Output:
  *      x9 -- updated buffer pointer (if used)
  *      d31 -- result to be stored
  * Modifies:
  *      x12 -- temp buffer pointer
- *      q12-q13 -- temporaries for load and vext operations.
- *      q14-q15 -- intermediate sums
+ *      v12-v13 -- temporaries for load and vext operations.
+ *      v14-v15 -- intermediate sums
  */
 #define TUNED_LIST1 8, 16
 .macro hconv1_8/*{{{*/
@@ -407,7 +407,7 @@
             umlal2      v15.4s, v12.8h, v3.h[1]
             umlal       v14.4s, v13.4h, v3.h[1]
             umlal2      v15.4s, v13.8h, v3.h[1]
-    124:    ext         v12.16b, v3.16b, v4.16b, #7*2
+    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
             ext         v13.16b, v9.16b, v10.16b, #7*2
             umlal       v14.4s, v12.4h, v3.h[0]
             umlal2      v15.4s, v12.8h, v3.h[0]
@@ -1055,64 +1055,47 @@
             ret
 END(fetch_generic_asm)
 
-/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
+/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in v12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-PRIVATE(prefetch_clamp1)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            mov         v11.16b, v10.16b
-            sub         x1, x1, #16
-1:          mov         v12.16b, v11.16b
-            movi        v13.8b, #0xff
-            tbz         x11, #2, 1f
-            ext         v12.16b, v12.16b, v12.16b, #4*2
-            sub         x1, x1, #8
-            shl         v13.2d, v13.2d, #32
-1:          tbz         x11, #1, 1f
-            ext         v12.16b, v12.16b, v12.16b, #6*2
-            sub         x1, x1, #4
-            shl         v13.2d, v13.2d, #16
-1:          tbz         x11, #0, 1f
-            ext         v12.16b, v12.16b, v12.16b, #7*2
-            sub         x1, x1, #2
-            shl         v13.2d, v13.2d, #8
-1:          dup         v12.8h, v12.h[6]
-            sxtl        v13.8h, v13.8b
-            bif         v11.16b, v12.16b, v13.16b
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
+PRIVATE(prefetch_clampright1)
+            ands        x12, x11, #15
+            beq         1f
+            sub         x12, x12, #1
+            sub         sp, sp, #64
+            st1         {v10.8h,v11.8h}, [sp]
+            add         x12, sp, x12, LSL #1
+            ld1r        {v12.8h}, [x12]
+            st1         {v12.8h}, [x12], #16
+            st1         {v12.8h}, [x12]
+            ld1         {v10.8h,v11.8h}, [sp]
+            add         sp, sp, #64
             ret
-END(prefetch_clamp1)
+1:          dup         v12.8h, v11.h[7]
+            ret
+END(prefetch_clampright1)
 
-PRIVATE(prefetch_clamp4)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            sub         x1, x1, #16     // what's this?
-            mov         v11.16b, v10.16b
-1:          dup         v12.2d, v11.d[1]
-            tbz         x11, #2, 1f
-            dup         v12.2d, v11.d[0]
-            sub         x1, x1, #8
-            dup         v11.2d, v11.d[0]
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
+PRIVATE(prefetch_clampright4)
+            ands        x12, x11, #15
+            beq         1f
+            sub         x12, x12, #4
+            sub         sp, sp, #64
+            st1         {v10.8h,v11.8h}, [sp]
+            add         x12, sp, x12, LSL #1
+            ld1r        {v12.2d}, [x12]
+            st1         {v12.8h}, [x12], #16
+            st1         {v12.8h}, [x12]
+            ld1         {v10.8h,v11.8h}, [sp]
+            add         sp, sp, #64
             ret
-END(prefetch_clamp4)
+1:          dup         v12.2d, v11.d[1]
+            ret
+END(prefetch_clampright4)
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1130,10 @@
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
 4:          b           4f+4
-           //v12 contains pad word from prefetch_clamp call
+           //v12 contains pad word from prefetch_clampright call
             prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
   .if \rem > 0
             b           4f+4
@@ -1209,24 +1192,18 @@
   .else
             dup         v9.2d, v10.d[0]
   .endif
-            tst         x10, #15
+            ands        x12, x10, #15
             beq         2f
-            sub         x12, xzr, x10
-            tbz         x10, #3, 1f
-            mov         v11.16b, v10.16b
-            mov         v10.16b, v9.16b
-1:          tbz         x12, #2, 1f
-            ext         v11.16b, v10.16b, v11.16b, #4*2
-            ext         v10.16b, v9.16b, v10.16b, #4*2
-  .if \step == 1
-  1:        tbz         x12, #1, 1f
-            ext         v11.16b, v10.16b, v11.16b, #2*2
-            ext         v10.16b, v9.16b, v10.16b, #2*2
-  1:        tbz         x12, #0, 1f
-            ext         v11.16b, v10.16b, v11.16b, #1*2
-            ext         v10.16b, v9.16b, v10.16b, #1*2
-  .endif
-1:          sub         x1, x1, x10
+            sub         sp, sp, #32
+            st1         {v10.8h,v11.8h}, [sp]
+            sub         x12, sp, x12, LSL #1
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            ld1         {v10.8h,v11.8h}, [x12]
+            add         sp, sp, #64
+            sub         x1, x1, x10
             sub         x15, x15, x10
             sub         x19, x19, x10
             bic         x10, x10, #15
@@ -1363,13 +1340,13 @@
             b           3b
 4:          tbz         x3, #2, 1f
             st1         {v15.s}[0], [x0], #4
-            ext         v15.16b, v15.16b, v15.16b, #4*2
+            ext         v15.8b, v15.8b, v15.8b, #4
 1:          tbz         x3, #1, 1f
             st1         {v15.h}[0], [x0], #2
-            ext         v15.16b, v15.16b, v15.16b, #2*2
+            ext         v15.8b, v15.8b, v15.8b, #2
 1:          tbz         x3, #0, 5f
             st1         {v15.b}[0], [x0], #1
-            ext         v15.16b, v15.16b, v15.16b, #1*2
+            ext         v15.8b, v15.8b, v15.8b, #1
 5:          nop
 .endm
 
@@ -1438,7 +1415,6 @@
 
             ldr         x12, [sp, #88] // tab
 
-            add         x0, x0, x8
             add         x1, x1, x8
 
             cmp         x6, x5
@@ -1448,7 +1424,7 @@
             cmp         x8, x5
             csel        x8, x5, x8, hs
             cmp         x9, x5
-            csel        x9, x5, x8, hs
+            csel        x9, x5, x9, hs
 
             add         x4, x8, x9
             add         x4, x4, x3
@@ -1504,7 +1480,6 @@
 
             ldr         x12, [sp, #88]
 
-            add         x0, x0, x8, LSL #2
             add         x1, x1, x8, LSL #2
 
             cmp         x6, x5
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
index 632ef7a..bb4b7ae 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -21,60 +21,127 @@
  * register.  This macro will be called from within several different wrapper
  * variants for different data layouts.  Y data starts with the even and odd
  * bytes split into the low parts of v8 and v9 respectively.  U and V are in
- * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
- * pre-loaded with a constant 0xff alpha channel.
+ * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
+ * are pre-loaded with a constant 0xff alpha channel.
  *
  * The complicated arithmetic is the result of refactoring the original
  * equations to avoid 16-bit overflow without losing any precision.
  */
-.macro yuvkern
-        movi        v7.8b, #149
+.macro yuvkern, regu=v10, regv=v11
+        /* v0   out R_lo / even R_lo accumulator
+         * v1   out G_lo / even G_lo accumulator
+         * v2   out B_lo / even B_lo accumulator
+         * v3   out A_lo / const 0xff*ff
+         * v4   out R_hi / even R_hi accumulator
+         * v5   out G_hi / even G_hi accumulator
+         * v6   out B_hi / even B_hi accumulator
+         * v7   out A_hi / const 0xff*ff
+         * v8   even Y   / G_lo luma tmp
+         * v9   odd Y    / G_lo luma tmp
+         * \regu in U
+         * \regv in V
+         * v12  R_lo luma tmp
+         * v13  B_lo luma tmp
+         * v14  R_hi luma tmp
+         * v15  B_hi luma tmp
+         * v16  odd R_lo accumulator
+         * v17  odd G_lo accumulator
+         * v18  odd B_lo accumulator
+         * v19  multiplier extra bits low
+         * v20  odd R_hi accumulator
+         * v21  odd G_hi accumulator
+         * v22  odd B_hi accumulator
+         * v23  multiplier extra bits high
+         * v24  constant 149
+         * v25  constant 50
+         * v26  constant 104
+         * v27  constant 204
+         * v28  constant 254
+         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+         */
 
-        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
-        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149
+        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
+        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
+        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
+        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149
 
-        movi        v7.8b, #50
-        movi        v10.8b, #104
-        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
-        umlal       v8.8h, v17.8b, v10.8b
+        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
+        umlal       v8.8h, \regv\().8b, v26.8b
+        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
+        umlal2      v9.8h, \regv\().16b, v26.16b
 
-        ushr        v7.8b, v17.8b, #1
-        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
-        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)
+        ushr        v19.16b, \regv\().16b, #1
+        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
+        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)
 
-        ushll       v7.8h, v16.8b, #2
-        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
-        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)
+        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
+        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)
 
-        movi        v7.16b, #204
-        movi        v10.8b, #254
-        umull       v11.8h, v17.8b, v7.8b     // r2 = v * 204
-        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254
+        ushll       v19.8h, \regu\().8b,  #2
+        ushll2      v23.8h, \regu\().16b, #2
+        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
+        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)
 
-        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
-        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
-        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
-        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1
+        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
+        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)
 
-        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
-        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
-        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
+        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254
 
-        uqrshrn     v0.8b, v0.8h, #6
-        uqrshrn     v4.8b, v4.8h, #6
-        uqrshrn     v1.8b, v1.8h, #7
-        uqrshrn     v5.8b, v5.8h, #7
-        uqrshrn     v2.8b, v2.8h, #6
-        uqrshrn     v6.8b, v6.8h, #6
+        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
+        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254
 
-        zip1        v0.16b, v0.16b, v4.16b
-        zip1        v1.16b, v1.16b, v5.16b
-        zip1        v2.16b, v2.16b, v6.16b
+        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
+        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
+        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1
+
+        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
+        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
+        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
+        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1
+
+        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
+        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
+        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
+        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
+        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b,  v0.8h,  #6
+        uqrshrn     v16.8b, v16.8h, #6
+        uqrshrn     v1.8b,  v1.8h,  #7
+        uqrshrn     v17.8b, v17.8h, #7
+        uqrshrn     v2.8b,  v2.8h,  #6
+        uqrshrn     v18.8b, v18.8h, #6
+
+        uqrshrn     v4.8b,  v4.8h,  #6
+        uqrshrn     v20.8b, v20.8h, #6
+        uqrshrn     v5.8b,  v5.8h,  #7
+        uqrshrn     v21.8b, v21.8h, #7
+        uqrshrn     v6.8b,  v6.8h,  #6
+        uqrshrn     v22.8b, v22.8h, #6
+
+        zip1        v0.16b, v0.16b, v16.16b
+        zip1        v1.16b, v1.16b, v17.16b
+        zip1        v2.16b, v2.16b, v18.16b
+
+        zip1        v4.16b, v4.16b, v20.16b
+        zip1        v5.16b, v5.16b, v21.16b
+        zip1        v6.16b, v6.16b, v22.16b
 .endm
 
 /* Define the wrapper code which will load and store the data, iterate the
@@ -83,50 +150,51 @@
  * being handled.
  */
 .macro wrap_line kernel, interleaved=0, swapuv=0
-
+        movi        v24.16b, #149
+        movi        v25.16b, #50
+        movi        v26.16b, #104
+        movi        v27.16b, #204
+        movi        v28.16b, #254
         mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        dup         v13.8h, w5
+        dup         v29.8h, w5
         mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        dup         v14.8h, w5
+        dup         v30.8h, w5
         mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        dup         v15.8h, w5
+        dup         v31.8h, w5
 
         movi        v3.16b, #0xff
+        movi        v7.16b, #0xff
 
-        subs        x2, x2, #16
+        subs        x2, x2, #32
         bhs         1f
         b           2f
 
         .align 4
-1:      ld2         {v8.8b,v9.8b}, [x1], #16
-//      prfm PLDL1STRM, [x1, #256]
+1:      ld2         {v8.16b,v9.16b}, [x1], #32
   .if \interleaved
-    .if \swapuv
-        ld2         {v17.8b,v18.8b}, [x3], #16
-        mov         v16.8b, v18.8b
-    .else
-        ld2         {v16.8b,v17.8b}, [x3], #16
-    .endif
-//      prfm PLD1STRM,  [x3, #256]
+        ld2         {v10.16b,v11.16b}, [x3], #32
   .else
-        ld1         {v16.8b}, [x3], #8
-        ld1         {v17.8b}, [x4], #8
-//      prfm PLD1STRM,  [x3, #128]
-//      prfm PLD1STRM,  [x4, #128]
+        ld1         {v10.16b}, [x3], #16
+        ld1         {v11.16b}, [x4], #16
   .endif
 
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
         \kernel
+  .endif
 
-        subs        x2, x2, #16
+        subs        x2, x2, #32
 
-        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+        st4         {v0.16b - v3.16b}, [x0], #64
+        st4         {v4.16b - v7.16b}, [x0], #64
 
         bhs         1b
 
-2:      adds        x2, x2, #16
+2:      adds        x2, x2, #32
         beq         2f
 
-        /* To handle the tail portion of the data (something less than 16
+        /* To handle the tail portion of the data (something less than 32
          * bytes) load small power-of-two chunks into working registers.  It
          * doesn't matter where they end up in the register; the same process
          * will store them back out using the same positions and the
@@ -135,40 +203,48 @@
          */
         movi        v8.8b, #0
         movi        v9.8b, #0
-        movi        v16.8b, #0
-        movi        v17.8b, #0
+        movi        v10.8b, #0
+        movi        v11.8b, #0
 
-        tbz         x2, #3, 1f
-        ld1         {v9.8b}, [x1], #8
+        tbz         x2, #4, 1f
+        ld1         {v9.16b}, [x1], #16
   .if \interleaved
-        ld1         {v17.8b}, [x3], #8
+        ld1         {v11.16b}, [x3], #16
   .else
-        ld1         {v16.s}[1], [x3], #4
-        ld1         {v17.s}[1], [x4], #4
+        ld1         {v10.d}[1], [x3], #8
+        ld1         {v11.d}[1], [x4], #8
+  .endif
+1:      tbz         x2, #3, 1f
+        ld1         {v8.d}[1], [x1], #8
+  .if \interleaved
+        ld1         {v10.d}[1], [x3], #8
+  .else
+        ld1         {v10.s}[1], [x3], #4
+        ld1         {v11.s}[1], [x4], #4
   .endif
 1:      tbz         x2, #2, 1f
         ld1         {v8.s}[1], [x1], #4
   .if \interleaved
-        ld1         {v16.s}[1], [x3], #4
+        ld1         {v10.s}[1], [x3], #4
   .else
-        ld1         {v16.h}[1], [x3], #2
-        ld1         {v17.h}[1], [x4], #2
+        ld1         {v10.h}[1], [x3], #2
+        ld1         {v11.h}[1], [x4], #2
   .endif
 1:      tbz         x2, #1, 1f
         ld1         {v8.h}[1], [x1], #2
   .if \interleaved
-        ld1         {v16.h}[1], [x3], #2
+        ld1         {v10.h}[1], [x3], #2
   .else
-        ld1         {v16.b}[1], [x3], #1
-        ld1         {v17.b}[1], [x4], #1
+        ld1         {v10.b}[1], [x3], #1
+        ld1         {v11.b}[1], [x4], #1
   .endif
 1:      tbz         x2, #0, 1f
         ld1         {v8.b}[1], [x1], #1
   .if \interleaved
-        ld1         {v16.h}[0], [x3], #2
+        ld1         {v10.h}[0], [x3], #2
   .else
-        ld1         {v16.b}[0], [x3], #1
-        ld1         {v17.b}[0], [x4], #1
+        ld1         {v10.b}[0], [x3], #1
+        ld1         {v11.b}[0], [x4], #1
   .endif
 
         /* One small impediment in the process above is that some of the load
@@ -176,29 +252,38 @@
          * same time as loading only part of a register.  So the data is loaded
          * linearly and unpacked manually at this point if necessary.
          */
-1:      uzp1        v8.16b, v8.16b, v9.16b
+1:      mov         v12.16b, v8.16b
+        uzp1        v8.16b, v12.16b, v9.16b
+        uzp2        v9.16b, v12.16b, v9.16b
   .if \interleaved
-    .if \swapuv
-        uzp1        v16.16b, v17.16b, v16.16b
-    .else
-        uzp1        v16.16b, v16.16b, v17.16b
-    .endif
+        mov         v12.16b, v10.16b
+        uzp1        v10.16b, v12.16b, v11.16b
+        uzp2        v11.16b, v12.16b, v11.16b
   .endif
 
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
         \kernel
+  .endif
 
         /* As above but with the output; structured stores for partial vectors
          * aren't available, so the data is re-packed first and stored linearly.
          */
-        zip1        v4.16b, v0.16b, v2.16b
-        zip2        v6.16b, v0.16b, v2.16b
-        zip1        v5.16b, v1.16b, v3.16b
-        zip2        v7.16b, v1.16b, v3.16b
-        zip1        v0.16b, v4.16b, v5.16b
-        zip2        v1.16b, v4.16b, v5.16b
-        zip1        v2.16b, v6.16b, v7.16b
-        zip2        v3.16b, v6.16b, v7.16b
+        zip1        v16.16b, v0.16b, v2.16b
+        zip2        v18.16b, v0.16b, v2.16b
+        zip1        v17.16b, v1.16b, v3.16b
+        zip2        v19.16b, v1.16b, v3.16b
+        zip1        v0.16b, v16.16b, v17.16b
+        zip2        v1.16b, v16.16b, v17.16b
+        zip1        v2.16b, v18.16b, v19.16b
+        zip2        v3.16b, v18.16b, v19.16b
 
+        /* Luckily v4-v7 don't need to be unzipped because they are a complete
+         * set of four and can be stored using st4. */
+
+        tbz         x2, #4, 1f
+        st4         {v4.16b - v7.16b}, [x0], #64
 1:      tbz         x2, #3, 1f
         st1         {v2.16b,v3.16b}, [x0], #32
 1:      tbz         x2, #2, 1f
@@ -225,7 +310,7 @@
         add         x1, x1, x4
         add         x4, x3, x6
         add         x3, x2, x6
-        sub         x2, x5, x6, LSL #2
+        sub         x2, x5, x6, LSL #1
 
         sub         x6, sp, #32
         sub         sp, sp, #64
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index 8fc47f5..a7ae795 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -15,6 +15,7 @@
  */
 
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
 #define END(f) .fnend; .size f, .-f;
 
 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
@@ -1049,7 +1050,7 @@
 /* Dedicated function wrapper for the fetch macro, for the cases where
  * performance isn't that important, to keep code size down.
  */
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
             push        {r10,r11}
             fetch
             pop         {r10,r11}
@@ -1060,61 +1061,46 @@
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in q12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-ENTRY(prefetch_clamp1)
-            rsb         r11, r11, #0
-            tst         r11, #8
+PRIVATE(prefetch_clampright1)
+            ands        r12, r11, #15
             beq         1f
-            vmov.u16    q11, q10
-            sub         r1, r1, #16
-1:          vmov.u16    q12, q11
-            vmov.i8     d26, #0xff
-            tst         r11, #4
-            beq         1f
-            vext.u16    q12, q12, q12, #4
-            sub         r1, r1, #8
-            vshl.u64    d26, d26, #32
-1:          tst         r11, #2
-            beq         1f
-            vext.u16    q12, q12, q12, #6
-            sub         r1, r1, #4
-            vshl.u64    d26, d26, #16
-1:          tst         r11, #1
-            beq         1f
-            vext.u16    q12, q12, q12, #7
-            sub         r1, r1, #2
-            vshl.u64    d26, d26, #8
-1:          vdup.u16    q12, d25[2]
-            vmovl.s8    q13, d26
-            vbif        q11, q12, q13
-1:          tst         r11, #8
-            beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #1
+            sub         sp, sp, #64
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1
+            vld1.u16    {d24[]}, [r12]
+            vld1.u16    {d25[]}, [r12]
+            vst1.u16    {q12}, [r12]!
+            vst1.u16    {q12}, [r12]
+            vld1.u16    {q10,q11}, [sp]
+            add         sp, sp, #64
             bx          lr
-END(prefetch_clamp1)
+1:          vdup.u16    q12, d23[3]
+            bx          lr
+END(prefetch_clampright1)
 
-ENTRY(prefetch_clamp4)
-            rsb         r11, r11, #0
-            tst         r11, #8
+PRIVATE(prefetch_clampright4)
+            ands        r12, r11, #15
             beq         1f
-            sub         r1, r1, #16
-            vmov.u16    q11, q10
-1:          vmov        d24, d23
-            tst         r11, #4
-            beq         1f
-            vmov        d24, d22
-            sub         r1, r1, #8
-            vmov        d23, d22
-1:          vmov        d25, d24
-            tst         r11, #8
-            beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #4
+            sub         sp, sp, #64
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1
+            vld1.u64    {d24}, [r12]
+            vld1.u64    {d25}, [r12]
+            vst1.u16    {q12}, [r12]!
+            vst1.u16    {q12}, [r12]
+            vld1.u16    {q10,q11}, [sp]
+            add         sp, sp, #64
             bx          lr
-END(prefetch_clamp4)
+1:          vmov.u16    d24, d23
+            vmov.u16    d25, d23
+            bx          lr
+END(prefetch_clampright4)
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1133,10 @@
             prefetch_out \qa, \qb, \store, q10, q11, d23
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, q10, q11, d23
 4:          b           4f+4
-            @q12 contains pad word from prefetch_clam call
+            @q12 contains pad word from prefetch_clampright call
             prefetch_out \qa, \qb, \store, q12, q12, d25
   .if \rem > 0
             b           4f+4
@@ -1205,28 +1191,18 @@
             vmov.u16    d18, d20
             vmov.u16    d19, d20
   .endif
-            tst         r10, #15
+            ands        r12, r10, #15
             beq         2f
-            rsb         r12, r10, #0
-            tst         r10, #8
-            beq         1f
-            vmov.u16    q11, q10
-            vmov.u16    q10, q9
-1:          tst         r12, #4
-            beq         1f
-            vext.u16    q11, q10, q11, #4
-            vext.u16    q10, q9, q10, #4
-  .if \step == 1
-  1:        tst         r12, #2
-            beq         1f
-            vext.u16    q11, q10, q11, #2
-            vext.u16    q10, q9, q10, #2
-  1:        tst         r12, #1
-            beq         1f
-            vext.u16    q11, q10, q11, #1
-            vext.u16    q10, q9, q10, #1
-  .endif
-1:          sub         r1, r1, r10
+            sub         sp, sp, #32
+            vst1.u16    {q10,q11}, [sp]
+            sub         r12, sp, r12, LSL #1
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            vld1.u16    {q10,q11}, [r12]
+            add         sp, sp, #64
+            sub         r1, r1, r10
             bic         r10, r10, #15
             add         r1, r1, r10
 2:
@@ -1383,7 +1359,7 @@
 .endm
 
 .irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
             push        {r12,lr}
 
             sub         r1, r1, r8
@@ -1397,7 +1373,7 @@
 .endr
 
 .irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
             sub         r12, sp, #0x200
             bic         r9, r12, #0x3fc
             mov         sp, r9
@@ -1447,8 +1423,7 @@
 
             ldr         r12, [sp,#124]
 
-            add         r0, r0, r8 @, LSL #2 /* for blur4 option */
-            add         r1, r1, r8 @, LSL #2 /* for blur4 option */
+            add         r1, r1, r8
 
             cmp         r6, r5
             movhi       r6, r5
@@ -1503,7 +1478,6 @@
 
             ldr         r12, [sp,#124]
 
-            add         r0, r0, r8, LSL #2
             add         r1, r1, r8, LSL #2
 
             cmp         r6, r5
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda1..e8b3fb6 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -230,6 +230,11 @@
     args->push_back("-mtriple");
     args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
 
+    // Enable the Cortex-A53 erratum 835769 codegen workaround by default.
+#if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND)
+    args->push_back("-aarch64-fix-cortex-a53-835769");
+#endif
+
     // Execute the bcc compiler.
     if (useRSDebugContext) {
         args->push_back("-rs-debug-ctx");
diff --git a/driver/runtime/arch/asimd.ll b/driver/runtime/arch/asimd.ll
index e1a54b4..efc53c8 100644
--- a/driver/runtime/arch/asimd.ll
+++ b/driver/runtime/arch/asimd.ll
@@ -1116,8 +1116,8 @@
 }
 
 ; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
-define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
-    %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<4 x i32> %color) nounwind readnone {
+    %1 = bitcast <4 x i32> %color to <4 x float>
     %2 = insertelement <4 x float> %1, float 1.0, i32 3
     %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone
     ret <4 x i8> %3
diff --git a/driver/runtime/ll32/allocation.ll b/driver/runtime/ll32/allocation.ll
index d0b3932..21d7cac 100644
--- a/driver/runtime/ll32/allocation.ll
+++ b/driver/runtime/ll32/allocation.ll
@@ -650,17 +650,20 @@
 }
 
 
-define <4 x i64> @__rsAllocationVLoadXImpl_long4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
   %3 = load <4 x i64>* %2, align 8
-  ret <4 x i64> %3
+  store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !52
+  ret void
 }
-define <3 x i64> @__rsAllocationVLoadXImpl_long3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
-  %2 = bitcast i8* %1 to <3 x i64>*
-  %3 = load <3 x i64>* %2, align 8
-  ret <3 x i64> %3
+  %2 = bitcast i8* %1 to <4 x i64>*
+  %3 = load <4 x i64>* %2, align 8
+  %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
+  store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !47
+  ret void
 }
 define <2 x i64> @__rsAllocationVLoadXImpl_long2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
@@ -669,17 +672,20 @@
   ret <2 x i64> %3
 }
 
-define <4 x i64> @__rsAllocationVLoadXImpl_ulong4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
   %3 = load <4 x i64>* %2, align 8
-  ret <4 x i64> %3
+  store <4 x i64> %3, <4 x i64>* %agg.result, align 32, !tbaa !48
+  ret void
 }
-define <3 x i64> @__rsAllocationVLoadXImpl_ulong3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
-  %2 = bitcast i8* %1 to <3 x i64>*
-  %3 = load <3 x i64>* %2, align 8
-  ret <3 x i64> %3
+  %2 = bitcast i8* %1 to <4 x i64>*
+  %3 = load <4 x i64>* %2, align 8
+  %4 = bitcast <3 x i64>* %agg.result to <4 x i64>*
+  store <4 x i64> %3, <4 x i64>* %4, align 32, !tbaa !51
+  ret void
 }
 define <2 x i64> @__rsAllocationVLoadXImpl_ulong2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
@@ -821,17 +827,20 @@
   ret <2 x float> %3
 }
 
-define <4 x double> @__rsAllocationVLoadXImpl_double4([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double4(<4 x double>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x double>*
   %3 = load <4 x double>* %2, align 8
-  ret <4 x double> %3
+  store <4 x double> %3, <4 x double>* %agg.result, align 32, !tbaa !60
+  ret void
 }
-define <3 x double> @__rsAllocationVLoadXImpl_double3([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double3(<3 x double>* noalias nocapture sret %agg.result, [1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
-  %2 = bitcast i8* %1 to <3 x double>*
-  %3 = load <3 x double>* %2, align 8
-  ret <3 x double> %3
+  %2 = bitcast i8* %1 to <4 x double>*
+  %3 = load <4 x double>* %2, align 8
+  %4 = bitcast <3 x double>* %agg.result to <4 x double>*
+  store <4 x double> %3, <4 x double>* %4, align 32, !tbaa !59
+  ret void
 }
 define <2 x double> @__rsAllocationVLoadXImpl_double2([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #0 {
   %1 = tail call i8* @rsOffsetNs([1 x i32] %a.coerce, i32 %x, i32 %y, i32 %z) #2
diff --git a/driver/runtime/ll64/allocation.ll b/driver/runtime/ll64/allocation.ll
index d026ce8..c667b4b 100644
--- a/driver/runtime/ll64/allocation.ll
+++ b/driver/runtime/ll64/allocation.ll
@@ -665,14 +665,14 @@
 }
 
 
-define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
   %3 = load <4 x i64>* %2, align 8
   store <4 x i64> %3, <4 x i64>* %agg.result
   ret void
 }
-define void @__rsAllocationVLoadXImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_long3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i64>*
   %3 = load <3 x i64>* %2, align 8
@@ -686,14 +686,14 @@
   ret <2 x i64> %3
 }
 
-define void @__rsAllocationVLoadXImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong4(<4 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x i64>*
   %3 = load <4 x i64>* %2, align 8
   store <4 x i64> %3, <4 x i64>* %agg.result
   ret void
 }
-define void @__rsAllocationVLoadXImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_ulong3(<3 x i64>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x i64>*
   %3 = load <3 x i64>* %2, align 8
@@ -840,14 +840,14 @@
   ret <2 x float> %3
 }
 
-define void @__rsAllocationVLoadXImpl_double4(<4 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double4(<4 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <4 x double>*
   %3 = load <4 x double>* %2, align 8
   store <4 x double> %3, <4 x double>* %agg.result
   ret void
 }
-define void @__rsAllocationVLoadXImpl_double3(<3 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #0 {
+define void @__rsAllocationVLoadXImpl_double3(<3 x double>* noalias nocapture sret %agg.result, %struct.rs_allocation* nocapture readonly %a, i32 %x, i32 %y, i32 %z) #1 {
   %1 = tail call i8* @rsOffsetNs(%struct.rs_allocation* %a, i32 %x, i32 %y, i32 %z) #2
   %2 = bitcast i8* %1 to <3 x double>*
   %3 = load <3 x double>* %2, align 8
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
index 7cf7caf..e34666a 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
@@ -103,8 +103,8 @@
         HISTOGRAM ("Histogram"),
         MANDELBROT_DOUBLE ("Mandelbrot fp64"),
         RESIZE_BICUBIC_SCRIPT ("Resize BiCubic Script"),
-        RESIZE_BICUBIC_INTRINSIC ("Resize BiCubic Intrinsic");
-
+        RESIZE_BICUBIC_INTRINSIC ("Resize BiCubic Intrinsic"),
+        MIRROR ("Mirror Image");
 
         private final String name;
 
@@ -374,6 +374,9 @@
         case RESIZE_BICUBIC_INTRINSIC:
             mTest = new Resize(true);
             break;
+        case MIRROR:
+            mTest = new Mirror();
+            break;
         }
 
         mTest.createBaseTest(this);
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingTest.java b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingTest.java
index 1a4793c..7c8ce63 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingTest.java
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingTest.java
@@ -404,4 +404,10 @@
         TestAction ta = new TestAction(TestName.MANDELBROT_DOUBLE);
         runTest(ta, TestName.MANDELBROT_DOUBLE.name());
     }
+    // Test case 42: Mirror
+    @LargeTest
+    public void testMirror() {
+        TestAction ta = new TestAction(TestName.MIRROR);
+        runTest(ta, TestName.MIRROR.name());  // run it, labelled with the constant's name()
+    }
 }
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/Mirror.java b/java/tests/ImageProcessing/src/com/android/rs/image/Mirror.java
new file mode 100644
index 0000000..b59cb92
--- /dev/null
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/Mirror.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image;
+
+import java.lang.Math;
+
+import android.renderscript.Allocation;
+import android.renderscript.Element;
+import android.renderscript.RenderScript;
+import android.renderscript.Script;
+import android.renderscript.ScriptC;
+import android.renderscript.Type;
+import android.util.Log;
+
+public class Mirror extends TestBase {  // Image test driven by the ScriptC_mirror script.
+    private ScriptC_mirror mScript;  // Created in createTest(), launched in runTest().
+    private int mWidth;   // X dimension of mInPixelsAllocation's type.
+    private int mHeight;  // Y dimension of mInPixelsAllocation's type.
+
+    public void createTest(android.content.res.Resources res) {  // 'res' is not used here.
+        mScript = new ScriptC_mirror(mRS);
+
+        mWidth = mInPixelsAllocation.getType().getX();
+        mHeight = mInPixelsAllocation.getType().getY();
+
+        mScript.set_gIn(mInPixelsAllocation);  // Hand the input image and its size to the script globals.
+        mScript.set_gWidth(mWidth);
+        mScript.set_gHeight(mHeight);
+    }
+
+    public void runTest() {
+        mScript.forEach_mirror(mOutPixelsAllocation);  // Launch the mirror kernel over the output allocation.
+    }
+
+}
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java b/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java
index 86e1645..85038f7 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java
@@ -74,7 +74,6 @@
             mIntrinsic.forEach_bicubic(mOutPixelsAllocation);
         } else {
             mScript.forEach_bicubic(mOutPixelsAllocation);
-            //mScript.forEach_nearest(mOutPixelsAllocation);
         }
     }
 
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/mirror.rs b/java/tests/ImageProcessing/src/com/android/rs/image/mirror.rs
new file mode 100644
index 0000000..6a075d0
--- /dev/null
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/mirror.rs
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidth;   // Read by mirror() to compute the source column.
+int32_t gHeight;  // Not read anywhere in this file.
+rs_allocation gIn;  // Allocation that mirror() samples from.
+
+uchar4 RS_KERNEL mirror(uint32_t x, uint32_t y) {  // Returns the gIn element at (gWidth-x-1, y).
+    uint32_t x0 = gWidth-x-1;  // Column index counted back from gWidth-1.
+    uchar4 p = rsGetElementAt_uchar4(gIn, x0, y);
+    return p;
+}
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs b/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs
index ec283be..101d282 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs
@@ -86,7 +86,7 @@
     float4 p3  = cubicInterpolate(p30, p31, p32, p33, xf);
 
     float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return convert_uchar4(p);
 }
 
diff --git a/java/tests/ImageProcessing_jb/Android.mk b/java/tests/ImageProcessing_jb/Android.mk
index 65925b8..4893be9 100644
--- a/java/tests/ImageProcessing_jb/Android.mk
+++ b/java/tests/ImageProcessing_jb/Android.mk
@@ -17,6 +17,8 @@
 LOCAL_PATH := $(call my-dir)
 include $(CLEAR_VARS)
 
+LOCAL_JAVA_LIBRARIES := android.test.runner
+
 LOCAL_MODULE_TAGS := tests
 
 LOCAL_SRC_FILES := $(call all-java-files-under, src) \
diff --git a/java/tests/ImageProcessing_jb/AndroidManifest.xml b/java/tests/ImageProcessing_jb/AndroidManifest.xml
index 7d42883..5720ff7 100644
--- a/java/tests/ImageProcessing_jb/AndroidManifest.xml
+++ b/java/tests/ImageProcessing_jb/AndroidManifest.xml
@@ -3,9 +3,10 @@
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="com.android.rs.imagejb">
     <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
-    <uses-sdk android:minSdkVersion="11" />
-    <application android:label="IP-18"
-                 android:hardwareAccelerated="true">
+    <uses-sdk android:minSdkVersion="18" />
+    <application android:label="ImageProcessing"
+                 android:hardwareAccelerated="true"
+                 android:theme="@android:style/Theme.Holo.Light">
         <activity android:name="ImageProcessingActivityJB">
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
diff --git a/java/tests/ImageProcessing_jb/res/drawable-hdpi/ic_action_settings.png b/java/tests/ImageProcessing_jb/res/drawable-hdpi/ic_action_settings.png
new file mode 100644
index 0000000..54eecde
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/res/drawable-hdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing_jb/res/drawable-mdpi/ic_action_settings.png b/java/tests/ImageProcessing_jb/res/drawable-mdpi/ic_action_settings.png
new file mode 100644
index 0000000..25c36db
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/res/drawable-mdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing_jb/res/drawable-xhdpi/ic_action_settings.png b/java/tests/ImageProcessing_jb/res/drawable-xhdpi/ic_action_settings.png
new file mode 100644
index 0000000..425a8bc
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/res/drawable-xhdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing_jb/res/drawable-xxhdpi/ic_action_settings.png b/java/tests/ImageProcessing_jb/res/drawable-xxhdpi/ic_action_settings.png
new file mode 100644
index 0000000..fe5fec4
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/res/drawable-xxhdpi/ic_action_settings.png
Binary files differ
diff --git a/java/tests/ImageProcessing_jb/res/layout/controls.xml b/java/tests/ImageProcessing_jb/res/layout/controls.xml
index 0e89dd9..a77fd6b 100644
--- a/java/tests/ImageProcessing_jb/res/layout/controls.xml
+++ b/java/tests/ImageProcessing_jb/res/layout/controls.xml
@@ -20,47 +20,6 @@
             android:layout_height="fill_parent"
             android:id="@+id/toplevel">
 
-    <Spinner
-        android:id="@+id/image_size"
-        android:layout_width="fill_parent"
-        android:layout_height="wrap_content"/>
-
-    <LinearLayout
-        android:orientation="horizontal"
-        android:layout_width="fill_parent" android:layout_height="wrap_content">
-        <ToggleButton android:id="@+id/io_control"
-             android:layout_width="wrap_content"
-             android:layout_height="wrap_content"
-             android:textColorLink="@android:color/holo_blue_light"
-             android:textOff="@string/io_control_on"
-             android:textOn="@string/io_control_off"
-             android:textSize="12dp"/>
-        <ToggleButton
-             android:id="@+id/length_control"
-             android:layout_width="wrap_content"
-             android:layout_height="wrap_content"
-             android:textColorLink="@android:color/holo_blue_light"
-             android:textOff="@string/length_long"
-             android:textOn="@string/length_short"
-             android:textSize="12dp"/>
-        <ToggleButton
-             android:id="@+id/background_work"
-             android:layout_width="wrap_content"
-             android:layout_height="wrap_content"
-             android:textColorLink="@android:color/holo_blue_light"
-             android:textOff="@string/dvfs_on"
-             android:textOn="@string/dvfs_off"
-             android:textSize="12dp"/>
-        <ToggleButton
-             android:id="@+id/pause"
-             android:layout_width="wrap_content"
-             android:layout_height="wrap_content"
-             android:textColorLink="@android:color/holo_blue_light"
-             android:textOff="@string/pause_on"
-             android:textOn="@string/pause_off"
-             android:textSize="12dp"/>
-    </LinearLayout>
-
     <ListView
         android:id="@+id/test_list"
         android:layout_weight="0.2"
@@ -71,49 +30,25 @@
         android:orientation="horizontal"
         android:layout_width="fill_parent" android:layout_height="wrap_content">
         <Button
-         android:id="@+id/select_all"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/select_all"
-         android:textSize="12dp"
-         android:onClick="btnSelAll"/>
+             android:id="@+id/run"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/benchmark"
+             android:onClick="btnRun"/>
         <Button
-         android:id="@+id/select_none"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/select_none"
-         android:textSize="12dp"
-         android:onClick="btnSelNone"/>
+             android:id="@+id/select_all"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/select_all"
+             android:onClick="btnSelAll"/>
         <Button
-         android:id="@+id/select_hp"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/select_hp"
-         android:textSize="12dp"
-         android:onClick="btnSelHp"/>
-        <Button
-         android:id="@+id/select_lp"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/select_lp"
-         android:textSize="12dp"
-         android:onClick="btnSelLp"/>
-        <Button
-         android:id="@+id/select_intrinsics"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/select_intrinsics"
-         android:textSize="12dp"
-         android:onClick="btnSelIntrinsic"/>
+             android:id="@+id/select_none"
+             android:layout_width="wrap_content"
+             android:layout_height="wrap_content"
+             android:text="@string/select_none"
+             android:onClick="btnSelNone"/>
     </LinearLayout>
 
-    <Button
-         android:id="@+id/run"
-         android:layout_width="wrap_content"
-         android:layout_height="wrap_content"
-         android:text="@string/benchmark"
-         android:onClick="btnRun"/>
-
     <TextView
         android:id="@+id/results"
         android:layout_width="match_parent"
diff --git a/java/tests/ImageProcessing_jb/res/layout/spinner_layout.xml b/java/tests/ImageProcessing_jb/res/layout/spinner_layout.xml
index 8196bbf..7e9590e 100644
--- a/java/tests/ImageProcessing_jb/res/layout/spinner_layout.xml
+++ b/java/tests/ImageProcessing_jb/res/layout/spinner_layout.xml
@@ -18,6 +18,6 @@
 <TextView xmlns:android="http://schemas.android.com/apk/res/android"
     android:layout_width="fill_parent"
     android:layout_height="fill_parent"
-    android:padding="10dp"
-    android:textSize="16sp"
+    android:padding="2sp"
+    android:textSize="14sp"
 />
diff --git a/java/tests/ImageProcessing_jb/res/menu/main_activity_actions.xml b/java/tests/ImageProcessing_jb/res/menu/main_activity_actions.xml
new file mode 100644
index 0000000..df0159b
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/res/menu/main_activity_actions.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2014 The Android Open Source Project
+
+     Licensed under the Apache License, Version 2.0 (the "License");
+     you may not use this file except in compliance with the License.
+     You may obtain a copy of the License at
+
+          http://www.apache.org/licenses/LICENSE-2.0
+
+     Unless required by applicable law or agreed to in writing, software
+     distributed under the License is distributed on an "AS IS" BASIS,
+     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     See the License for the specific language governing permissions and
+     limitations under the License.
+-->
+
+<menu xmlns:android="http://schemas.android.com/apk/res/android" >
+    <item android:id="@+id/action_res"
+          android:title="@string/action_res"
+          android:icon="@drawable/ic_action_settings"
+          android:showAsAction="always"
+          android:actionViewClass="android.widget.Spinner" />
+
+
+    <item android:id="@+id/action_settings"
+          android:icon="@drawable/ic_action_settings"
+          android:title="@string/action_settings"
+          android:showAsAction="always"/>
+
+</menu>
diff --git a/java/tests/ImageProcessing_jb/res/values/strings.xml b/java/tests/ImageProcessing_jb/res/values/strings.xml
index fdaefcd..5ca8d7c 100644
--- a/java/tests/ImageProcessing_jb/res/values/strings.xml
+++ b/java/tests/ImageProcessing_jb/res/values/strings.xml
@@ -31,21 +31,26 @@
     <string name="benchmark">Benchmark</string>
 
     <string name="results">Results: not run</string>
-
-    <string name="io_control_on">USAGE_IO</string>
-    <string name="io_control_off">USAGE_IO</string>
     <string name="length_long">Long run</string>
     <string name="length_short">Long run</string>
-    <string name="dvfs_on">Background work</string>
-    <string name="dvfs_off">Background work</string>
-    <string name="run_all">Benchmark All</string>
-    <string name="run_one">Benchmark One</string>
     <string name="select_all">All</string>
     <string name="select_none">None</string>
-    <string name="select_hp">Full FP</string>
-    <string name="select_lp">Relaxed FP</string>
-    <string name="pause_on">Pause</string>
-    <string name="pause_off">Pause</string>
-    <string name="select_intrinsics">Intrinsics</string>
+
+    <string name="action_settings">Setting</string>
+    <string name="action_resolution">Resolution</string>
+
+    <string name="action_res">res</string>
+    <string name="ok">Ok</string>
+    <string name="cancel">Cancel</string>
+    <string name="settings">settings</string>
+    <string-array
+        name="settings_array">
+        <item>Use shared memory (TextureView) for output</item>
+        <item>Animate parameters during benchmark</item>
+        <item>Display output while testing</item>
+        <item>Simulate background CPU load</item>
+        <item>Run each test longer, 10 seconds</item>
+        <item>Pause 10 seconds between tests</item>
+    </string-array>
 
 </resources>
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blend.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blend.java
index 63c0c9c..1516ad4 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blend.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blend.java
@@ -53,7 +53,7 @@
                     currentIntrinsic = pos;
                     if (mRS != null) {
                         runTest();
-                        act.updateDisplay();
+                        act.mProcessor.update();
                     }
                 }
 
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25.java
index 6d71e9e..1192762 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25.java
@@ -99,20 +99,4 @@
             mScript.forEach_vert(mOutPixelsAllocation);
         }
     }
-
-    public void setupBenchmark() {
-        if (mUseIntrinsic) {
-            mIntrinsic.setRadius(MAX_RADIUS);
-        } else {
-            mScript.invoke_setRadius(MAX_RADIUS);
-        }
-    }
-
-    public void exitBenchmark() {
-        if (mUseIntrinsic) {
-            mIntrinsic.setRadius(mRadius);
-        } else {
-            mScript.invoke_setRadius((int)mRadius);
-        }
-    }
 }
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25G.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25G.java
index 6e0cf59..46c0250 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25G.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Blur25G.java
@@ -82,14 +82,6 @@
         mIntrinsic.forEach(mScratchPixelsAllocation2);
     }
 
-    public void setupBenchmark() {
-        mIntrinsic.setRadius(MAX_RADIUS);
-    }
-
-    public void exitBenchmark() {
-        mIntrinsic.setRadius(mRadius);
-    }
-
     public void updateBitmap(Bitmap b) {
         mScript.forEach_toU8_4(mScratchPixelsAllocation2, mOutPixelsAllocation);
         mOutPixelsAllocation.copyTo(b);
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ColorMatrix.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ColorMatrix.java
index e60fee6..86d748c 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ColorMatrix.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ColorMatrix.java
@@ -58,6 +58,22 @@
         }
     }
 
+    public void animateBars(float time) {
+        Matrix4f m = new Matrix4f();
+        m.set(1, 0, (time + 0.2f) % 1.0f);
+        m.set(1, 1, (time + 0.9f) % 1.0f);
+        m.set(1, 2, (time + 0.4f) % 1.0f);
+        if (mUseIntrinsic) {
+            if (mUseGrey) {
+                return;
+            } else {
+                mIntrinsic.setColorMatrix(m);
+            }
+        } else {
+            mScript.invoke_setMatrix(m);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mInPixelsAllocation, mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve3x3.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve3x3.java
index 7d9ad35..301d344 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve3x3.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve3x3.java
@@ -40,15 +40,26 @@
         mUseIntrinsic = useIntrinsic;
     }
 
+    private float blend(float v1, float v2, float p) {
+        return (v2 * p) + (v1 * (1.f-p));
+    }
+
+    private float[] updateMatrix(float str) {
+        float f[] = new float[9];
+        float cf1 = blend(1.f / 9.f, 0.f, str);
+        float cf2 = blend(1.f / 9.f, -1.f, str);
+        float cf3 = blend(1.f / 9.f, 5.f, str);
+        f[0] =  cf1;  f[1] = cf2;   f[2] = cf1;
+        f[3] =  cf2;  f[4] = cf3;   f[5] = cf2;
+        f[6] =  cf1;  f[7] = cf2;   f[8] = cf1;
+        return f;
+    }
+
     public void createTest(android.content.res.Resources res) {
         mWidth = mInPixelsAllocation.getType().getX();
         mHeight = mInPixelsAllocation.getType().getY();
 
-        float f[] = new float[9];
-        f[0] =  0.f;    f[1] = -1.f;    f[2] =  0.f;
-        f[3] = -1.f;    f[4] =  5.f;    f[5] = -1.f;
-        f[6] =  0.f;    f[7] = -1.f;    f[8] =  0.f;
-
+        float f[] = updateMatrix(1.f);
         if (mUseIntrinsic) {
             mIntrinsic = ScriptIntrinsicConvolve3x3.create(mRS, Element.U8_4(mRS));
             mIntrinsic.setCoefficients(f);
@@ -62,6 +73,15 @@
         }
     }
 
+    public void animateBars(float time) {
+        float f[] = updateMatrix(time % 1.f);
+        if (mUseIntrinsic) {
+            mIntrinsic.setCoefficients(f);
+        } else {
+            mScript.set_gCoeffs(f);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve5x5.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve5x5.java
index 6b0ef8c..6a627f5 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve5x5.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Convolve5x5.java
@@ -40,11 +40,33 @@
         mUseIntrinsic = useIntrinsic;
     }
 
+    private float blend(float v1, float v2, float p) {
+        return (v2 * p) + (v1 * (1.f-p));
+    }
+
+    private float[] updateMatrix(float str) {
+        float f[] = new float[25];
+        final float f125 = 1.f / 25.f;
+        float cf1 = blend(f125, -1.f, str);
+        float cf2 = blend(f125, -3.f, str);
+        float cf3 = blend(f125, -4.f, str);
+        float cf4 = blend(f125, 6.f, str);
+        float cf5 = blend(f125, 20.f, str);
+        float cf6 = blend(f125, 0.f, str);
+        f[0] = cf1;  f[1] = cf2; f[2] = cf3; f[3] = cf2; f[4] = cf1;
+        f[5] = cf2;  f[6] = cf6; f[7] = cf4; f[8] = cf6; f[9] = cf2;
+        f[10]= cf3;  f[11]= cf4; f[12]= cf5; f[13]= cf4; f[14]= cf3;
+        f[15]= cf2;  f[16]= cf6; f[17]= cf4; f[18]= cf6; f[19]= cf2;
+        f[20]= cf1;  f[21]= cf2; f[22]= cf3; f[23]= cf2; f[24]= cf1;
+        return f;
+    }
+
+
     public void createTest(android.content.res.Resources res) {
         mWidth = mInPixelsAllocation.getType().getX();
         mHeight = mInPixelsAllocation.getType().getY();
 
-        float f[] = new float[25];
+        float f[] = updateMatrix(1.f);
         //f[0] = 0.012f; f[1] = 0.025f; f[2] = 0.031f; f[3] = 0.025f; f[4] = 0.012f;
         //f[5] = 0.025f; f[6] = 0.057f; f[7] = 0.075f; f[8] = 0.057f; f[9] = 0.025f;
         //f[10]= 0.031f; f[11]= 0.075f; f[12]= 0.095f; f[13]= 0.075f; f[14]= 0.031f;
@@ -57,12 +79,6 @@
         //f[15]= 4.f; f[16]= 8.f; f[17]= 0.f; f[18]= -8.f; f[19]= -4.f;
         //f[20]= 1.f; f[21]= 2.f; f[22]= 0.f; f[23]= -2.f; f[24]= -1.f;
 
-        f[0] = -1.f; f[1] = -3.f; f[2] = -4.f; f[3] = -3.f; f[4] = -1.f;
-        f[5] = -3.f; f[6] =  0.f; f[7] =  6.f; f[8] =  0.f; f[9] = -3.f;
-        f[10]= -4.f; f[11]=  6.f; f[12]= 20.f; f[13]=  6.f; f[14]= -4.f;
-        f[15]= -3.f; f[16]=  0.f; f[17]=  6.f; f[18]=  0.f; f[19]= -3.f;
-        f[20]= -1.f; f[21]= -3.f; f[22]= -4.f; f[23]= -3.f; f[24]= -1.f;
-
         if (mUseIntrinsic) {
             mIntrinsic = ScriptIntrinsicConvolve5x5.create(mRS, Element.U8_4(mRS));
             mIntrinsic.setCoefficients(f);
@@ -76,6 +92,15 @@
         }
     }
 
+    public void animateBars(float time) {
+        float f[] = updateMatrix(time % 1.f);
+        if (mUseIntrinsic) {
+            mIntrinsic.setCoefficients(f);
+        } else {
+            mScript.set_gCoeffs(f);
+        }
+    }
+
     public void runTest() {
         if (mUseIntrinsic) {
             mIntrinsic.forEach(mOutPixelsAllocation);
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Fisheye.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Fisheye.java
index 012c60d..4ded8c5 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Fisheye.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Fisheye.java
@@ -71,6 +71,11 @@
         do_init();
     }
 
+    public void animateBars(float time) {
+        scale = time % 2.f;
+        do_init();
+    }
+
     private void do_init() {
         if (approx) {
             if (relaxed)
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Grain.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Grain.java
index 31e5f79..02d72f9 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Grain.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Grain.java
@@ -45,6 +45,11 @@
         mScript.set_gNoiseStrength(s);
     }
 
+    public void animateBars(float time) {
+        mScript.set_gNoiseStrength(time % 1.f);
+    }
+
+
     private int findHighBit(int v) {
         int bit = 0;
         while (v > 1) {
@@ -89,6 +94,5 @@
         mScript.forEach_blend9(mNoise2);
         mScript.forEach_root(mInPixelsAllocation, mOutPixelsAllocation);
     }
-
 }
 
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/GroupTest.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/GroupTest.java
index 3e5175a..ff1e1ff 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/GroupTest.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/GroupTest.java
@@ -79,6 +79,14 @@
         }
     }
 
+    public void animateBars(float time) {
+        Matrix4f m = new Matrix4f();
+        m.set(1, 0, (time + 0.2f) % 1.0f);
+        m.set(1, 1, (time + 0.9f) % 1.0f);
+        m.set(1, 2, (time + 0.4f) % 1.0f);
+        mMatrix.setColorMatrix(m);
+    }
+
     public void runTest() {
         mConvolve.setInput(mInPixelsAllocation);
         if (mUseNative) {
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPControlsJB.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPControlsJB.java
index 911736f..a38bc84 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPControlsJB.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPControlsJB.java
@@ -16,11 +16,13 @@
 
 package com.android.rs.imagejb;
 
+import android.view.Menu;
+import android.view.MenuItem;
+import android.view.MenuInflater;
+
 import android.app.Activity;
 import android.os.Bundle;
 import android.os.Handler;
-import android.os.Message;
-import android.graphics.Canvas;
 import android.graphics.Point;
 import android.view.SurfaceView;
 import android.widget.AdapterView;
@@ -48,7 +50,6 @@
     private final String TAG = "Img";
     public final String RESULT_FILE = "image_processing_result.csv";
 
-    private ToggleButton mIOButton;
     private Spinner mResSpinner;
     private ListView mTestListView;
     private TextView mResultView;
@@ -56,10 +57,13 @@
     private ArrayAdapter<String> mTestListAdapter;
     private ArrayList<String> mTestList = new ArrayList<String>();
 
-    private boolean mToggleIO = false;
-    private boolean mToggleDVFS = false;
-    private boolean mToggleLong = false;
-    private boolean mTogglePause = false;
+    private boolean mSettings[] = {true, true, true, false, false, false};
+    private static final int SETTING_USE_IO = 0;
+    private static final int SETTING_ANIMATE = 1;
+    private static final int SETTING_DISPLAY = 2;
+    private static final int SETTING_USE_DVFS = 3;
+    private static final int SETTING_LONG_RUN = 4;
+    private static final int SETTING_PAUSE = 5;
 
     private float mResults[];
 
@@ -86,6 +90,34 @@
     }
     private Resolutions mRes;
 
+    @Override
+    public boolean onCreateOptionsMenu(Menu menu) {
+        // Inflate the menu items for use in the action bar
+        MenuInflater inflater = getMenuInflater();
+        inflater.inflate(R.menu.main_activity_actions, menu);
+
+        MenuItem searchItem = menu.findItem(R.id.action_res);
+        mResSpinner = (Spinner) searchItem.getActionView();
+
+        mResSpinner.setOnItemSelectedListener(mResSpinnerListener);
+        mResSpinner.setAdapter(new ArrayAdapter<Resolutions>(
+            this, R.layout.spinner_layout, Resolutions.values()));
+
+        // Choose the first image size that is close to the screen
+        // resolution (width at most 1.2x the display's longer side).
+        Point size = new Point();
+        getWindowManager().getDefaultDisplay().getSize(size);
+        int md = (size.x > size.y) ? size.x : size.y;
+        for (int ct=0; ct < Resolutions.values().length; ct++) {
+            if (Resolutions.values()[ct].width <= (int)(md * 1.2)) {
+                mResSpinner.setSelection(ct);
+                break;
+            }
+        }
+
+        return super.onCreateOptionsMenu(menu);
+    }
+
 
     private AdapterView.OnItemSelectedListener mResSpinnerListener =
             new AdapterView.OnItemSelectedListener() {
@@ -97,13 +129,19 @@
                 }
             };
 
-    void init() {
-        mIOButton = (ToggleButton) findViewById(R.id.io_control);
+    void launchDemo(int id) {
+        IPTestListJB.TestName t[] = IPTestListJB.TestName.values();
 
-        mResSpinner = (Spinner) findViewById(R.id.image_size);
-        mResSpinner.setOnItemSelectedListener(mResSpinnerListener);
-        mResSpinner.setAdapter(new ArrayAdapter<Resolutions>(
-            this, R.layout.spinner_layout, Resolutions.values()));
+        int testList[] = new int[1];
+        testList[0] = id;
+
+        Intent intent = makeBasicLaunchIntent();
+        intent.putExtra("tests", testList);
+        intent.putExtra("demo", true);
+        startActivityForResult(intent, 0);
+    }
+
+    void init() {
 
         for (int i=0; i < IPTestListJB.TestName.values().length; i++) {
             mTestList.add(IPTestListJB.TestName.values()[i].toString());
@@ -118,51 +156,15 @@
         mTestListView.setChoiceMode(ListView.CHOICE_MODE_MULTIPLE);
         mTestListAdapter.notifyDataSetChanged();
 
-        ToggleButton toggle;
-        toggle = (ToggleButton) findViewById(R.id.io_control);
-        toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
-            public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-                mToggleIO = isChecked;
-            }
-        });
-        toggle.setChecked(mToggleIO);
-
-        toggle = (ToggleButton) findViewById(R.id.length_control);
-        toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
-            public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-                mToggleLong = isChecked;
-            }
-        });
-        toggle.setChecked(mToggleLong);
-
-        toggle = (ToggleButton) findViewById(R.id.background_work);
-        toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
-            public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-                mToggleDVFS = isChecked;
-            }
-        });
-        toggle.setChecked(mToggleDVFS);
-
-        toggle = (ToggleButton) findViewById(R.id.pause);
-        toggle.setOnCheckedChangeListener(new CompoundButton.OnCheckedChangeListener() {
-            public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-                mTogglePause = isChecked;
-            }
-        });
-        toggle.setChecked(mTogglePause);
-
         mResultView = (TextView) findViewById(R.id.results);
 
-
-        Point size = new Point();
-        getWindowManager().getDefaultDisplay().getSize(size);
-        int md = (size.x > size.y) ? size.x : size.y;
-        for (int ct=0; ct < Resolutions.values().length; ct++) {
-            if (Resolutions.values()[ct].width <= (int)(md * 1.2)) {
-                mResSpinner.setSelection(ct);
-                break;
-            }
-        }
+        mTestListView.setOnItemLongClickListener(new ListView.OnItemLongClickListener() {
+                public boolean onItemLongClick(AdapterView<?> arg0, View arg1,
+                        int pos, long id) {
+                    launchDemo(pos);
+                    return true;
+                }
+            });
     }
 
     @Override
@@ -194,6 +196,19 @@
         }
     }
 
+    Intent makeBasicLaunchIntent() {
+        Intent intent = new Intent(this, ImageProcessingActivityJB.class);
+        intent.putExtra("enable io", mSettings[SETTING_USE_IO]);
+        intent.putExtra("enable dvfs", mSettings[SETTING_USE_DVFS]);
+        intent.putExtra("enable long", mSettings[SETTING_LONG_RUN]);
+        intent.putExtra("enable pause", mSettings[SETTING_PAUSE]);
+        intent.putExtra("enable animate", mSettings[SETTING_ANIMATE]);
+        intent.putExtra("enable display", mSettings[SETTING_DISPLAY]);
+        intent.putExtra("resolution X", mRes.width);
+        intent.putExtra("resolution Y", mRes.height);
+        return intent;
+    }
+
     public void btnRun(View v) {
         IPTestListJB.TestName t[] = IPTestListJB.TestName.values();
 
@@ -215,14 +230,8 @@
             }
         }
 
-        Intent intent = new Intent(this, ImageProcessingActivityJB.class);
+        Intent intent = makeBasicLaunchIntent();
         intent.putExtra("tests", testList);
-        intent.putExtra("enable io", mToggleIO);
-        intent.putExtra("enable dvfs", mToggleDVFS);
-        intent.putExtra("enable long", mToggleLong);
-        intent.putExtra("enable pause", mTogglePause);
-        intent.putExtra("resolution X", mRes.width);
-        intent.putExtra("resolution Y", mRes.height);
         startActivityForResult(intent, 0);
     }
 
@@ -307,6 +316,18 @@
         }
     }
 
+    public boolean onOptionsItemSelected(MenuItem item) {
+        // Handle presses on the action bar items
+        switch(item.getItemId()) {
+            case R.id.action_settings:
+                IPSettings newFragment = new IPSettings(mSettings);
+                newFragment.show(getFragmentManager(), "settings");
+                return true;
+            default:
+                return super.onOptionsItemSelected(item);
+        }
+    }
+
     public void btnSelNone(View v) {
         checkGroup(-1);
     }
@@ -319,6 +340,11 @@
         checkGroup(1);
     }
 
+    public void btnSettings(View v) {
+        IPSettings newFragment = new IPSettings(mSettings);
+        newFragment.show(getFragmentManager(), "settings");
+    }
+
     public void btnSelIntrinsic(View v) {
         checkGroup(2);
     }
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPSettings.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPSettings.java
new file mode 100644
index 0000000..d9b9c31
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPSettings.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.imagejb;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.DialogFragment;
+import android.app.Dialog;
+import android.content.DialogInterface;
+import android.os.Bundle;
+import android.view.View;
+
+public class IPSettings extends DialogFragment {
+    private boolean[] mEnables;
+    public boolean mOk = false;
+
+    public IPSettings(boolean[] enables) {
+        mEnables = enables;
+    }
+
+    @Override
+    public Dialog onCreateDialog(Bundle savedInstanceState) {
+        AlertDialog.Builder builder = new AlertDialog.Builder(getActivity());
+        builder.setTitle(R.string.settings);
+
+        // Specify the list array, the items to be selected by default (null for none),
+        // and the listener through which to receive callbacks when items are selected
+        builder.setMultiChoiceItems(R.array.settings_array, mEnables,
+                          new DialogInterface.OnMultiChoiceClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int which, boolean isChecked) {
+                       mEnables[which] = isChecked;
+                   }
+               });
+
+        // Set the action buttons
+        builder.setPositiveButton(R.string.ok, new DialogInterface.OnClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int id) {
+                       mOk = true;
+                   }
+               });
+        builder.setNegativeButton(R.string.cancel, new DialogInterface.OnClickListener() {
+                   @Override
+                   public void onClick(DialogInterface dialog, int id) {
+                   }
+               });
+
+        return builder.create();
+    }
+}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
index 398f9c1..b03171f 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
@@ -32,48 +32,50 @@
      * Define enum type for test names
      */
     public enum TestName {
-        LEVELS_VEC3_RELAXED ("Levels Vec3 Relaxed", RELAXED_FP, 55.6f),
-        LEVELS_VEC4_RELAXED ("Levels Vec4 Relaxed", RELAXED_FP, 39.1f),
-        LEVELS_VEC3_FULL ("Levels Vec3 Full", FULL_FP, 57.4f),
-        LEVELS_VEC4_FULL ("Levels Vec4 Full", FULL_FP, 68.1f),
-        BLUR_RADIUS_25 ("Blur radius 25", RELAXED_FP, 1045.f),
-        INTRINSIC_BLUR_RADIUS_25 ("Intrinsic Blur radius 25", INTRINSIC, 643.f),
-        GREYSCALE ("Greyscale", RELAXED_FP, 38.3f),
-        GRAIN ("Grain", RELAXED_FP, 57.8f),
-        FISHEYE_FULL ("Fisheye Full", FULL_FP, 211.2f),
-        FISHEYE_RELAXED ("Fisheye Relaxed", RELAXED_FP, 198.1f),
-        FISHEYE_APPROXIMATE_FULL ("Fisheye Approximate Full", FULL_FP, 211.0f),
-        FISHEYE_APPROXIMATE_RELAXED ("Fisheye Approximate Relaxed", RELAXED_FP, 190.1f),
-        VIGNETTE_FULL ("Vignette Full", FULL_FP, 98.6f),
-        VIGNETTE_RELAXED ("Vignette Relaxed", RELAXED_FP, 110.7f),
-        VIGNETTE_APPROXIMATE_FULL ("Vignette Approximate Full", FULL_FP, 80.6f),
-        VIGNETTE_APPROXIMATE_RELAXED ("Vignette Approximate Relaxed", RELAXED_FP, 87.9f),
-        GROUP_TEST_EMULATED ("Group Test (emulated)", INTRINSIC, 37.81f),
-        GROUP_TEST_NATIVE ("Group Test (native)", INTRINSIC, 37.8f),
-        CONVOLVE_3X3 ("Convolve 3x3", RELAXED_FP, 62.1f),
-        INTRINSICS_CONVOLVE_3X3 ("Intrinsics Convolve 3x3", INTRINSIC, 24.5f),
-        COLOR_MATRIX ("ColorMatrix", RELAXED_FP, 25.5f),
-        INTRINSICS_COLOR_MATRIX ("Intrinsics ColorMatrix", INTRINSIC, 13.3f),
-        INTRINSICS_COLOR_MATRIX_GREY ("Intrinsics ColorMatrix Grey", INTRINSIC, 13.4f),
-        COPY ("Copy", RELAXED_FP, 25.6f),
-        CROSS_PROCESS_USING_LUT ("CrossProcess (using LUT)", INTRINSIC, 18.6f),
-        CONVOLVE_5X5 ("Convolve 5x5", RELAXED_FP, 215.8f),
-        INTRINSICS_CONVOLVE_5X5 ("Intrinsics Convolve 5x5", INTRINSIC, 29.8f),
-        MANDELBROT_FLOAT ("Mandelbrot (fp32)", FULL_FP, 108.1f),
-        MANDELBROT_DOUBLE ("Mandelbrot (fp64)", FULL_FP, 108.1f),
-        INTRINSICS_BLEND ("Intrinsics Blend", INTRINSIC, 94.2f),
-        INTRINSICS_BLUR_25G ("Intrinsics Blur 25 uchar", INTRINSIC, 173.3f),
-        VIBRANCE ("Vibrance", RELAXED_FP, 88.3f),
-        BW_FILTER ("BW Filter", RELAXED_FP, 69.7f),
-        SHADOWS ("Shadows", RELAXED_FP, 155.3f),
-        CONTRAST ("Contrast", RELAXED_FP, 27.0f),
-        EXPOSURE ("Exposure", RELAXED_FP, 64.7f),
-        WHITE_BALANCE ("White Balance", RELAXED_FP, 160.1f),
-        COLOR_CUBE ("Color Cube", RELAXED_FP, 85.3f),
-        COLOR_CUBE_3D_INTRINSIC ("Color Cube (3D LUT intrinsic)", INTRINSIC, 49.5f),
-        ARTISTIC1 ("Artistic 1", RELAXED_FP, 120.f),
-        RESIZE_BI_SCRIPT ("Resize BiCubic Script", RELAXED_FP, 100.f),
-        RESIZE_BI_INTRINSIC ("Resize BiCubic Intrinsic", INTRINSIC, 100.f);
+        LEVELS_VEC3_RELAXED ("Levels Vec3 Relaxed", RELAXED_FP, 61.1f),
+        LEVELS_VEC4_RELAXED ("Levels Vec4 Relaxed", RELAXED_FP, 44.6f),
+        LEVELS_VEC3_FULL ("Levels Vec3 Full", FULL_FP, 61.9f),
+        LEVELS_VEC4_FULL ("Levels Vec4 Full", FULL_FP, 73.f),
+        BLUR_RADIUS_25 ("Blur radius 25", RELAXED_FP, 1103.f),
+        INTRINSIC_BLUR_RADIUS_25 ("Intrinsic Blur radius 25", INTRINSIC, 176.f),
+        GREYSCALE ("Greyscale", RELAXED_FP, 43.7f),
+        GRAIN ("Grain", RELAXED_FP, 147.4f),
+        FISHEYE_FULL ("Fisheye Full", FULL_FP, 192.f),
+        FISHEYE_RELAXED ("Fisheye Relaxed", RELAXED_FP, 181.f),
+        FISHEYE_APPROXIMATE_FULL ("Fisheye Approximate Full", FULL_FP, 193.f),
+        FISHEYE_APPROXIMATE_RELAXED ("Fisheye Approximate Relaxed", RELAXED_FP, 183.f),
+        VIGNETTE_FULL ("Vignette Full", FULL_FP, 101.f),
+        VIGNETTE_RELAXED ("Vignette Relaxed", RELAXED_FP, 116.f),
+        VIGNETTE_APPROXIMATE_FULL ("Vignette Approximate Full", FULL_FP, 85.1f),
+        VIGNETTE_APPROXIMATE_RELAXED ("Vignette Approximate Relaxed", RELAXED_FP, 96.7f),
+        GROUP_TEST_EMULATED ("Group Test (emulated)", INTRINSIC, 51.7f),
+        GROUP_TEST_NATIVE ("Group Test (native)", INTRINSIC, 52.9f),
+        CONVOLVE_3X3 ("Convolve 3x3", RELAXED_FP, 74.2f),
+        INTRINSICS_CONVOLVE_3X3 ("Intrinsics Convolve 3x3", INTRINSIC, 33.3f),
+        COLOR_MATRIX ("ColorMatrix", RELAXED_FP, 33.8f),
+        INTRINSICS_COLOR_MATRIX ("Intrinsics ColorMatrix", INTRINSIC, 21.3f),
+        INTRINSICS_COLOR_MATRIX_GREY ("Intrinsics ColorMatrix Grey", INTRINSIC, 21.4f),
+        COPY ("Copy", RELAXED_FP, 21.4f),
+        CROSS_PROCESS_USING_LUT ("CrossProcess (using LUT)", INTRINSIC, 23.1f),
+        CONVOLVE_5X5 ("Convolve 5x5", RELAXED_FP, 236.f),
+        INTRINSICS_CONVOLVE_5X5 ("Intrinsics Convolve 5x5", INTRINSIC, 39.6f),
+        MANDELBROT_FLOAT ("Mandelbrot (fp32)", FULL_FP, 117.f),
+        MANDELBROT_DOUBLE ("Mandelbrot (fp64)", FULL_FP, 136.f),
+        INTRINSICS_BLEND ("Intrinsics Blend", INTRINSIC, 105.f),
+        INTRINSICS_BLUR_25G ("Intrinsics Blur 25 uchar", INTRINSIC, 37.8f),
+        VIBRANCE ("Vibrance", RELAXED_FP, 103.f),
+        BW_FILTER ("BW Filter", RELAXED_FP, 86.f),
+        SHADOWS ("Shadows", RELAXED_FP, 130.f),
+        CONTRAST ("Contrast", RELAXED_FP, 45.4f),
+        EXPOSURE ("Exposure", RELAXED_FP, 73.4f),
+        WHITE_BALANCE ("White Balance", RELAXED_FP, 138.2f),
+        COLOR_CUBE ("Color Cube", RELAXED_FP, 83.9f),
+        COLOR_CUBE_3D_INTRINSIC ("Color Cube (3D LUT intrinsic)", INTRINSIC, 34.7f),
+        ARTISTIC1 ("Artistic 1", RELAXED_FP, 140.f),
+        RESIZE_BI_SCRIPT ("Resize BiCubic Script", RELAXED_FP, 253.f),
+        RESIZE_BI_INTRINSIC ("Resize BiCubic Intrinsic", INTRINSIC, 255.f),
+        POSTERIZE_INVOKE ("Posterize with invoke", RELAXED_FP, 215.f),
+        POSTERIZE_SET ("Posterize with set", INTRINSIC, 221.f);
 
 
         private final String name;
@@ -183,6 +185,10 @@
             return new Resize(false);
         case RESIZE_BI_INTRINSIC:
             return new Resize(true);
+        case POSTERIZE_INVOKE:
+            return new Posterize(true);
+        case POSTERIZE_SET:
+            return new Posterize(false);
         }
         return null;
     }
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingActivityJB.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingActivityJB.java
index 6d8ecb0..e49e9cc 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingActivityJB.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingActivityJB.java
@@ -17,6 +17,7 @@
 package com.android.rs.imagejb;
 
 import android.app.Activity;
+
 import android.content.Intent;
 import android.os.Bundle;
 import android.os.Handler;
@@ -52,6 +53,10 @@
     private SeekBar mBar3;
     private SeekBar mBar4;
     private SeekBar mBar5;
+
+    private int mBars[] = new int[5];
+    private int mBarsOld[] = new int[5];
+
     private TextView mText1;
     private TextView mText2;
     private TextView mText3;
@@ -66,8 +71,27 @@
     private boolean mToggleDVFS;
     private boolean mToggleLong;
     private boolean mTogglePause;
+    private boolean mToggleAnimate;
+    private boolean mToggleDisplay;
     private int mBitmapWidth;
     private int mBitmapHeight;
+    private boolean mDemoMode;
+
+    // mUpdatesPending counts UI-driven update requests that have not yet been
+    // handed to RS for processing.
+    //
+    // Processor.update() increments it (capped at 2) on each UI input or
+    // surface change and wakes the worker thread.
+    //
+    // The worker zeroes it when it launches a kernel with the latest settings;
+    // the benchmark helpers reset it to zero rather than counting kernels with it.
+    private int mUpdatesPending;
+
+    // mShowsPending counts submissions still in the RS pipeline.  It is incremented
+    // before work is submitted (demo and benchmark) and decremented by
+    // MessageProcessor when the message sent after that work is received.
+    private int mShowsPending;
+
 
     static public class SizedTV extends TextureView {
         int mWidth;
@@ -98,6 +122,28 @@
 
     /////////////////////////////////////////////////////////////////////////
 
+    // RS message handler: each message received marks one submitted batch of work as finished.
+    private class MessageProcessor extends RenderScript.RSMessageHandler {
+        MessageProcessor() {
+        }
+
+        public void run() {
+            synchronized(mProcessor) {
+                // Retire one pending show and wake the Processor thread, which waits
+                // on this same monitor when the pipeline is full (demo mode waits for
+                // zero pending, the benchmark loop throttles at two).
+                if (mShowsPending > 0) {
+                    mShowsPending --;
+                    mProcessor.notifyAll();
+                }
+            }
+        }
+    }
+
+
+    /////////////////////////////////////////////////////////////////////////
+    // Processor is a helper thread for running the work without
+    // blocking the UI thread.
     class Processor extends Thread {
         RenderScript mRS;
         Allocation mInPixelsAllocation;
@@ -108,17 +154,45 @@
         private Surface mOutSurface;
         private float mLastResult;
         private boolean mRun = true;
-        private int mOp = 0;
         private boolean mDoingBenchmark;
         private TestBase mTest;
         private TextureView mDisplayView;
 
         private boolean mBenchmarkMode;
 
+        // Forward changed seek bar values to the current test, then run it.
+        //
+        // The onBarNChanged() callbacks are only invoked for bars whose value
+        // differs from the one last forwarded, because calling them excessively
+        // can cause extra work for drivers.
+        void runTest() {
+            for (int bar = 0; bar < mBars.length; bar++) {
+                if (mBars[bar] == mBarsOld[bar]) {
+                    // Unchanged since it was last forwarded.
+                    continue;
+                }
+
+                switch (bar) {
+                case 0: mTest.onBar1Changed(mBars[bar]); break;
+                case 1: mTest.onBar2Changed(mBars[bar]); break;
+                case 2: mTest.onBar3Changed(mBars[bar]); break;
+                case 3: mTest.onBar4Changed(mBars[bar]); break;
+                case 4: mTest.onBar5Changed(mBars[bar]); break;
+                }
+
+                // Record the value so the next run skips it unless it changes again.
+                mBarsOld[bar] = mBars[bar];
+            }
+
+            mTest.runTest();
+        }
+
         Processor(RenderScript rs, TextureView v, boolean benchmarkMode) {
             mRS = rs;
             mDisplayView = v;
 
+            mRS.setMessageHandler(new MessageProcessor());
+
             switch(mBitmapWidth) {
             case 3840:
                 mInPixelsAllocation = Allocation.createFromBitmapResource(
@@ -146,6 +220,8 @@
                 break;
             }
 
+            // We create the output allocation using USAGE_IO_OUTPUT so we can share the
+            // bits with a TextureView.  This is more efficient than using a bitmap.
             mOutDisplayAllocation = Allocation.createTyped(mRS, mInPixelsAllocation.getType(),
                                                                Allocation.MipmapControl.MIPMAP_NONE,
                                                                Allocation.USAGE_SCRIPT |
@@ -163,82 +239,122 @@
             start();
         }
 
+        class Result {
+            float totalTime;
+            int itterations;
+        }
+
+        // Run kernels back to back until at least minTime seconds have elapsed.
+        // Returns a Result holding the total elapsed time in seconds (including the
+        // final drain of the RS pipeline) and the number of kernel launches.
+        private Result runBenchmarkLoop(float minTime) {
+            mUpdatesPending = 0;
+            Result r = new Result();
+
+            long t = java.lang.System.currentTimeMillis();
+            do {
+                synchronized(this) {
+                    // mShowsPending tracks the kernels in the RS pipeline; throttle it to 2.
+                    // That lets a kernel start before we are notified the previous one finished,
+                    // without the 'lag' deeper queues add.  wait() may wake spuriously, so
+                    // re-check the count; mRun is included so a stop request is not held up here.
+                    mShowsPending++;
+                    while (mRun && (mShowsPending > 2)) {
+                        try {
+                            this.wait();
+                        } catch(InterruptedException e) {
+                        }
+                    }
+                }
+
+                // If animations are enabled update the test state.
+                if (mToggleAnimate) {
+                    mTest.animateBars(r.totalTime);
+                }
+
+                // Run the kernel
+                mTest.runTest();
+                r.itterations ++;
+
+                if (mToggleDisplay) {
+                    // If we are not outputting directly to the TextureView we need to copy from
+                    // our temporary buffer.
+                    if (mOutDisplayAllocation != mOutPixelsAllocation) {
+                        mOutDisplayAllocation.copyFrom(mOutPixelsAllocation);
+                    }
+
+                    // queue the update of the TextureView with the allocation contents
+                    mOutDisplayAllocation.ioSend();
+                }
+
+                // Send our RS message handler a message so we know when this work has completed
+                mRS.sendMessage(0, null);
+
+                long t2 = java.lang.System.currentTimeMillis();
+                r.totalTime += (t2 - t) / 1000.f;
+                t = t2;
+            } while (r.totalTime < minTime);
+
+            // Wait for any stray operations to complete and update the final time
+            mRS.finish();
+            long t2 = java.lang.System.currentTimeMillis();
+            r.totalTime += (t2 - t) / 1000.f;
+            return r;
+        }
+
+
+        // Get a benchmark result for a specific test
         private float getBenchmark() {
             mDoingBenchmark = true;
+            mUpdatesPending = 0;
 
-            mTest.setupBenchmark();
             long result = 0;
-            long runtime = 1000;
+            float runtime = 1.f;
             if (mToggleLong) {
-                runtime = 10000;
+                runtime = 10.f;
             }
 
             if (mToggleDVFS) {
                 mDvfsWar.go();
             }
 
-            //Log.v("rs", "Warming");
-            long t = java.lang.System.currentTimeMillis() + 250;
-            do {
-                mTest.runTest();
-                mTest.finish();
-            } while (t > java.lang.System.currentTimeMillis());
-            //mHandler.sendMessage(Message.obtain());
+            // We run a short bit of work before starting the actual test
+            // this is to let any power management do its job and respond
+            runBenchmarkLoop(0.3f);
 
-            //Log.v("rs", "Benchmarking");
-            int ct = 0;
-            t = java.lang.System.currentTimeMillis();
-            do {
-                mTest.runTest();
-                mTest.finish();
-                ct++;
-            } while ((t + runtime) > java.lang.System.currentTimeMillis());
-            t = java.lang.System.currentTimeMillis() - t;
-            float ft = (float)t;
-            ft /= ct;
+            // Run the actual benchmark
+            Result r = runBenchmarkLoop(runtime);
 
-            mTest.exitBenchmark();
+            Log.v("rs", "Test: time=" + r.totalTime +"s,  frames=" + r.itterations +
+                  ", avg=" + r.totalTime / r.itterations * 1000.f);
+
             mDoingBenchmark = false;
-
-            android.util.Log.v("rs", "bench " + ft);
-            return ft;
+            return r.totalTime / r.itterations * 1000.f;
         }
 
-        private Handler mHandler = new Handler() {
-            // Allow the filter to complete without blocking the UI
-            // thread.  When the message arrives that the op is complete
-            // we will either mark completion or start a new filter if
-            // more work is ready.  Either way, display the result.
-            @Override
-            public void handleMessage(Message msg) {
-                synchronized(this) {
-                    if (mRS == null || mOutPixelsAllocation == null) {
-                        return;
-                    }
-                    if (mOutDisplayAllocation != mOutPixelsAllocation) {
-                        mOutDisplayAllocation.copyFrom(mOutPixelsAllocation);
-                    }
-                    mOutDisplayAllocation.ioSend();
-                    mDisplayView.invalidate();
-                    //mTest.runTestSendMessage();
-                }
-            }
-        };
-
         public void run() {
             Surface lastSurface = null;
             while (mRun) {
+                // Our loop for launching tests or benchmarks
                 synchronized(this) {
-                    try {
-                        this.wait();
-                    } catch(InterruptedException e) {
+                    // If we have no work to do, or we have displays pending, wait
+                    if ((mUpdatesPending == 0) || (mShowsPending != 0)) {
+                        try {
+                            this.wait();
+                        } catch(InterruptedException e) {
+                        }
                     }
+
+                    // We may have been asked to exit while waiting
                     if (!mRun) return;
 
+                    // During startup we may not have a surface yet to display, if
+                    // this is the case, wait.
                     if ((mOutSurface == null) || (mOutPixelsAllocation == null)) {
                         continue;
                     }
 
+                    // Our display surface changed, set it.
                     if (lastSurface != mOutSurface) {
                         mOutDisplayAllocation.setSurface(mOutSurface);
                         lastSurface = mOutSurface;
@@ -246,19 +362,27 @@
                 }
 
                 if (mBenchmarkMode) {
+                    // Loop over the tests we want to benchmark
                     for (int ct=0; (ct < mTestList.length) && mRun; ct++) {
-                        mRS.finish();
 
+                        // For reproducibility we wait a short time for any sporadic work
+                        // created by the user touching the screen to launch the test to pass.
+                        // Also allows for things to settle after the test changes.
+                        mRS.finish();
                         try {
                             sleep(250);
                         } catch(InterruptedException e) {
                         }
 
+                        // If we just ran a test, we destroy it here to relieve some memory pressure
                         if (mTest != null) {
                             mTest.destroy();
                         }
 
-                        mTest = changeTest(mTestList[ct]);
+                        // Select the next test
+                        mTest = changeTest(mTestList[ct], false);
+
+                        // If the user selected the "long pause" option, wait
                         if (mTogglePause) {
                             for (int i=0; (i < 100) && mRun; i++) {
                                 try {
@@ -268,30 +392,57 @@
                             }
                         }
 
+                        // Run the test
                         mTestResults[ct] = getBenchmark();
-                        mHandler.sendMessage(Message.obtain());
                     }
                     onBenchmarkFinish(mRun);
+                } else {
+                    boolean update = false;
+                    synchronized(this) {
+                        // If we have updates to process and are not blocked by pending shows,
+                        // start the next kernel
+                        if ((mUpdatesPending > 0) && (mShowsPending == 0)) {
+                            mUpdatesPending = 0;
+                            update = true;
+                            mShowsPending++;
+                        }
+                    }
+
+                    if (update) {
+                        // Run the kernel
+                        runTest();
+
+                        // If we are not outputting directly to the TextureView we need to copy from
+                        // our temporary buffer.
+                        if (mOutDisplayAllocation != mOutPixelsAllocation) {
+                            mOutDisplayAllocation.copyFrom(mOutPixelsAllocation);
+                        }
+
+                        // queue the update of the TextureView with the allocation contents
+                        mOutDisplayAllocation.ioSend();
+
+                        // Send our RS message handler a message so we know when this work has completed
+                        mRS.sendMessage(0, null);
+                    }
                 }
             }
 
         }
 
         public void update() {
+            // something UI related has changed, enqueue an update if one is not
+            // already pending.  Wake the worker if needed
             synchronized(this) {
-                if (mOp == 0) {
-                    mOp = 2;
+                if (mUpdatesPending < 2) {
+                    mUpdatesPending++;
+                    notifyAll();
                 }
-                notifyAll();
             }
         }
 
         public void setSurface(Surface s) {
-            synchronized(this) {
-                mOutSurface = s;
-                notifyAll();
-            }
-            //update();
+            mOutSurface = s;
+            update();
         }
 
         public void exit() {
@@ -311,6 +462,11 @@
             if (mOutPixelsAllocation != mOutDisplayAllocation) {
                 mOutPixelsAllocation.destroy();
             }
+
+            if (mTest != null) {
+                mTest.destroy();
+                mTest = null;
+            }
             mOutDisplayAllocation.destroy();
             mRS.destroy();
 
@@ -382,39 +538,33 @@
     private boolean mDoingBenchmark;
     public Processor mProcessor;
 
+    TestBase changeTest(IPTestListJB.TestName t, boolean setupUI) {
+        TestBase tb = IPTestListJB.newTest(t);
 
-    private Handler mHandler = new Handler() {
-        @Override
-        public void handleMessage(Message msg) {
-            mDisplayView.invalidate();
+        tb.createBaseTest(this);
+        if (setupUI) {
+            setupBars(tb);
         }
-    };
-
-    public void updateDisplay() {
-        mHandler.sendMessage(Message.obtain());
-        //mProcessor.update();
+        return tb;
     }
 
-    TestBase changeTest(int id) {
+    TestBase changeTest(int id, boolean setupUI) {
         IPTestListJB.TestName t = IPTestListJB.TestName.values()[id];
-        TestBase tb = IPTestListJB.newTest(t);
-        tb.createBaseTest(this);
-        //setupBars(tb);
-        return tb;
+        return changeTest(t, setupUI);
     }
 
     public void onProgressChanged(SeekBar seekBar, int progress, boolean fromUser) {
         if (fromUser) {
             if (seekBar == mBar1) {
-                mProcessor.mTest.onBar1Changed(progress);
+                mBars[0] = progress;
             } else if (seekBar == mBar2) {
-                mProcessor.mTest.onBar2Changed(progress);
+                mBars[1] = progress;
             } else if (seekBar == mBar3) {
-                mProcessor.mTest.onBar3Changed(progress);
+                mBars[2] = progress;
             } else if (seekBar == mBar4) {
-                mProcessor.mTest.onBar4Changed(progress);
+                mBars[3] = progress;
             } else if (seekBar == mBar5) {
-                mProcessor.mTest.onBar5Changed(progress);
+                mBars[4] = progress;
             }
             mProcessor.update();
         }
@@ -516,22 +666,11 @@
         finish();
     }
 
-    @Override
-    protected void onResume() {
-        super.onResume();
-        Intent i = getIntent();
-        mTestList = i.getIntArrayExtra("tests");
 
-        mToggleIO = i.getBooleanExtra("enable io", false);
-        mToggleDVFS = i.getBooleanExtra("enable dvfs", false);
-        mToggleLong = i.getBooleanExtra("enable long", false);
-        mTogglePause = i.getBooleanExtra("enable pause", false);
-        mBitmapWidth = i.getIntExtra("resolution X", 0);
-        mBitmapHeight = i.getIntExtra("resolution Y", 0);
-
-        mTestResults = new float[mTestList.length];
-
-        hideBars();
+    void startProcessor() {
+        if (!mDemoMode) {
+            hideBars();
+        }
 
         Point size = new Point();
         getWindowManager().getDefaultDisplay().getSize(size);
@@ -561,8 +700,33 @@
         mDisplayView.mHeight = th;
         //mDisplayView.setTransform(new android.graphics.Matrix());
 
-        mProcessor = new Processor(RenderScript.create(this), mDisplayView, true);
+        mProcessor = new Processor(RenderScript.create(this), mDisplayView, !mDemoMode);
         mDisplayView.setSurfaceTextureListener(this);
+
+        if (mDemoMode) {
+            mProcessor.mTest = changeTest(mTestList[0], true);
+        }
+    }
+
+    @Override
+    protected void onResume() {
+        super.onResume();
+        Intent i = getIntent();
+        mTestList = i.getIntArrayExtra("tests");
+
+        mToggleIO = i.getBooleanExtra("enable io", false);
+        mToggleDVFS = i.getBooleanExtra("enable dvfs", false);
+        mToggleLong = i.getBooleanExtra("enable long", false);
+        mTogglePause = i.getBooleanExtra("enable pause", false);
+        mToggleAnimate = i.getBooleanExtra("enable animate", false);
+        mToggleDisplay = i.getBooleanExtra("enable display", false);
+        mBitmapWidth = i.getIntExtra("resolution X", 0);
+        mBitmapHeight = i.getIntExtra("resolution Y", 0);
+        mDemoMode = i.getBooleanExtra("demo", false);
+
+        mTestResults = new float[mTestList.length];
+
+        startProcessor();
     }
 
     protected void onDestroy() {
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTest.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTest.java
new file mode 100644
index 0000000..9a9086b
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTest.java
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.imagejb;
+
+
+import android.os.Bundle;
+import android.util.Log;
+
+import com.android.rs.imagejb.IPTestListJB.TestName;
+import com.android.rs.imagejb.ImageProcessingTestRunner;
+
+import android.test.ActivityInstrumentationTestCase2;
+import android.test.suitebuilder.annotation.LargeTest;
+
+/**
+ * ImageProcessing benchmark test.
+ * To run the test, please use command
+ *
+ * adb shell am instrument -e iteration <n> -w com.android.rs.imagejb/.ImageProcessingTestRunner
+ *
+ */
+public class ImageProcessingTest extends ActivityInstrumentationTestCase2<ImageProcessingActivityJB> {
+    private final String TAG = "ImageProcessingTest";
+    private final String TEST_NAME = "Testname";
+    private final String ITERATIONS = "Iterations";
+    private final String BENCHMARK = "Benchmark";
+    private static int INSTRUMENTATION_IN_PROGRESS = 2;
+    private int mIteration;
+    private ImageProcessingActivityJB mActivity;
+
+    public ImageProcessingTest() {
+        super(ImageProcessingActivityJB.class);
+    }
+
+
+    protected void prepareTest(int test) {
+        /*
+        mActivity.mTestList = new int[1];
+        mActivity.mTestList[0] = test;
+
+        mActivity.mBitmapWidth = 1920;
+        mActivity.mBitmapHeight = 1080;
+
+        mActivity.mTestResults = new float[1];
+
+        mActivity.startProcessor();*/
+    }
+
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        setActivityInitialTouchMode(false);
+        mActivity = getActivity();
+        ImageProcessingTestRunner mRunner = (ImageProcessingTestRunner) getInstrumentation();
+        mIteration = mRunner.mIteration;
+        assertTrue("please enter a valid iteration value", mIteration > 0);
+   }
+
+    @Override
+    public void tearDown() throws Exception {
+        super.tearDown();
+    }
+
+    class TestAction implements Runnable {
+        TestName mTestName;
+        float mResult;
+        public TestAction(TestName testName) {
+            mTestName = testName;
+        }
+        public void run() {
+            mActivity.changeTest(mTestName, false);
+            //mResult = mActivity.getBenchmark();
+            Log.v(TAG, "Benchmark for test \"" + mTestName.toString() + "\" is: " + mResult);
+            synchronized(this) {
+                this.notify();
+            }
+        }
+        public float getBenchmark() {
+            return mResult;
+        }
+    }
+
+    // Post the action to the UI thread, then block the calling test thread
+    // on the action's monitor until the action calls notify() when it finishes.
+    public void runOnUiThread(Runnable action) {
+        synchronized(action) {
+            mActivity.runOnUiThread(action);
+            try {
+                action.wait();
+            } catch (InterruptedException e) {
+                Log.v(TAG, "waiting for action running on UI thread is interrupted: " +
+                        e.toString());
+            }
+        }
+    }
+
+    public void runTest(TestAction ta, String testName) {
+        float sum = 0;
+        for (int i = 0; i < mIteration; i++) {
+            runOnUiThread(ta);
+            float bmValue = ta.getBenchmark();
+            Log.v(TAG, "results for iteration " + i + " is " + bmValue);
+            sum += bmValue;
+        }
+        float avgResult = sum/mIteration;
+
+        // post result to INSTRUMENTATION_STATUS
+        Bundle results = new Bundle();
+        results.putString(TEST_NAME, testName);
+        results.putInt(ITERATIONS, mIteration);
+        results.putFloat(BENCHMARK, avgResult);
+        getInstrumentation().sendStatus(INSTRUMENTATION_IN_PROGRESS, results);
+    }
+
+    // Test case 0: Levels Vec3 Relaxed
+    @LargeTest
+    public void testLevelsVec3Relaxed() {
+        TestAction ta = new TestAction(TestName.LEVELS_VEC3_RELAXED);
+        runTest(ta, TestName.LEVELS_VEC3_RELAXED.name());
+    }
+
+    // Test case 1: Levels Vec4 Relaxed
+    @LargeTest
+    public void testLevelsVec4Relaxed() {
+        TestAction ta = new TestAction(TestName.LEVELS_VEC4_RELAXED);
+        runTest(ta, TestName.LEVELS_VEC4_RELAXED.name());
+    }
+
+    // Test case 2: Levels Vec3 Full
+    @LargeTest
+    public void testLevelsVec3Full() {
+        TestAction ta = new TestAction(TestName.LEVELS_VEC3_FULL);
+        runTest(ta, TestName.LEVELS_VEC3_FULL.name());
+    }
+
+    // Test case 3: Levels Vec4 Full
+    @LargeTest
+    public void testLevelsVec4Full() {
+        TestAction ta = new TestAction(TestName.LEVELS_VEC4_FULL);
+        runTest(ta, TestName.LEVELS_VEC4_FULL.name());
+    }
+
+    // Test case 4: Blur Radius 25
+    @LargeTest
+    public void testBlurRadius25() {
+        TestAction ta = new TestAction(TestName.BLUR_RADIUS_25);
+        runTest(ta, TestName.BLUR_RADIUS_25.name());
+    }
+
+    // Test case 5: Intrinsic Blur Radius 25
+    @LargeTest
+    public void testIntrinsicBlurRadius25() {
+        TestAction ta = new TestAction(TestName.INTRINSIC_BLUR_RADIUS_25);
+        runTest(ta, TestName.INTRINSIC_BLUR_RADIUS_25.name());
+    }
+
+    // Test case 6: Greyscale
+    @LargeTest
+    public void testGreyscale() {
+        TestAction ta = new TestAction(TestName.GREYSCALE);
+        runTest(ta, TestName.GREYSCALE.name());
+    }
+
+    // Test case 7: Grain
+    @LargeTest
+    public void testGrain() {
+        TestAction ta = new TestAction(TestName.GRAIN);
+        runTest(ta, TestName.GRAIN.name());
+    }
+
+    // Test case 8: Fisheye Full
+    @LargeTest
+    public void testFisheyeFull() {
+        TestAction ta = new TestAction(TestName.FISHEYE_FULL);
+        runTest(ta, TestName.FISHEYE_FULL.name());
+    }
+
+    // Test case 9: Fisheye Relaxed
+    @LargeTest
+    public void testFishEyeRelaxed() {
+        TestAction ta = new TestAction(TestName.FISHEYE_RELAXED);
+        runTest(ta, TestName.FISHEYE_RELAXED.name());
+    }
+
+    // Test case 10: Fisheye Approximate Full
+    @LargeTest
+    public void testFisheyeApproximateFull() {
+        TestAction ta = new TestAction(TestName.FISHEYE_APPROXIMATE_FULL);
+        runTest(ta, TestName.FISHEYE_APPROXIMATE_FULL.name());
+    }
+
+    // Test case 11: Fisheye Approximate Relaxed
+    @LargeTest
+    public void testFisheyeApproximateRelaxed() {
+        TestAction ta = new TestAction(TestName.FISHEYE_APPROXIMATE_RELAXED);
+        runTest(ta, TestName.FISHEYE_APPROXIMATE_RELAXED.name());
+    }
+
+    // Test case 12: Vignette Full
+    @LargeTest
+    public void testVignetteFull() {
+        TestAction ta = new TestAction(TestName.VIGNETTE_FULL);
+        runTest(ta, TestName.VIGNETTE_FULL.name());
+    }
+
+    // Test case 13: Vignette Relaxed
+    @LargeTest
+    public void testVignetteRelaxed() {
+        TestAction ta = new TestAction(TestName.VIGNETTE_RELAXED);
+        runTest(ta, TestName.VIGNETTE_RELAXED.name());
+    }
+
+    // Test case 14: Vignette Approximate Full
+    @LargeTest
+    public void testVignetteApproximateFull() {
+        TestAction ta = new TestAction(TestName.VIGNETTE_APPROXIMATE_FULL);
+        runTest(ta, TestName.VIGNETTE_APPROXIMATE_FULL.name());
+    }
+
+    // Test case 15: Vignette Approximate Relaxed
+    @LargeTest
+    public void testVignetteApproximateRelaxed() {
+        TestAction ta = new TestAction(TestName.VIGNETTE_APPROXIMATE_RELAXED);
+        runTest(ta, TestName.VIGNETTE_APPROXIMATE_RELAXED.name());
+    }
+
+    // Test case 16: Group Test (emulated)
+    @LargeTest
+    public void testGroupTestEmulated() {
+        TestAction ta = new TestAction(TestName.GROUP_TEST_EMULATED);
+        runTest(ta, TestName.GROUP_TEST_EMULATED.name());
+    }
+
+    // Test case 17: Group Test (native)
+    @LargeTest
+    public void testGroupTestNative() {
+        TestAction ta = new TestAction(TestName.GROUP_TEST_NATIVE);
+        runTest(ta, TestName.GROUP_TEST_NATIVE.name());
+    }
+
+    // Test case 18: Convolve 3x3
+    @LargeTest
+    public void testConvolve3x3() {
+        TestAction ta = new TestAction(TestName.CONVOLVE_3X3);
+        runTest(ta, TestName.CONVOLVE_3X3.name());
+    }
+
+    // Test case 19: Intrinsics Convolve 3x3
+    @LargeTest
+    public void testIntrinsicsConvolve3x3() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_CONVOLVE_3X3);
+        runTest(ta, TestName.INTRINSICS_CONVOLVE_3X3.name());
+    }
+
+    // Test case 20: ColorMatrix
+    @LargeTest
+    public void testColorMatrix() {
+        TestAction ta = new TestAction(TestName.COLOR_MATRIX);
+        runTest(ta, TestName.COLOR_MATRIX.name());
+    }
+
+    // Test case 21: Intrinsics ColorMatrix
+    @LargeTest
+    public void testIntrinsicsColorMatrix() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_COLOR_MATRIX);
+        runTest(ta, TestName.INTRINSICS_COLOR_MATRIX.name());
+    }
+
+    // Test case 22: Intrinsics ColorMatrix Grey
+    @LargeTest
+    public void testIntrinsicsColorMatrixGrey() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_COLOR_MATRIX_GREY);
+        runTest(ta, TestName.INTRINSICS_COLOR_MATRIX_GREY.name());
+    }
+
+    // Test case 23: Copy
+    @LargeTest
+    public void testCopy() {
+        TestAction ta = new TestAction(TestName.COPY);
+        runTest(ta, TestName.COPY.name());
+    }
+
+    // Test case 24: CrossProcess (using LUT)
+    @LargeTest
+    public void testCrossProcessUsingLUT() {
+        TestAction ta = new TestAction(TestName.CROSS_PROCESS_USING_LUT);
+        runTest(ta, TestName.CROSS_PROCESS_USING_LUT.name());
+    }
+
+    // Test case 25: Convolve 5x5
+    @LargeTest
+    public void testConvolve5x5() {
+        TestAction ta = new TestAction(TestName.CONVOLVE_5X5);
+        runTest(ta, TestName.CONVOLVE_5X5.name());
+    }
+
+    // Test case 26: Intrinsics Convolve 5x5
+    @LargeTest
+    public void testIntrinsicsConvolve5x5() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_CONVOLVE_5X5);
+        runTest(ta, TestName.INTRINSICS_CONVOLVE_5X5.name());
+    }
+
+    // Test case 27: Mandelbrot
+    @LargeTest
+    public void testMandelbrot() {
+        TestAction ta = new TestAction(TestName.MANDELBROT_FLOAT);
+        runTest(ta, TestName.MANDELBROT_FLOAT.name());
+    }
+
+    // Test case 28: Intrinsics Blend
+    @LargeTest
+    public void testIntrinsicsBlend() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_BLEND);
+        runTest(ta, TestName.INTRINSICS_BLEND.name());
+    }
+
+    // Test case 29: Intrinsics Blur 25 uchar
+    @LargeTest
+    public void testIntrinsicsBlur25G() {
+        TestAction ta = new TestAction(TestName.INTRINSICS_BLUR_25G);
+        runTest(ta, TestName.INTRINSICS_BLUR_25G.name());
+    }
+
+    // Test case 30: Vibrance
+    @LargeTest
+    public void testVibrance() {
+        TestAction ta = new TestAction(TestName.VIBRANCE);
+        runTest(ta, TestName.VIBRANCE.name());
+    }
+
+    // Test case 31: BWFilter
+    @LargeTest
+    public void testBWFilter() {
+        TestAction ta = new TestAction(TestName.BW_FILTER);
+        runTest(ta, TestName.BW_FILTER.name());
+    }
+
+    // Test case 32: Shadows
+    @LargeTest
+    public void testShadows() {
+        TestAction ta = new TestAction(TestName.SHADOWS);
+        runTest(ta, TestName.SHADOWS.name());
+    }
+
+    // Test case 33: Contrast
+    @LargeTest
+    public void testContrast() {
+        TestAction ta = new TestAction(TestName.CONTRAST);
+        runTest(ta, TestName.CONTRAST.name());
+    }
+
+    // Test case 34: Exposure
+    @LargeTest
+    public void testExposure(){
+        TestAction ta = new TestAction(TestName.EXPOSURE);
+        runTest(ta, TestName.EXPOSURE.name());
+    }
+
+    // Test case 35: White Balance
+    @LargeTest
+    public void testWhiteBalance() {
+        TestAction ta = new TestAction(TestName.WHITE_BALANCE);
+        runTest(ta, TestName.WHITE_BALANCE.name());
+    }
+
+    // Test case 36: Color Cube
+    @LargeTest
+    public void testColorCube() {
+        TestAction ta = new TestAction(TestName.COLOR_CUBE);
+        runTest(ta, TestName.COLOR_CUBE.name());
+    }
+
+    // Test case 37: Color Cube (3D Intrinsic)
+    @LargeTest
+    public void testColorCube3DIntrinsic() {
+        TestAction ta = new TestAction(TestName.COLOR_CUBE_3D_INTRINSIC);
+        runTest(ta, TestName.COLOR_CUBE_3D_INTRINSIC.name());
+    }
+/*
+    // Test case 38: Usage io
+    @LargeTest
+    public void testUsageIO() {
+        TestAction ta = new TestAction(TestName.USAGE_IO);
+        runTest(ta, TestName.USAGE_IO.name());
+    }
+    // Test case 39: Artistic 1
+    @LargeTest
+    public void testArtistic1() {
+        TestAction ta = new TestAction(TestName.ARTISTIC_1);
+        runTest(ta, TestName.ARTISTIC_1.name());
+    }
+
+    // Test case 40 Histogram
+    @LargeTest
+    public void testHistogram() {
+        TestAction ta = new TestAction(TestName.HISTOGRAM);
+        runTest(ta, TestName.HISTOGRAM.name());
+    }
+
+    // Test case 41: Mandelbrot fp64
+    @LargeTest
+    public void testMandelbrotfp64() {
+        TestAction ta = new TestAction(TestName.MANDELBROT_DOUBLE);
+        runTest(ta, TestName.MANDELBROT_DOUBLE.name());
+    }
+*/
+}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTestRunner.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTestRunner.java
new file mode 100644
index 0000000..5065741
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/ImageProcessingTestRunner.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.imagejb;
+
+import com.android.rs.imagejb.ImageProcessingTest;
+import android.os.Bundle;
+import android.test.InstrumentationTestRunner;
+import android.test.InstrumentationTestSuite;
+import junit.framework.TestSuite;
+
+/**
+ * Runs the ImageProcessing benchmark tests; the "iteration" argument is optional:
+ * adb shell am instrument -e iteration <n> -w com.android.rs.imagejb/.ImageProcessingTestRunner
+ */
+public class ImageProcessingTestRunner extends InstrumentationTestRunner {
+    public int mIteration = 5;  // default, replaced by the "iteration" argument when present
+
+    @Override
+    public TestSuite getAllTests() {
+        TestSuite suite = new InstrumentationTestSuite(this);
+        suite.addTestSuite(ImageProcessingTest.class);
+        return suite;
+    }
+
+    @Override
+    public void onCreate(Bundle icicle) {
+        super.onCreate(icicle);
+        // Tolerate a missing bundle and a non-String value (e.g. supplied with --ei).
+        Object iteration = (icicle != null) ? icicle.get("iteration") : null;
+        if (iteration != null) {
+            mIteration = Integer.parseInt(iteration.toString());
+        }
+    }
+}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/LevelsV4.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/LevelsV4.java
index 3829843..e696392 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/LevelsV4.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/LevelsV4.java
@@ -18,13 +18,7 @@
 
 import java.lang.Math;
 
-import android.renderscript.Allocation;
-import android.renderscript.Element;
-import android.renderscript.RenderScript;
 import android.renderscript.Matrix3f;
-import android.renderscript.Script;
-import android.renderscript.ScriptC;
-import android.renderscript.Type;
 import android.util.Log;
 import android.widget.SeekBar;
 import android.widget.TextView;
@@ -140,6 +134,12 @@
         setLevels();
     }
 
+    public void animateBars(float time) {  // animation hook: derives the saturation from time
+        mSaturation = time % 2.f;  // float remainder, so the magnitude stays below 2
+        setSaturation();  // defined elsewhere in this class; presumably applies mSaturation
+    }
+
+
     public void createTest(android.content.res.Resources res) {
         mScriptR = new ScriptC_levels_relaxed(mRS);
         mScriptF = new ScriptC_levels_full(mRS);
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Mandelbrot.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Mandelbrot.java
index 0b53ddf..649986d 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Mandelbrot.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Mandelbrot.java
@@ -18,12 +18,6 @@
 
 import java.lang.Math;
 
-import android.renderscript.Allocation;
-import android.renderscript.Element;
-import android.renderscript.RenderScript;
-import android.renderscript.Script;
-import android.renderscript.ScriptC;
-import android.renderscript.Type;
 import android.util.Log;
 import android.widget.SeekBar;
 import android.widget.TextView;
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Posterize.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Posterize.java
new file mode 100644
index 0000000..0e949b8
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Posterize.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.imagejb;
+
+import java.lang.Math;
+
+import android.renderscript.Allocation;
+import android.renderscript.Element;
+import android.renderscript.RenderScript;
+import android.renderscript.Script;
+import android.renderscript.ScriptC;
+import android.renderscript.Type;
+import android.renderscript.Short4;
+import android.util.Log;
+
+public class Posterize extends TestBase {  // drives posterize.rs through five recoloring passes
+    private ScriptC_posterize mScript;
+    boolean mUseInvokes;  // true: parameters go through invoke_setParams(); false: through set_*()
+
+    Posterize(boolean useInvoke) {
+        this.mUseInvokes = useInvoke;
+    }
+
+    public void createTest(android.content.res.Resources res) {
+        this.mScript = new ScriptC_posterize(mRS);
+    }
+
+    void setParams(float intensHigh, float intensLow, int r, int g, int b) {
+        if (!mUseInvokes) {
+            mScript.set_intensityLow(intensLow);
+            mScript.set_intensityHigh(intensHigh);
+            mScript.set_color(new Short4((short)r, (short)g, (short)b, (short)255));
+            return;
+        }
+        final short red = (short)r, green = (short)g, blue = (short)b;
+        mScript.invoke_setParams(intensHigh, intensLow, red, green, blue);
+    }
+
+    // Configures one intensity band and color, then runs the kernel from src into the output.
+    private void runPass(Allocation src, float high, float low, int r, int g, int b) {
+        setParams(high, low, r, g, b);
+        mScript.forEach_root(src, mOutPixelsAllocation);
+    }
+
+    public void runTest() {
+        mScript.set_inputImage(mInPixelsAllocation);
+        runPass(mInPixelsAllocation, .2f, 0.f, 255, 0, 0);
+        runPass(mOutPixelsAllocation, .4f, 0.2f, 0, 255, 0);
+        runPass(mOutPixelsAllocation, .6f, 0.4f, 0, 0, 255);
+        runPass(mOutPixelsAllocation, .8f, 0.6f, 255, 255, 0);
+        runPass(mOutPixelsAllocation, 1.0f, 0.8f, 0, 255, 255);
+    }
+}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Shadows.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Shadows.java
index d246d59..8b06965 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Shadows.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Shadows.java
@@ -16,9 +16,6 @@
 
 package com.android.rs.imagejb;
 
-import java.lang.Math;
-
-import android.renderscript.Allocation;
 
 public class Shadows extends TestBase {
     private ScriptC_shadows mScript;
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
index 3de9809..b2358f5 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/TestBase.java
@@ -18,24 +18,13 @@
 
 import android.app.Activity;
 import android.content.Context;
-import android.os.Bundle;
-import android.graphics.BitmapFactory;
-import android.graphics.Bitmap;
-import android.graphics.Canvas;
-import android.renderscript.ScriptC;
 import android.renderscript.RenderScript;
-import android.renderscript.Type;
 import android.renderscript.Allocation;
-import android.renderscript.Element;
-import android.renderscript.Script;
-import android.view.SurfaceView;
-import android.view.SurfaceHolder;
 import android.widget.ImageView;
 import android.widget.SeekBar;
 import android.widget.TextView;
 import android.view.View;
 import android.util.Log;
-import java.lang.Math;
 import android.widget.Spinner;
 
 public class TestBase  {
@@ -47,18 +36,6 @@
     protected Allocation mOutPixelsAllocation;
     protected ImageProcessingActivityJB act;
 
-    private class MessageProcessor extends RenderScript.RSMessageHandler {
-        ImageProcessingActivityJB mAct;
-
-        MessageProcessor(ImageProcessingActivityJB act) {
-            mAct = act;
-        }
-
-        public void run() {
-            mAct.updateDisplay();
-        }
-    }
-
     // Override to use UI elements
     public void onBar1Changed(int progress) {
     }
@@ -99,6 +76,9 @@
         return false;
     }
 
+    public void animateBars(float time) {  // no-op by default; a test may override it to animate
+    }
+
     public boolean onSpinner1Setup(Spinner s) {
         s.setVisibility(View.INVISIBLE);
         return false;
@@ -107,7 +87,6 @@
     public final void createBaseTest(ImageProcessingActivityJB ipact) {
         act = ipact;
         mRS = ipact.mProcessor.mRS;
-        mRS.setMessageHandler(new MessageProcessor(act));
 
         mInPixelsAllocation = ipact.mProcessor.mInPixelsAllocation;
         mInPixelsAllocation2 = ipact.mProcessor.mInPixelsAllocation2;
@@ -124,28 +103,6 @@
     public void runTest() {
     }
 
-    final public void runTestSendMessage() {
-        runTest();
-        mRS.sendMessage(0, null);
-    }
-
-    public void finish() {
-        mRS.finish();
-    }
-
     public void destroy() {
-        mRS.setMessageHandler(null);
-    }
-
-    public void updateBitmap(Bitmap b) {
-        mOutPixelsAllocation.copyTo(b);
-    }
-
-    // Override to configure specific benchmark config.
-    public void setupBenchmark() {
-    }
-
-    // Override to reset after benchmark.
-    public void exitBenchmark() {
     }
 }
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vibrance.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vibrance.java
index 09822a9..67498d0 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vibrance.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vibrance.java
@@ -16,10 +16,6 @@
 
 package com.android.rs.imagejb;
 
-import java.lang.Math;
-
-import android.renderscript.Allocation;
-
 public class Vibrance extends TestBase {
     private ScriptC_vibrance mScript;
 
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
index 7984386..ed1d2cd 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
@@ -16,10 +16,6 @@
 
 package com.android.rs.imagejb;
 
-import android.renderscript.Allocation;
-import android.renderscript.Element;
-import android.renderscript.Sampler;
-import android.renderscript.Type;
 import android.widget.SeekBar;
 import android.widget.TextView;
 
@@ -72,6 +68,7 @@
         return true;
     }
 
+
     public void onBar1Changed(int progress) {
         scale = progress / 50.0f;
         do_init();
@@ -93,6 +90,11 @@
         do_init();
     }
 
+    public void animateBars(float time) {  // animation hook: derives scale from time
+        scale = time % 2.f;  // float remainder, so the magnitude stays below 2
+        do_init();  // same follow-up call that onBar1Changed makes after changing scale
+    }
+
     private void do_init() {
         if (approx) {
             if (relaxed)
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/WhiteBalance.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/WhiteBalance.java
index f15aaf5..25c4ff9 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/WhiteBalance.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/WhiteBalance.java
@@ -18,7 +18,6 @@
 
 import java.lang.Math;
 
-import android.renderscript.Allocation;
 
 public class WhiteBalance extends TestBase {
     private ScriptC_wbalance mScript;
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/posterize.rs b/java/tests/ImageProcessing_jb/src/com/android/rs/image/posterize.rs
new file mode 100644
index 0000000..043ea5e
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/posterize.rs
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+rs_allocation inputImage;
+
+float intensityLow = 0.f;
+float intensityHigh;
+uchar4 color;
+const static float3 mono = {0.299f, 0.587f, 0.114f};
+
+// Stores the intensity band and the replacement color; alpha is always 255.
+void setParams(float intensHigh, float intensLow, uchar r, uchar g, uchar b) {
+    intensityHigh = intensHigh;
+    intensityLow = intensLow;
+    color = (uchar4){r, g, b, 255};
+}
+
+// Returns color when inputImage's pixel at (x, y) has intensity inside the band, else v_in.
+uchar4 RS_KERNEL root(uchar4 v_in, uint32_t x, uint32_t y) {
+    const uchar4 reference = rsGetElementAt_uchar4(inputImage, x, y);
+    const float luma = dot(rsUnpackColor8888(reference).rgb, mono);
+    const int inBand = (luma <= intensityHigh) && (luma >= intensityLow);
+    if (!inBand) {
+        return v_in;
+    }
+    return color;
+}
+
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 7dcbdf8..40c73b8 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -486,7 +486,7 @@
 }
 
 #ifndef RS_COMPATIBILITY_LIB
-void Allocation::NewBufferListener::onFrameAvailable() {
+void Allocation::NewBufferListener::onFrameAvailable(const BufferItem& /* item */) {
     intptr_t ip = (intptr_t)alloc;
     rsc->sendMessageToClient(&ip, RS_MESSAGE_TO_CLIENT_NEW_BUFFER, 0, sizeof(ip), true);
 }
@@ -498,7 +498,7 @@
     sp<IGraphicBufferProducer> bp;
     sp<IGraphicBufferConsumer> bc;
     BufferQueue::createBufferQueue(&bp, &bc);
-    mGrallocConsumer = new GrallocConsumer(this, bc);
+    mGrallocConsumer = new GrallocConsumer(this, bc, mHal.drvState.grallocFlags);
     bp->incStrong(NULL);
 
     mBufferListener = new NewBufferListener();
diff --git a/rsAllocation.h b/rsAllocation.h
index f197efc..065d7be 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -88,6 +88,8 @@
                 uint32_t shift;
                 uint32_t step;
             } yuv;
+
+            int grallocFlags;
         };
         mutable DrvState drvState;
 
@@ -183,7 +185,7 @@
         const android::renderscript::Context *rsc;
         const android::renderscript::Allocation *alloc;
 
-        virtual void onFrameAvailable();
+        virtual void onFrameAvailable(const BufferItem& item);
     };
 
     sp<NewBufferListener> mBufferListener;
diff --git a/rsGrallocConsumer.cpp b/rsGrallocConsumer.cpp
index c5d37b2..3239365 100644
--- a/rsGrallocConsumer.cpp
+++ b/rsGrallocConsumer.cpp
@@ -30,11 +30,16 @@
 namespace android {
 namespace renderscript {
 
-GrallocConsumer::GrallocConsumer(Allocation *a, const sp<IGraphicBufferConsumer>& bq) :
+GrallocConsumer::GrallocConsumer(Allocation *a, const sp<IGraphicBufferConsumer>& bq, int flags) :
     ConsumerBase(bq, true)
 {
     mAlloc = a;
-    mConsumer->setConsumerUsageBits(GRALLOC_USAGE_SW_READ_OFTEN);
+    if (flags == 0) {
+        flags = GRALLOC_USAGE_SW_READ_OFTEN | GRALLOC_USAGE_RENDERSCRIPT;
+    } else {
+        flags |= GRALLOC_USAGE_RENDERSCRIPT;
+    }
+    mConsumer->setConsumerUsageBits(flags);
     mConsumer->setMaxAcquiredBufferCount(2);
 
     uint32_t y = a->mHal.drvState.lod[0].dimY;
diff --git a/rsGrallocConsumer.h b/rsGrallocConsumer.h
index 9e4fc58..88b4440 100644
--- a/rsGrallocConsumer.h
+++ b/rsGrallocConsumer.h
@@ -44,7 +44,7 @@
   public:
     typedef ConsumerBase::FrameAvailableListener FrameAvailableListener;
 
-    GrallocConsumer(Allocation *, const sp<IGraphicBufferConsumer>& bq);
+    GrallocConsumer(Allocation *, const sp<IGraphicBufferConsumer>& bq, int flags);
 
     virtual ~GrallocConsumer();
     status_t lockNextBuffer();
diff --git a/scriptc/rs_allocation.rsh b/scriptc/rs_allocation.rsh
index 6f3f8d9..03bd1ca 100644
--- a/scriptc/rs_allocation.rsh
+++ b/scriptc/rs_allocation.rsh
@@ -374,8 +374,7 @@
 
 #endif // (defined(RS_VERSION) && (RS_VERSION >= 18))
 
-
-#if (defined(RS_VERSION) && (RS_VERSION >= 999))
+#if (defined(RS_VERSION) && (RS_VERSION >= 22))
 
 #define VOP(T)                                                                   \
     extern T __attribute__((overloadable))                                       \
@@ -424,7 +423,7 @@
 
 #undef VOP
 
-#endif //(defined(RS_VERSION) && (RS_VERSION >= 999))
+#endif //(defined(RS_VERSION) && (RS_VERSION >= 22))
 
 
 #endif