Merge "Subrectangle bug fixes to ARM Blur assembly."
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index ee7b034..91d77ff 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -1059,60 +1059,43 @@
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in v12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-PRIVATE(prefetch_clamp1)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            mov         v11.16b, v10.16b
-            sub         x1, x1, #16
-1:          mov         v12.16b, v11.16b
-            movi        v13.8b, #0xff
-            tbz         x11, #2, 1f
-            ext         v12.16b, v12.16b, v12.16b, #4*2
-            sub         x1, x1, #8
-            shl         v13.2d, v13.2d, #32
-1:          tbz         x11, #1, 1f
-            ext         v12.16b, v12.16b, v12.16b, #6*2
-            sub         x1, x1, #4
-            shl         v13.2d, v13.2d, #16
-1:          tbz         x11, #0, 1f
-            ext         v12.16b, v12.16b, v12.16b, #7*2
-            sub         x1, x1, #2
-            shl         v13.2d, v13.2d, #8
-1:          dup         v12.8h, v12.h[6]
-            sxtl        v13.8h, v13.8b
-            bif         v11.16b, v12.16b, v13.16b
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
+PRIVATE(prefetch_clampright1)
+            ands        x12, x11, #15
+            beq         1f
+            sub         x12, x12, #1
+            sub         sp, sp, #64
+            st1         {v10.8h,v11.8h}, [sp]
+            add         x12, sp, x12, LSL #1
+            ld1r        {v12.8h}, [x12]
+            st1         {v12.8h}, [x12], #16
+            st1         {v12.8h}, [x12]
+            ld1         {v10.8h,v11.8h}, [sp]
+            add         sp, sp, #64
             ret
-END(prefetch_clamp1)
+1:          dup         v12.8h, v11.h[7]
+            ret
+END(prefetch_clampright1)
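
For reference, a rough C model of the new prefetch_clampright1 path; the helper name clamp_right1 and its scalar variables are illustrative only and not part of the patch, with win[] standing in for the v10/v11 window and remain for x11 (read here as a count of valid halfwords, as the index arithmetic implies):

    #include <stdint.h>

    /* Right-edge clamp, 1-channel case: replicate the last valid halfword
     * across the rest of the 16-lane window, and return that value (the
     * assembly leaves it duplicated in v12 for the tail-fill path). */
    static uint16_t clamp_right1(uint16_t win[16], int remain)
    {
        int valid = remain & 15;           /* ands x12, x11, #15             */
        if (valid == 0)
            return win[15];                /* fast-out: dup v12.8h, v11.h[7] */
        uint16_t last = win[valid - 1];    /* ld1r from the spilled window   */
        for (int i = valid; i < 16; i++)
            win[i] = last;                 /* the two st1 {v12.8h} stores    */
        return last;
    }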
 
-PRIVATE(prefetch_clamp4)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            sub         x1, x1, #16     // what's this?
-            mov         v11.16b, v10.16b
-1:          dup         v12.2d, v11.d[1]
-            tbz         x11, #2, 1f
-            dup         v12.2d, v11.d[0]
-            sub         x1, x1, #8
-            dup         v11.2d, v11.d[0]
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
+PRIVATE(prefetch_clampright4)
+            ands        x12, x11, #15
+            beq         1f
+            sub         x12, x12, #4
+            sub         sp, sp, #64
+            st1         {v10.8h,v11.8h}, [sp]
+            add         x12, sp, x12, LSL #1
+            ld1r        {v12.2d}, [x12]
+            st1         {v12.8h}, [x12], #16
+            st1         {v12.8h}, [x12]
+            ld1         {v10.8h,v11.8h}, [sp]
+            add         sp, sp, #64
             ret
-END(prefetch_clamp4)
+1:          dup         v12.2d, v11.d[1]
+            ret
+END(prefetch_clampright4)
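
The 4-channel variant follows the same pattern at pixel granularity. A comparable sketch, again with illustrative names only (tail[] models the pad pixel handed back in v12):

    #include <stdint.h>

    /* Right-edge clamp, 4-channel case: treat the 16-halfword window as
     * four 4-channel pixels, copy the last valid pixel across the
     * remainder, and also return a copy of it for the tail-fill path. */
    static void clamp_right4(uint16_t win[16], int remain, uint16_t tail[4])
    {
        int valid = remain & 15;                     /* a whole number of pixels here */
        if (valid != 0) {
            const uint16_t *last = &win[valid - 4];  /* ld1r {v12.2d} source          */
            for (int i = valid; i < 16; i++)
                win[i] = last[i % 4];                /* the two st1 {v12.8h} stores   */
        }
        for (int c = 0; c < 4; c++)
            tail[c] = win[12 + c];                   /* pad pixel left in v12         */
    }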
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1130,10 @@
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
 4:          b           4f+4
-           //v12 contains pad word from prefetch_clamp call
+           //v12 contains the pad word from the prefetch_clampright call
             prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
   .if \rem > 0
             b           4f+4
@@ -1209,24 +1192,18 @@
   .else
             dup         v9.2d, v10.d[0]
   .endif
-            tst         x10, #15
+            ands        x12, x10, #15
             beq         2f
-            sub         x12, xzr, x10
-            tbz         x10, #3, 1f
-            mov         v11.16b, v10.16b
-            mov         v10.16b, v9.16b
-1:          tbz         x12, #2, 1f
-            ext         v11.16b, v10.16b, v11.16b, #4*2
-            ext         v10.16b, v9.16b, v10.16b, #4*2
-  .if \step == 1
-  1:        tbz         x12, #1, 1f
-            ext         v11.16b, v10.16b, v11.16b, #2*2
-            ext         v10.16b, v9.16b, v10.16b, #2*2
-  1:        tbz         x12, #0, 1f
-            ext         v11.16b, v10.16b, v11.16b, #1*2
-            ext         v10.16b, v9.16b, v10.16b, #1*2
-  .endif
-1:          sub         x1, x1, x10
+            sub         sp, sp, #32
+            st1         {v10.8h,v11.8h}, [sp]
+            sub         x12, sp, x12, LSL #1
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            ld1         {v10.8h,v11.8h}, [x12]
+            add         sp, sp, #64
+            sub         x1, x1, x10
             sub         x15, x15, x10
             sub         x19, x19, x10
             bic         x10, x10, #15
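
The rewritten left-edge clamp above uses the same store-and-reload trick: v9, the duplicated edge value, is stored twice just below the spilled window, and the register pair is re-read x10 & 15 halfwords earlier, which shifts the window right and fills the leading lanes with the edge value. A rough C model for the \step == 1 case, with illustrative names (pad models v9, n models x10 & 15); for \step == 4 the pad is a whole 4-channel pixel but the mechanism is the same:

    #include <stdint.h>

    /* Left-edge clamp: prepend n copies of the edge value and drop the
     * same number of halfwords off the right of the window. */
    static void clamp_left1(uint16_t win[16], uint16_t pad, int n)
    {
        if (n == 0)                       /* ands x12, x10, #15; beq 2f  */
            return;
        uint16_t buf[32];
        for (int i = 0; i < 16; i++)
            buf[i] = pad;                 /* the two st1 {v9.8h} stores  */
        for (int i = 0; i < 16; i++)
            buf[16 + i] = win[i];         /* st1 {v10.8h,v11.8h}, [sp]   */
        for (int i = 0; i < 16; i++)
            win[i] = buf[16 - n + i];     /* ld1 {v10.8h,v11.8h}, [x12]  */
    }
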
@@ -1363,13 +1340,13 @@
             b           3b
 4:          tbz         x3, #2, 1f
             st1         {v15.s}[0], [x0], #4
-            ext         v15.16b, v15.16b, v15.16b, #4*2
+            ext         v15.8b, v15.8b, v15.8b, #4
 1:          tbz         x3, #1, 1f
             st1         {v15.h}[0], [x0], #2
-            ext         v15.16b, v15.16b, v15.16b, #2*2
+            ext         v15.8b, v15.8b, v15.8b, #2
 1:          tbz         x3, #0, 5f
             st1         {v15.b}[0], [x0], #1
-            ext         v15.16b, v15.16b, v15.16b, #1*2
+            ext         v15.8b, v15.8b, v15.8b, #1
 5:          nop
 .endm
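
The hunk above fixes the tail store: by this point v15 holds packed byte output (the stores are 4, 2 and 1 bytes wide), so after each partial store the remaining bytes must be rotated down by the number of bytes written rather than by the halfword-scaled amounts used before. A rough C model with illustrative names (remain plays the role of x3, out of x0):

    #include <stdint.h>
    #include <string.h>

    /* Store the final 0..7 output bytes, consuming the result vector from
     * its low end; each store is followed by a shift of the leftover
     * bytes, matching the corrected ext immediates. */
    static uint8_t *store_tail(uint8_t *out, const uint8_t v15[8], int remain)
    {
        int pos = 0;
        if (remain & 4) { memcpy(out, v15 + pos, 4); out += 4; pos += 4; }
        if (remain & 2) { memcpy(out, v15 + pos, 2); out += 2; pos += 2; }
        if (remain & 1) { *out++ = v15[pos]; }
        return out;
    }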
 
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index 8fc47f5..593e10c 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -15,6 +15,7 @@
  */
 
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
 #define END(f) .fnend; .size f, .-f;
 
 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
@@ -1049,7 +1050,7 @@
 /* Dedicated function wrapper for the fetch macro, for the cases where
  * performance isn't that important, to keep code size down.
  */
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
             push        {r10,r11}
             fetch
             pop         {r10,r11}
@@ -1060,61 +1061,46 @@
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in q12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-ENTRY(prefetch_clamp1)
-            rsb         r11, r11, #0
-            tst         r11, #8
+PRIVATE(prefetch_clampright1)
+            ands        r12, r11, #15
             beq         1f
-            vmov.u16    q11, q10
-            sub         r1, r1, #16
-1:          vmov.u16    q12, q11
-            vmov.i8     d26, #0xff
-            tst         r11, #4
-            beq         1f
-            vext.u16    q12, q12, q12, #4
-            sub         r1, r1, #8
-            vshl.u64    d26, d26, #32
-1:          tst         r11, #2
-            beq         1f
-            vext.u16    q12, q12, q12, #6
-            sub         r1, r1, #4
-            vshl.u64    d26, d26, #16
-1:          tst         r11, #1
-            beq         1f
-            vext.u16    q12, q12, q12, #7
-            sub         r1, r1, #2
-            vshl.u64    d26, d26, #8
-1:          vdup.u16    q12, d25[2]
-            vmovl.s8    q13, d26
-            vbif        q11, q12, q13
-1:          tst         r11, #8
-            beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #1
+            sub         sp, sp, #64
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1
+            vld1.u16    {d24[]}, [r12]
+            vld1.u16    {d25[]}, [r12]
+            vst1.u16    {q12}, [r12]!
+            vst1.u16    {q12}, [r12]
+            vld1.u16    {q10,q11}, [sp]
+            add         sp, sp, #64
             bx          lr
-END(prefetch_clamp1)
+1:          vdup.u16    q12, d23[3]
+            bx          lr
+END(prefetch_clampright1)
 
-ENTRY(prefetch_clamp4)
-            rsb         r11, r11, #0
-            tst         r11, #8
+PRIVATE(prefetch_clampright4)
+            ands        r12, r11, #15
             beq         1f
-            sub         r1, r1, #16
-            vmov.u16    q11, q10
-1:          vmov        d24, d23
-            tst         r11, #4
-            beq         1f
-            vmov        d24, d22
-            sub         r1, r1, #8
-            vmov        d23, d22
-1:          vmov        d25, d24
-            tst         r11, #8
-            beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #4
+            sub         sp, sp, #64
+            vst1.u16    {q10,q11}, [sp]
+            add         r12, sp, r12, LSL #1
+            vld1.u64    {d24}, [r12]
+            vld1.u64    {d25}, [r12]
+            vst1.u16    {q12}, [r12]!
+            vst1.u16    {q12}, [r12]
+            vld1.u16    {q10,q11}, [sp]
+            add         sp, sp, #64
             bx          lr
-END(prefetch_clamp4)
+1:          vmov.u16    d24, d23
+            vmov.u16    d25, d23
+            bx          lr
+END(prefetch_clampright4)
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1133,10 @@
             prefetch_out \qa, \qb, \store, q10, q11, d23
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, q10, q11, d23
 4:          b           4f+4
-            @q12 contains pad word from prefetch_clam call
+            @q12 contains the pad word from the prefetch_clampright call
             prefetch_out \qa, \qb, \store, q12, q12, d25
   .if \rem > 0
             b           4f+4
@@ -1205,28 +1191,18 @@
             vmov.u16    d18, d20
             vmov.u16    d19, d20
   .endif
-            tst         r10, #15
+            ands        r12, r10, #15
             beq         2f
-            rsb         r12, r10, #0
-            tst         r10, #8
-            beq         1f
-            vmov.u16    q11, q10
-            vmov.u16    q10, q9
-1:          tst         r12, #4
-            beq         1f
-            vext.u16    q11, q10, q11, #4
-            vext.u16    q10, q9, q10, #4
-  .if \step == 1
-  1:        tst         r12, #2
-            beq         1f
-            vext.u16    q11, q10, q11, #2
-            vext.u16    q10, q9, q10, #2
-  1:        tst         r12, #1
-            beq         1f
-            vext.u16    q11, q10, q11, #1
-            vext.u16    q10, q9, q10, #1
-  .endif
-1:          sub         r1, r1, r10
+            sub         sp, sp, #32
+            vst1.u16    {q10,q11}, [sp]
+            sub         r12, sp, r12, LSL #1
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            vld1.u16    {q10,q11}, [r12]
+            add         sp, sp, #64
+            sub         r1, r1, r10
             bic         r10, r10, #15
             add         r1, r1, r10
 2:
@@ -1383,7 +1359,7 @@
 .endm
 
 .irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
             push        {r12,lr}
 
             sub         r1, r1, r8
@@ -1397,7 +1373,7 @@
 .endr
 
 .irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
             sub         r12, sp, #0x200
             bic         r9, r12, #0x3fc
             mov         sp, r9