Arbitrary sub-rectangle handling for ColorMatrix.
AArch64 code for odd-length cases, and correct pointer offsets.
Change-Id: I28049a768a1e3e65611898904fa42bd295208871
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 0f3af5b..9b234f4 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -130,24 +130,32 @@
void (*column[4])(void);
void (*store)(void);
void (*load)(void);
+ void (*store_end)(void);
+ void (*load_end)(void);
} FunctionTab_t;
-extern "C" size_t rsdIntrinsicColorMatrix_int_K(
+extern "C" void rsdIntrinsicColorMatrix_int_K(
void *out, void const *in, size_t count,
FunctionTab_t const *fns,
int16_t const *mult, int32_t const *add);
-extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
- FunctionTab_t const *fns,
- uint32_t mask, int dt, int st);
-
-extern "C" size_t rsdIntrinsicColorMatrix_float_K(
+extern "C" void rsdIntrinsicColorMatrix_float_K(
void *out, void const *in, size_t count,
FunctionTab_t const *fns,
float const *mult, float const *add);
+/* The setup functions fill in function tables to be used by above functions;
+ * this code also eliminates jump-to-another-jump cases by short-circuiting
+ * empty functions. While it's not performance critical, it works out easier
+ * to write the set-up code in assembly than to try to expose the same symbols
+ * and write the code in C.
+ */
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+ FunctionTab_t *fns,
+ uint32_t mask, int dt, int st);
+
extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
- FunctionTab_t const *fns,
+ FunctionTab_t *fns,
uint32_t mask, int dt, int st);
#endif
@@ -874,8 +882,8 @@
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
- uchar *out = (uchar *)p->out;
- uchar *in = (uchar *)p->in;
+ uchar *out = (uchar *)p->out + outstep * xstart;
+ uchar *in = (uchar *)p->in + instep * xstart;
uint32_t x1 = xstart;
uint32_t x2 = xend;
@@ -902,15 +910,14 @@
}
#if defined(ARCH_ARM64_USE_INTRINSICS)
else {
- size_t done;
if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
- done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+ rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
} else {
- done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+ rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
}
- x1 += done;
- out += outstep * done;
- in += instep * done;
+ x1 += len;
+ out += outstep * len;
+ in += instep * len;
}
#endif
}
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
index 7a6d4c5..3fcfa25 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -595,8 +595,377 @@
ld1 {v20.4s}, [x1], #16
br x4
+colormatrix_int_stu1_end:
+ uqxtn v12.8b, v8.8h
+ tbz x2, #2, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v12.h}[1], [x0], #2
+1: tbz x2, #0, 1f
+ st1 {v12.b}[1], [x0], #1
+1: b colormatrix_int_realend
-/* size_t rsdIntrinsicColorMatrix_int_K(
+colormatrix_int_stu2_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ zip1 v12.16b, v12.16b, v13.16b
+ tbz x2, #2, 1f
+ st1 {v12.d}[1], [x0], #8
+1: tbz x2, #1, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #0, 1f
+ st1 {v12.h}[1], [x0], #2
+1: b colormatrix_int_realend
+
+colormatrix_int_stu3_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ movi v15.8b, #0
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_int_realend
+
+colormatrix_int_stu4_end:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ uqxtn v15.8b, v11.8h
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_int_realend
+
+
+colormatrix_int_ldu1_end:
+ tbz x2, #2, 1f
+ ld1 {v15.s}[3], [x1], #4
+1: tbz x2, #1, 1f
+ ld1 {v15.h}[5], [x1], #2
+1: tbz x2, #0, 1f
+ ld1 {v15.b}[9], [x1], #1
+1: uxtl2 v12.8h, v15.16b
+ br x4
+
+colormatrix_int_ldu2_end:
+ tbz x2, #2, 1f
+ ld1 {v15.d}[1], [x1], #8
+1: tbz x2, #1, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #0, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: uzp1 v14.16b, v15.16b, v15.16b
+ uzp2 v15.16b, v15.16b, v15.16b
+ uxtl v12.8h, v14.8b
+ uxtl v13.8h, v15.8b
+ br x4
+
+colormatrix_int_ldu3_end:
+ tbz x2, #2, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1: uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ br x4
+
+colormatrix_int_ldu4_end:
+ tbz x2, #2, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+ ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1: uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ uxtl v15.8h, v15.8b
+ br x4
+
+colormatrix_float_stu1_end:
+ fcvtzs v12.4s, v8.4s, #1
+ fcvtzs v13.4s, v16.4s, #1
+ sqrshrun v12.4h, v12.4s, #1
+ sqrshrun2 v12.8h, v13.4s, #1
+ uqxtn v12.8b, v12.8h
+ tbz x2, #2, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v12.h}[1], [x0], #2
+1: tbz x2, #0, 1f
+ st1 {v12.b}[1], [x0], #1
+1: b colormatrix_float_realend
+
+colormatrix_float_stu2_end:
+ fcvtzs v12.4s, v8.4s, #1
+ fcvtzs v13.4s, v9.4s, #1
+ fcvtzs v14.4s, v16.4s, #1
+ fcvtzs v15.4s, v17.4s, #1
+ sqrshrun v12.4h, v12.4s, #1
+ sqrshrun v13.4h, v13.4s, #1
+ sqrshrun v14.4h, v14.4s, #1
+ sqrshrun v15.4h, v15.4s, #1
+ zip1 v12.8h, v12.8h, v13.8h
+ zip1 v13.8h, v14.8h, v15.8h
+ uqxtn v12.8b, v12.8h
+ uqxtn2 v12.16b, v13.8h
+ tbz x2, #2, 1f
+ st1 {v12.d}[1], [x0], #8
+1: tbz x2, #1, 1f
+ st1 {v12.s}[1], [x0], #4
+1: tbz x2, #0, 1f
+ st1 {v12.h}[1], [x0], #2
+1: b colormatrix_float_realend
+
+colormatrix_float_stu3_end:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ uqxtn v12.8b, v24.8h
+ uqxtn v13.8b, v25.8h
+ uqxtn v14.8b, v26.8h
+ movi v15.8b, #0
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stu4_end:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v27.4s, v11.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ fcvtzs v31.4s, v19.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun v27.4h, v27.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ sqrshrun2 v27.8h, v31.4s, #1
+ uqxtn v12.8b, v24.8h
+ uqxtn v13.8b, v25.8h
+ uqxtn v14.8b, v26.8h
+ uqxtn v15.8b, v27.8h
+ tbz x2, #2, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1: tbz x2, #1, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+ st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1: tbz x2, #0, 1f
+ st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stf1_end:
+ tbz x2, #2, 1f
+ st1 {v16.4s}, [x0], #16
+1: tbz x2, #1, 1f
+ st1 {v8.d}[1], [x0], #8
+1: tbz x2, #0, 1f
+ st1 {v8.s}[1], [x0], #4
+1: b colormatrix_float_realend
+
+colormatrix_float_stf2_end:
+ tbz x2, #2, 1f
+ st2 {v16.4s, v17.4s}, [x0], #32
+1: tbz x2, #1, 1f
+ st2 {v8.s,v9.s}[2], [x0], #8
+ st2 {v8.s,v9.s}[3], [x0], #8
+1: tbz x2, #0, 1f
+ st2 {v8.s,v9.s}[1], [x0], #8
+1: b colormatrix_float_realend
+
+colormatrix_float_stf3_end:
+ movi v11.16b, #0
+ movi v19.16b, #0
+colormatrix_float_stf4_end:
+ tbz x2, #2, 1f
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+1: tbz x2, #1, 1f
+ st4 {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
+ st4 {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
+1: tbz x2, #0, 1f
+ st4 {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
+1: b colormatrix_float_realend
+
+colormatrix_float_ldu1_end:
+ tbz x2, #2, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #1, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: tbz x2, #0, 1f
+ ld1 {v15.b}[1], [x1], #1
+1: uxtl v15.8h, v15.8b
+ uxtl v12.4s, v15.4h
+ uxtl2 v20.4s, v15.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v20.4s, v20.4s
+ br x4
+
+colormatrix_float_ldu2_end:
+ tbz x2, #2, 1f
+ ld1 {v15.d}[1], [x1], #8
+1: tbz x2, #1, 1f
+ ld1 {v15.s}[1], [x1], #4
+1: tbz x2, #0, 1f
+ ld1 {v15.h}[1], [x1], #2
+1: uxtl v14.8h, v15.8b
+ uxtl2 v15.8h, v15.16b
+ uzp1 v12.8h, v14.8h, v14.8h
+ uzp2 v13.8h, v14.8h, v14.8h
+ uzp1 v20.8h, v15.8h, v15.8h
+ uzp2 v21.8h, v15.8h, v15.8h
+ uxtl v12.4s, v12.4h
+ uxtl v13.4s, v13.4h
+ uxtl v20.4s, v20.4h
+ uxtl v21.4s, v21.4h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ br x4
+
+colormatrix_float_ldu3_end:
+ tbz x2, #2, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1: uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ br x4
+
+colormatrix_float_ldu4_end:
+ tbz x2, #2, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1: tbz x2, #1, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
+ ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1: tbz x2, #0, 1f
+ ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
+1: uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v23.8h, v23.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl v15.4s, v23.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ uxtl2 v23.4s, v23.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v15.4s, v15.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ br x4
+
+colormatrix_float_ldf1_end:
+ tbz x2, #2, 1f
+ ld1 {v20.4s}, [x1], #16
+1: tbz x2, #1, 1f
+ ld1 {v12.d}[1], [x1], #8
+1: tbz x2, #0, 1f
+ ld1 {v12.s}[1], [x1], #4
+1: br x4
+
+colormatrix_float_ldf2_end:
+ tbz x2, #2, 1f
+ ld2 {v20.4s,v21.4s}, [x1], #32
+1: tbz x2, #1, 1f
+ ld2 {v12.s,v13.s}[2], [x1], #8
+ ld2 {v12.s,v13.s}[3], [x1], #8
+1: tbz x2, #0, 1f
+ ld2 {v12.s,v13.s}[1], [x1], #8
+1: br x4
+
+colormatrix_float_ldf3_end:
+colormatrix_float_ldf4_end:
+ tbz x2, #2, 1f
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+1: tbz x2, #1, 1f
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
+1: tbz x2, #0, 1f
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
+1: br x4
+
+/* void rsdIntrinsicColorMatrix_int_K(
* void *out, // x0
* void const *in, // x1
* size_t count, // x2
@@ -605,7 +974,6 @@
* int32_t const *add); // x5
*/
ENTRY(rsdIntrinsicColorMatrix_int_K)
- stp x8,x9, [sp, #-16]!
sub x7, sp, #32
sub sp, sp, #64
st1 {v8.1d-v11.1d}, [sp]
@@ -636,10 +1004,23 @@
br x9
colormatrix_int_end:
+ adds x2, x2, #8
+ bls colormatrix_int_realend
+ mov x16, x8
+ ldp x8, x9, [x3], #16
+ cmp x4, x16
+ csel x4, x8, x4, eq
+ cmp x5, x16
+ csel x5, x8, x5, eq
+ cmp x6, x16
+ csel x6, x8, x6, eq
+ cmp x7, x16
+ csel x7, x8, x7, eq
+ br x9
+
+colormatrix_int_realend:
ld1 {v8.1d-v11.1d}, [sp], #32
ld1 {v12.1d-v15.1d}, [sp], #32
- ldp x8,x9, [sp], #16
- add x0, x2, #8
ret
END(rsdIntrinsicColorMatrix_int_K)
@@ -650,28 +1031,35 @@
* int st); // x3
*/
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
- adr x4, 2f
- ldrsh x2, [x4, x2, LSL #1]
- add x2, x2, x4
- adr x4, 3f
- ldrsh x3, [x4, x3, LSL #1]
- add x3, x3, x4
+ adr x7, 2f
+ add x4, x7, x2, LSL #2
+ ldrsh x2, [x4], #2
+ ldrsh x4, [x4]
+ add x2, x2, x7
+ add x4, x4, x7
+ adr x7, 3f
+ add x5, x7, x3, LSL #2
+ ldrsh x3, [x5], #2
+ ldrsh x5, [x5]
+ add x3, x3, x7
+ add x5, x5, x7
stp x2, x3, [x0, #32]
+ stp x4, x5, [x0, #48]
/* For each column function, if the matrix is all zeroes then write NULL,
* otherwise look up the appropriate function and store that. */
mov x3, #4
- adr x4, 4f
+ adr x7, 4f
1: ands x2, x1, #15
beq 9f
and x2, x1, #31
lsl x2, x2, #3
- ldrsh x2, [x4, x2]
- add x2, x2, x4
+ ldrsh x2, [x7, x2]
+ add x2, x2, x7
9: str x2, [x0], #8
lsr x1, x1, #5
- add x4, x4, #2
+ add x7, x7, #2
subs x3, x3, #1
bne 1b
@@ -690,13 +1078,21 @@
.align 4
2: .hword colormatrix_int_stu1-2b
+ .hword colormatrix_int_stu1_end-2b
.hword colormatrix_int_stu2-2b
+ .hword colormatrix_int_stu2_end-2b
.hword colormatrix_int_stu3-2b
+ .hword colormatrix_int_stu3_end-2b
.hword colormatrix_int_stu4-2b
+ .hword colormatrix_int_stu4_end-2b
3: .hword colormatrix_int_ldu1-3b
+ .hword colormatrix_int_ldu1_end-3b
.hword colormatrix_int_ldu2-3b
+ .hword colormatrix_int_ldu2_end-3b
.hword colormatrix_int_ldu3-3b
+ .hword colormatrix_int_ldu3_end-3b
.hword colormatrix_int_ldu4-3b
+ .hword colormatrix_int_ldu4_end-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.hword colormatrix_int_col0_\i-4b
@@ -713,7 +1109,7 @@
END(rsdIntrinsicColorMatrixSetup_int_K)
-/* size_t rsdIntrinsicColorMatrix_float_K(
+/* void rsdIntrinsicColorMatrix_float_K(
* void *out, // x0
* void const *in, // x1
* size_t count, // x2
@@ -722,7 +1118,6 @@
* float const *add); // x5
*/
ENTRY(rsdIntrinsicColorMatrix_float_K)
- stp x8,x9, [sp, #-16]!
sub x7, sp, #32
sub sp, sp, #64
st1 {v8.1d-v11.1d}, [sp]
@@ -753,10 +1148,23 @@
br x9
colormatrix_float_end:
+ adds x2, x2, #8
+ bls colormatrix_float_realend
+ mov x16, x8
+ ldp x8,x9, [x3], #16
+ cmp x4, x16
+ csel x4, x8, x4, eq
+ cmp x5, x16
+ csel x5, x8, x5, eq
+ cmp x6, x16
+ csel x6, x8, x6, eq
+ cmp x7, x16
+ csel x7, x8, x7, eq
+ br x9
+
+colormatrix_float_realend:
ld1 {v8.1d-v11.1d}, [sp], #32
ld1 {v12.1d-v15.1d}, [sp], #32
- ldp x8,x9, [sp], #16
- add x0, x2, #8
ret
END(rsdIntrinsicColorMatrix_float_K)
@@ -767,28 +1175,35 @@
* int st); // x3
*/
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
- adr x4, 2f
- ldrsh x2, [x4, x2, LSL #1]
- add x2, x2, x4
- adr x4, 3f
- ldrsh x3, [x4, x3, LSL #1]
- add x3, x3, x4
+ adr x7, 2f
+ add x4, x7, x2, LSL #2
+ ldrsh x2, [x4], #2
+ ldrsh x4, [x4]
+ add x2, x2, x7
+ add x4, x4, x7
+ adr x7, 3f
+ add x5, x7, x3, LSL #2
+ ldrsh x3, [x5], #2
+ ldrsh x5, [x5]
+ add x3, x3, x7
+ add x5, x5, x7
stp x2, x3, [x0, #32]
+ stp x4, x5, [x0, #48]
/* For each column function, if the matrix is all zeroes then write NULL,
* otherwise look up the appropriate function and store that. */
mov x3, #4
- adr x4, 4f
+ adr x7, 4f
1: ands x2, x1, #15
beq 9f
and x2, x1, #31
lsl x2, x2, #3
- ldrsh x2, [x4, x2]
- add x2, x2, x4
+ ldrsh x2, [x7, x2]
+ add x2, x2, x7
9: str x2, [x0], #8
lsr x1, x1, #5
- add x4, x4, #2
+ add x7, x7, #2
subs x3, x3, #1
bne 1b
@@ -807,21 +1222,37 @@
.align 4
2: .hword colormatrix_float_stu1-2b
+ .hword colormatrix_float_stu1_end-2b
.hword colormatrix_float_stu2-2b
+ .hword colormatrix_float_stu2_end-2b
.hword colormatrix_float_stu3-2b
+ .hword colormatrix_float_stu3_end-2b
.hword colormatrix_float_stu4-2b
+ .hword colormatrix_float_stu4_end-2b
.hword colormatrix_float_stf1-2b
+ .hword colormatrix_float_stf1_end-2b
.hword colormatrix_float_stf2-2b
+ .hword colormatrix_float_stf2_end-2b
.hword colormatrix_float_stf3-2b
+ .hword colormatrix_float_stf3_end-2b
.hword colormatrix_float_stf4-2b
+ .hword colormatrix_float_stf4_end-2b
3: .hword colormatrix_float_ldu1-3b
+ .hword colormatrix_float_ldu1_end-3b
.hword colormatrix_float_ldu2-3b
+ .hword colormatrix_float_ldu2_end-3b
.hword colormatrix_float_ldu3-3b
+ .hword colormatrix_float_ldu3_end-3b
.hword colormatrix_float_ldu4-3b
+ .hword colormatrix_float_ldu4_end-3b
.hword colormatrix_float_ldf1-3b
+ .hword colormatrix_float_ldf1_end-3b
.hword colormatrix_float_ldf2-3b
+ .hword colormatrix_float_ldf2_end-3b
.hword colormatrix_float_ldf3-3b
+ .hword colormatrix_float_ldf3_end-3b
.hword colormatrix_float_ldf4-3b
+ .hword colormatrix_float_ldf4_end-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
.hword colormatrix_float_col0_\i-4b