Merge changes I683078ff,I426fba9f,I2fcc57ac
* changes:
Optimisations to 3DLUT assembly.
Make Blur AArch64 assembly position-independent.
Add AArch64 assembly for ColorMatrix.
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 9cb4847..d44f872 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -40,8 +40,10 @@
LOCAL_ASFLAGS_arm64 += -no-integrated-as
#LOCAL_SRC_FILES_arm64 += \
+# rsCpuIntrinsics_advsimd_3DLUT.S \
# rsCpuIntrinsics_advsimd_Blend.S \
# rsCpuIntrinsics_advsimd_Blur.S \
+# rsCpuIntrinsics_advsimd_ColorMatrix.S \
# rsCpuIntrinsics_advsimd_YuvToRGB.S
ifeq ($(ARCH_ARM_HAVE_NEON),true)
@@ -52,9 +54,10 @@
LOCAL_CFLAGS_arm += -DARCH_ARM_HAVE_VFP
LOCAL_SRC_FILES_arm += \
rsCpuIntrinsics_neon.S \
- rsCpuIntrinsics_neon_ColorMatrix.S \
+ rsCpuIntrinsics_neon_3DLUT.S \
rsCpuIntrinsics_neon_Blend.S \
rsCpuIntrinsics_neon_Blur.S \
+ rsCpuIntrinsics_neon_ColorMatrix.S \
rsCpuIntrinsics_neon_YuvToRGB.S \
convolve/convolve_copy_neon.s \
convolve/convolve_avg_neon.s \
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index bfa3f73..c19eca3 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -52,9 +52,10 @@
mLUT.set(static_cast<Allocation *>(data));
}
-extern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut,
- size_t lut_stride_y, size_t lut_stride_z,
- uint32_t count, const void *constants);
+extern "C" size_t rsdIntrinsic3DLUT_K(void *dst, void const *in, size_t count,
+ void const *lut,
+ int32_t pitchy, int32_t pitchz,
+ int dimx, int dimy, int dimz);
void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
@@ -85,21 +86,18 @@
while (x1 < x2) {
#if defined(ARCH_ARM_HAVE_VFP)
if (gArchUseSIMD) {
- int32_t len = (x2 - x1 - 1) >> 1;
- if(len > 0) {
- const short neon_constants[] = {
- static_cast<short>(coordMul.x), static_cast<short>(coordMul.y),
- static_cast<short>(coordMul.z), 0, 0, 0, 0, static_cast<short>(0xffff),
+ int32_t len = x2 - x1;
+ if(len >= 8) {
+ size_t done;
+ done = len - rsdIntrinsic3DLUT_K(out, in, len,
+ bp, stride_y, stride_z,
+ dims.x, dims.y, dims.z);
- };
-
- rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
- x1 += len << 1;
- out += len << 1;
- in += len << 1;
+ x1 += done;
+ out += done;
+ in += done;
}
}
-
#endif
int4 baseCoord = convert_int4(*in) * coordMul;
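
A brief illustration of the new calling convention used above: the assembly kernels now take the total pixel count, consume whole groups of eight, and return the size of the unprocessed tail, so the caller derives done = len - returned and finishes the remainder in the scalar loop. A minimal C++ model of that contract (model3DLUT_K is an illustrative stand-in, not a function from this patch; the real lookup work is elided):

    #include <cstddef>

    // Models only the return-value contract of rsdIntrinsic3DLUT_K:
    // whole groups of 8 are consumed, the 0..7 tail is returned.
    static size_t model3DLUT_K(unsigned char *dst, const unsigned char *src,
                               size_t count) {
        size_t groups = count / 8;
        // ... the real kernel performs the trilinear lookup here ...
        (void)dst; (void)src; (void)groups;
        return count % 8;
    }

    // Caller pattern, as in kernel() above:
    //   size_t done = len - model3DLUT_K(out, in, len);
    //   out += done;  in += done;  x1 += done;
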
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 87db9ba..5d4241a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,6 +125,32 @@
} u;
} Key_t;
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+typedef struct {
+ void (*column[4])(void);
+ void (*store)(void);
+ void (*load)(void);
+} FunctionTab_t;
+
+extern "C" size_t rsdIntrinsicColorMatrix_int_K(
+ void *out, void const *in, size_t count,
+ FunctionTab_t const *fns,
+ int16_t const *mult, int32_t const *add);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+ FunctionTab_t const *fns,
+ uint32_t mask, int dt, int st);
+
+extern "C" size_t rsdIntrinsicColorMatrix_float_K(
+ void *out, void const *in, size_t count,
+ FunctionTab_t const *fns,
+ float const *mult, float const *add);
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
+ FunctionTab_t const *fns,
+ uint32_t mask, int dt, int st);
+#endif
+
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
virtual void populateScript(Script *);
@@ -146,9 +172,12 @@
// The following four fields are read as constants
// by the SIMD assembly code.
short ip[16];
- int ipa[16];
+ int ipa[4];
float tmpFp[16];
- float tmpFpa[16];
+ float tmpFpa[4];
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ FunctionTab_t mFnTab;
+#endif
static void kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
@@ -212,9 +241,9 @@
}
}
if (ipa[0] != 0) key.u.addMask |= 0x1;
- if (ipa[4] != 0) key.u.addMask |= 0x2;
- if (ipa[8] != 0) key.u.addMask |= 0x4;
- if (ipa[12] != 0) key.u.addMask |= 0x8;
+ if (ipa[1] != 0) key.u.addMask |= 0x2;
+ if (ipa[2] != 0) key.u.addMask |= 0x4;
+ if (ipa[3] != 0) key.u.addMask |= 0x8;
}
// Look for a dot product where the r,g,b columns are the same
@@ -257,13 +286,16 @@
case 3:
key.u.outVecSize = 2;
key.u.coeffMask &= ~0x8888;
+ key.u.addMask &= 7;
break;
case 2:
key.u.outVecSize = 1;
key.u.coeffMask &= ~0xCCCC;
+ key.u.addMask &= 3;
break;
default:
key.u.coeffMask &= ~0xEEEE;
+ key.u.addMask &= 1;
break;
}
@@ -278,7 +310,7 @@
return key;
}
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
#define DEF_SYM(x) \
extern "C" uint32_t _N_ColorMatrix_##x; \
@@ -408,7 +440,7 @@
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
mBufSize = 4096;
//StopWatch build_time("rs cm: build time");
mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -676,18 +708,12 @@
float add = 0.f;
if (fpMul > 254.f) add = 0.5f;
for(int ct=0; ct < 4; ct++) {
- tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
+ tmpFpa[ct] = fpa[ct] * addMul + add;
//ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
- tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
- tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
- tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
}
for(int ct=0; ct < 4; ct++) {
- ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
- ipa[ct * 4 + 1] = ipa[ct * 4];
- ipa[ct * 4 + 2] = ipa[ct * 4];
- ipa[ct * 4 + 3] = ipa[ct * 4];
+ ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
}
}
@@ -768,9 +794,9 @@
//ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
sum.x += add[0];
- sum.y += add[4];
- sum.z += add[8];
- sum.w += add[12];
+ sum.y += add[1];
+ sum.z += add[2];
+ sum.w += add[3];
//ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
@@ -826,12 +852,27 @@
//if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
if(x2 > x1) {
- int32_t len = (x2 - x1) >> 2;
- if((cp->mOptKernel != NULL) && (len > 0)) {
- cp->mOptKernel(out, in, cp->ip, len);
- x1 += len << 2;
- out += outstep * (len << 2);
- in += instep * (len << 2);
+ int32_t len = x2 - x1;
+ if (gArchUseSIMD) {
+ if((cp->mOptKernel != NULL) && (len >= 4)) {
+ cp->mOptKernel(out, in, cp->ip, len >> 2);
+ x1 += len;
+ out += outstep * len;
+ in += instep * len;
+ }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ else {
+ size_t done;
+ if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
+ done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+ } else {
+ done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+ }
+ x1 += done;
+ out += outstep * done;
+ in += instep * done;
+ }
+#endif
}
while(x1 != x2) {
@@ -872,8 +913,29 @@
mOptKernel = NULL;
if (build(key)) {
mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
- mLastKey = key;
}
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ else {
+ int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
+ int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
+ uint32_t mm = 0;
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ uint32_t m = (key.u.coeffMask >> i) & 0x1111;
+ m = ((m * 0x249) >> 9) & 15;
+ m |= ((key.u.addMask >> i) & 1) << 4;
+ mm |= m << (i * 5);
+ }
+
+ if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
+ rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+ } else {
+ rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+ }
+ }
+#endif
+ mLastKey = key;
}
}
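
One detail in the setup path above that is easy to miss: for each i, the loop gathers the coefficient bits at positions i, i+4, i+8 and i+12 of coeffMask, and the multiply by 0x249 (which has bits 0, 3, 6 and 9 set) slides copies of those four bits so that they line up at bit positions 9..12; shifting right by 9 and masking with 15 therefore compacts them into bits 0..3. The matching addMask bit is appended as a fifth bit, giving one 5-bit selector for each of the four column functions packed into mm. A small self-contained check of the bit trick (function and variable names are illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Compacts the bits at positions i, i+4, i+8, i+12 into bits 0..3.
    static uint32_t compact_bits(uint32_t coeffMask, int i) {
        uint32_t m = (coeffMask >> i) & 0x1111;
        return ((m * 0x249) >> 9) & 15;
    }

    int main() {
        printf("%x\n", compact_bits(0x0101, 0)); // bits 0 and 8 set -> prints 5
        return 0;
    }
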
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
new file mode 100644
index 0000000..ebceb24
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
+
+ smov x6, \src0
+ smov x7, \src1
+
+ add x6, x6, x3
+ add x7, x7, x3
+
+ ld1 {v16.2s}, [x6], x4
+ ld1 {v17.2s}, [x7], x4
+
+ ld1 {v18.2s}, [x6], x5
+ ld1 {v19.2s}, [x7], x5
+
+ dup v8.8b, \yr0
+ dup v9.8b, \yr1
+ /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
+ zip1 v12.16b, v5.16b, v16.16b
+ zip1 v13.16b, v5.16b, v17.16b
+ umlsl v12.8h, v16.8b, v8.8b
+ umlsl v13.8h, v17.8b, v9.8b
+ umlal v12.8h, v18.8b, v8.8b
+ umlal v13.8h, v19.8b, v9.8b
+
+ ld1 {v18.2s}, [x6]
+ ld1 {v19.2s}, [x7]
+
+ sub x6, x6, x4
+ sub x7, x7, x4
+
+ ld1 {v16.2s}, [x6]
+ ld1 {v17.2s}, [x7]
+
+ /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
+ zip1 v14.16b, v5.16b, v16.16b
+ zip1 v15.16b, v5.16b, v17.16b
+ umlsl v14.8h, v16.8b, v8.8b
+ umlsl v15.8h, v17.8b, v9.8b
+ umlal v14.8h, v18.8b, v8.8b
+ umlal v15.8h, v19.8b, v9.8b
+
+ /* Z interpolate, lane 0 v12/v14 -> v10 */
+ ushll v8.4s, v12.4h, #8
+ ushll2 v9.4s, v12.8h, #8
+ umlsl v8.4s, v12.4h, \zr0
+ umlsl2 v9.4s, v12.8h, \zr0
+ umlal v8.4s, v14.4h, \zr0
+ umlal2 v9.4s, v14.8h, \zr0
+ rshrn v10.4h, v8.4s, #8
+ rshrn2 v10.8h, v9.4s, #8
+
+ /* Z interpolate, lane 1 v13/v15 -> v11 */
+ ushll v8.4s, v13.4h, #8
+ ushll2 v9.4s, v13.8h, #8
+ umlsl v8.4s, v13.4h, \zr1
+ umlsl2 v9.4s, v13.8h, \zr1
+ umlal v8.4s, v15.4h, \zr1
+ umlal2 v9.4s, v15.8h, \zr1
+ rshrn v11.4h, v8.4s, #8
+ rshrn2 v11.8h, v9.4s, #8
+
+ /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
+ ushll v8.4s, v10.4h, #8
+ ushll v9.4s, v11.4h, #8
+ umlsl v8.4s, v10.4h, \xr0
+ umlsl v9.4s, v11.4h, \xr1
+ umlal2 v8.4s, v10.8h, \xr0
+ umlal2 v9.4s, v11.8h, \xr1
+ shrn v14.4h, v8.4s, #8
+ shrn2 v14.8h, v9.4s, #8
+
+ /* pack lanes 0-1 -> \dst */
+.ifc \dst, v20.16b
+ uqrshrn2 \dst, v14.8h, #8
+.else ; .ifc \dst, v21.16b
+ uqrshrn2 \dst, v14.8h, #8
+.else
+ uqrshrn \dst, v14.8h, #8
+.endif ; .endif
+.endm
+
+/* size_t rsdIntrinsic3DLUT_K(
+ * void *dst, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * void const *lut, // x3
+ * int32_t pitchy, // w4
+ * int32_t pitchz, // w5
+ * int dimx, // w6
+ * int dimy, // w7
+ * int dimz); // [sp]
+ */
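+
+/* Overview: each iteration handles eight pixels.  The channel values are
+ * scaled by the dimension arguments and split into an integer cell index
+ * (upper bits) and an 8-bit fractional weight; the indices are folded into
+ * byte offsets of the form x*4 + y*pitchy + z*pitchz; each pair of lanes is
+ * then interpolated along Y, then Z, then X using those weights, and the
+ * results are packed back together with the untouched source alpha (v3).
+ */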
+ENTRY(rsdIntrinsic3DLUT_K)
+ ldr w8, [sp]
+ stp d8, d9, [sp, #-64]!
+ stp d10, d11, [sp, #16]
+ stp d12, d13, [sp, #32]
+ stp d14, d15, [sp, #48]
+ movi v4.8b, #1
+ ins v4.h[0], w6
+ ins v4.h[1], w7
+ ins v4.h[2], w8
+ ins v4.s[2], w4
+ ins v4.s[3], w5
+ movi v5.16b, #0
+
+ b 2f
+
+ .align 6
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = lut
+ * x4 = pitchy
+ * x5 = pitchz
+ * x6 = offset0
+ * x7 = offset1
+ */
+
+1: ld4 {v0.8b-v3.8b}, [x1], #32
+/* v0,v1,v2,v3 source data
+ * v4 dimensions and pitches
+ */
+ uxtl v0.8h, v0.8b
+ uxtl v1.8h, v1.8b
+ uxtl v2.8h, v2.8b
+ mul v0.8h, v0.8h, v4.h[0]
+ mul v1.8h, v1.8h, v4.h[1]
+ mul v2.8h, v2.8h, v4.h[2]
+
+/* ursra below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero. Strictly this is
+ * correct, except for the illegal access problem.
+ */
+ usra v0.8h, v0.8h, #8
+ usra v1.8h, v1.8h, #8
+ usra v2.8h, v2.8h, #8
+
+ ushr v12.8h, v0.8h, #8
+ ushr v13.8h, v1.8h, #8
+ ushr v14.8h, v2.8h, #8
+ bic v0.8h, #0xff, LSL #8
+ xtn v1.8b, v1.8h
+ bic v2.8h, #0xff, LSL #8
+
+/* v0.8h,v1.8b,v2.8h fractional offset
+ * v12.8h,v13.8h,v14.8h integer offset
+ */
+
+ ushll v6.4s, v12.4h, #2
+ ushll2 v7.4s, v12.8h, #2
+ uxtl v8.4s, v13.4h
+ uxtl2 v9.4s, v13.8h
+ uxtl v10.4s, v14.4h
+ uxtl2 v11.4s, v14.8h
+ mla v6.4s, v8.4s, v4.s[2]
+ mla v7.4s, v9.4s, v4.s[2]
+ mla v6.4s, v10.4s, v4.s[3]
+ mla v7.4s, v11.4s, v4.s[3]
+
+/* v6,v7 list of table offsets */
+
+ /* lanes 0 and 1 */
+ lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
+
+ /* lanes 2 and 3 */
+ lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
+
+ /* lanes 4 and 5 */
+ lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
+
+ /* lanes 6 and 7 */
+ lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
+
+ uzp1 v6.16b, v20.16b, v21.16b
+ uzp2 v7.16b, v20.16b, v21.16b
+ uzp1 v20.16b, v6.16b, v7.16b
+ uzp2 v22.16b, v6.16b, v7.16b
+ mov v21.d[0], v20.d[1]
+ mov v23.8b, v3.8b
+
+ st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
+
+2: subs x2, x2, #8
+ bhs 1b
+ add x0, x2, #8
+ ldp d14, d15, [sp, #48]
+ ldp d12, d13, [sp, #32]
+ ldp d10, d11, [sp, #16]
+ ldp d8, d9, [sp], #64
+ ret
+END(rsdIntrinsic3DLUT_K)
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 202f903..c4a85c2 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -15,6 +15,7 @@
*/
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;
.set FRACTION_BITS, 7
@@ -54,7 +55,7 @@
* q0-q3 -- coefficient table
* x13 = -pitch
* x15 = top-row in
- * x16 = bottom-row in
+ * x19 = bottom-row in
* Output:
* x1 += 16
* q10,q11 -- 16 convolved columns
@@ -82,7 +83,7 @@
umull v12.4s, v14.4h, v0.h[0]
ifcc sub \reg, \reg, x5, LSL #6
umull2 v13.4s, v14.8h, v0.h[0]
- mov x11, x16
+ mov x11, x19
umull v14.4s, v15.4h, v0.h[0]
ifcc add \reg, \reg, x5, LSL #3
umull2 v15.4s, v15.8h, v0.h[0]
@@ -101,7 +102,7 @@
uaddl v16.8h, v10.8b, v11.8b
ifcc cmp x7, #i
uaddl2 v11.8h, v10.16b, v11.16b
- ifcc csel x11, x16, x11, lo
+ ifcc csel x11, x19, x11, lo
umlal v12.4s, v16.4h, v\dreg\doth[\lane]
umlal2 v13.4s, v16.8h, v\dreg\doth[\lane]
// prfm PLDL1KEEP,[x10, #32] // TODO: confirm
@@ -123,7 +124,7 @@
uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS
add x15, x15, #16
uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS
- add x16, x16, #16
+ add x19, x19, #16
uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS
uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
@@ -142,16 +143,16 @@
* more data that won't be used and it means that rotating the window involves
* more mov operations.
*
- * When the buffer gets too big the buffer at [r9] is used.
+ * When the buffer gets too big the buffer at [x9] is used.
*
* Input:
 * q4-q11 -- convolution window
- * r9 -- pointer to additional convolution window data
+ * x9 -- pointer to additional convolution window data
* Output:
- * r9 -- updated buffer pointer (if used)
+ * x9 -- updated buffer pointer (if used)
* d31 -- result to be stored
* Modifies:
- * r12 -- temp buffer pointer
+ * x12 -- temp buffer pointer
* q12-q13 -- temporaries for load and vext operations.
* q14-q15 -- intermediate sums
*/
@@ -160,17 +161,19 @@
umull v14.4s, v9.4h, v0.h[0]
umull2 v15.4s, v9.8h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
- .xword 107f
- .xword 108f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
+ .hword 107f-100b
+ .hword 108f-100b
.align 4
108: umlal v14.4s, v8.4h, v1.h[0]
umlal2 v15.4s, v8.8h, v1.h[0]
@@ -232,25 +235,27 @@
umull v14.4s, v8.4h, v0.h[0]
umull2 v15.4s, v8.8h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
- .xword 107f
- .xword 108f
- .xword 109f
- .xword 110f
- .xword 111f
- .xword 112f
- .xword 113f
- .xword 114f
- .xword 115f
- .xword 116f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
+ .hword 107f-100b
+ .hword 108f-100b
+ .hword 109f-100b
+ .hword 110f-100b
+ .hword 111f-100b
+ .hword 112f-100b
+ .hword 113f-100b
+ .hword 114f-100b
+ .hword 115f-100b
+ .hword 116f-100b
.align 4
116: //ext v12.16b, v6.16b, v7.16b, #0*2
//ext v13.16b, v10.16b, v11.16b, #0*2
@@ -365,34 +370,36 @@
umull v14.4s, v12.4h, v0.h[0]
umull2 v15.4s, v12.8h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
- .xword 107f
- .xword 108f
- .xword 109f
- .xword 110f
- .xword 111f
- .xword 112f
- .xword 113f
- .xword 114f
- .xword 115f
- .xword 116f
- .xword 117f
- .xword 118f
- .xword 119f
- .xword 120f
- .xword 121f
- .xword 122f
- .xword 123f
- .xword 124f
- .xword 125f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
+ .hword 107f-100b
+ .hword 108f-100b
+ .hword 109f-100b
+ .hword 110f-100b
+ .hword 111f-100b
+ .hword 112f-100b
+ .hword 113f-100b
+ .hword 114f-100b
+ .hword 115f-100b
+ .hword 116f-100b
+ .hword 117f-100b
+ .hword 118f-100b
+ .hword 119f-100b
+ .hword 120f-100b
+ .hword 121f-100b
+ .hword 122f-100b
+ .hword 123f-100b
+ .hword 124f-100b
+ .hword 125f-100b
.align 4
125: ext v12.16b, v3.16b, v4.16b, #6*2
ext v13.16b, v10.16b, v11.16b, #0*2
@@ -564,15 +571,17 @@
umull v14.4s, v7.4h, v0.h[0]
umull2 v15.4s, v7.8h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
.align 4
106: umlal v14.4s, v4.4h, v0.h[6]
umlal2 v15.4s, v4.8h, v0.h[6]
@@ -616,21 +625,23 @@
umull v14.4s, v4.4h, v0.h[0]
umull2 v15.4s, v4.8h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
- .xword 107f
- .xword 108f
- .xword 109f
- .xword 110f
- .xword 111f
- .xword 112f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
+ .hword 107f-100b
+ .hword 108f-100b
+ .hword 109f-100b
+ .hword 110f-100b
+ .hword 111f-100b
+ .hword 112f-100b
.align 4
112: add x12, x9, #0x1a0
bic x12, x12, #0x200
@@ -751,34 +762,36 @@
umull v14.4s, v12.4h, v0.h[0]
umull v15.4s, v13.4h, v0.h[0]
- adr x12, 199f-8
- ldr x12, [x12, x5, LSL #3]
+ adr x16, 100f
+ ldrsh x12, [x16, x5, LSL #1]
+ add x12, x12, x16
br x12
- 199: .xword 101f
- .xword 102f
- .xword 103f
- .xword 104f
- .xword 105f
- .xword 106f
- .xword 107f
- .xword 108f
- .xword 109f
- .xword 110f
- .xword 111f
- .xword 112f
- .xword 113f
- .xword 114f
- .xword 115f
- .xword 116f
- .xword 117f
- .xword 118f
- .xword 119f
- .xword 120f
- .xword 121f
- .xword 122f
- .xword 123f
- .xword 124f
- .xword 125f
+ 100: .hword -4
+ .hword 101f-100b
+ .hword 102f-100b
+ .hword 103f-100b
+ .hword 104f-100b
+ .hword 105f-100b
+ .hword 106f-100b
+ .hword 107f-100b
+ .hword 108f-100b
+ .hword 109f-100b
+ .hword 110f-100b
+ .hword 111f-100b
+ .hword 112f-100b
+ .hword 113f-100b
+ .hword 114f-100b
+ .hword 115f-100b
+ .hword 116f-100b
+ .hword 117f-100b
+ .hword 118f-100b
+ .hword 119f-100b
+ .hword 120f-100b
+ .hword 121f-100b
+ .hword 122f-100b
+ .hword 123f-100b
+ .hword 124f-100b
+ .hword 125f-100b
.align 4
125: add x12, x9, #0x0d0
bic x12, x12, #0x200
@@ -1043,7 +1056,7 @@
/* Dedicated function wrapper for the fetch macro, for the cases where
* performance isn't that important, to keep code size down.
*/
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
stp x10, x11, [sp, #-16]!
fetch
ldp x10, x11, [sp], #16
@@ -1055,10 +1068,10 @@
* hand edge of the window when starting too close to the right hand edge of
* the image.
*/
-ENTRY(prefetch_clamp1)
+PRIVATE(prefetch_clamp1)
sub x11, xzr, x11
sub x15, x15, x1
- sub x16, x16, x1
+ sub x19, x19, x1
tbz x11, #3, 1f
mov v11.16b, v10.16b
sub x1, x1, #16
@@ -1084,14 +1097,14 @@
mov v11.16b, v12.16b
1: sub x11, xzr, x11
add x15, x15, x1
- add x16, x16, x1
+ add x19, x19, x1
ret
END(prefetch_clamp1)
-ENTRY(prefetch_clamp4)
+PRIVATE(prefetch_clamp4)
sub x11, xzr, x11
sub x15, x15, x1
- sub x16, x16, x1
+ sub x19, x19, x1
tbz x11, #3, 1f
sub x1, x1, #16 // what's this?
mov v11.16b, v10.16b
@@ -1105,7 +1118,7 @@
mov v11.16b, v12.16b
1: sub x11, xzr, x11
add x15, x15, x1
- add x16, x16, x1
+ add x19, x19, x1
ret
END(prefetch_clamp4)
@@ -1174,7 +1187,7 @@
* x9 -- buffer (if needed)
* x13 = -pitch
* x15 = top-row in
- * x16 = bottom-row in
+ * x19 = bottom-row in
* Output:
* x1 += rlf + min(count, rrt)
* Modifies:
@@ -1221,11 +1234,11 @@
.endif
1: sub x1, x1, x10
sub x15, x15, x10
- sub x16, x16, x10
+ sub x19, x19, x10
bic x10, x10, #15
add x1, x1, x10
add x15, x15, x10
- add x16, x16, x10
+ add x19, x19, x10
2:
.if \step > 1
/* it's only in the uchar2 and uchar4 cases where the register file
@@ -1276,7 +1289,7 @@
* x9 = buffer
* x13 = -pitch
* x15 = top-row in
- * x16 = bottom-row in
+ * x19 = bottom-row in
* Modifies
* x8 = fetch code pointer
*/
@@ -1324,10 +1337,10 @@
1: sub x1, x1, #16
sub x15, x15, #16
- sub x16, x16, #16
+ sub x19, x19, #16
add x1, x1, x4
add x15, x15, x4
- add x16, x16, x4
+ add x19, x19, x4
bl fetch_generic_asm
.if \step==1
@@ -1373,7 +1386,7 @@
.endm
.irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
stp x29,x30, [sp, #-16]!
prefetch step=1, max_r=\r
@@ -1386,7 +1399,7 @@
.endr
.irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
sub x12, sp, #0x200
bic x9, x12, #0x3fc
mov sp, x9
@@ -1421,17 +1434,13 @@
* uint16_t *tab); // [sp,#8]
*/
ENTRY(rsdIntrinsicBlurU1_K)
- stp x16,x30, [sp, #-80]!
- stp x14,x15, [sp, #16]
- stp x12,x13, [sp, #32]
- stp x10,x11, [sp, #48]
- stp x8,x9, [sp, #64]
+ stp x19,x30, [sp, #-16]!
sub x8, sp, #32
sub sp, sp, #64
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x8]
mov x8, x5 // x
- ldr w5, [sp,#144] // r
+ ldr w5, [sp,#80] // r
sub x9, x2, x8
sub x10, x3, x6
mov x2, x4 // pitch
@@ -1439,7 +1448,7 @@
sub x7, x10, #1
sub x9, x9, x3
- ldr x12, [sp, #152] // tab
+ ldr x12, [sp, #88] // tab
add x0, x0, x8
add x1, x1, x8
@@ -1460,7 +1469,7 @@
sub x13, xzr, x2
msub x15, x2, x6, x1
- madd x16, x2, x7, x1
+ madd x19, x2, x7, x1
ld1 {v0.8h,v1.8h}, [x12], #32
ld1 {v2.8h,v3.8h}, [x12], #32
@@ -1474,11 +1483,7 @@
1: ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
- ldp x8,x9, [sp, #64]
- ldp x10,x11, [sp, #48]
- ldp x12,x13, [sp, #32]
- ldp x14,x15, [sp, #16]
- ldp x12,x30, [sp], #80
+ ldp x19,x30, [sp], #16
ret
END(rsdIntrinsicBlurU1_K)
@@ -1495,17 +1500,13 @@
* uint16_t *tab); // [sp,#8]
*/
ENTRY(rsdIntrinsicBlurU4_K)
- stp x16,x30, [sp, #-80]!
- stp x14,x15, [sp, #16]
- stp x12,x13, [sp, #32]
- stp x10,x11, [sp, #48]
- stp x8,x9, [sp, #64]
+ stp x19,x30, [sp, #-16]!
sub x8, sp, #32
sub sp, sp, #64
st1 {v8.1d - v11.1d}, [sp]
st1 {v12.1d - v15.1d}, [x8]
mov x8, x5 // x
- ldr w5, [sp,#144] // r
+ ldr w5, [sp,#80] // r
sub x9, x2, x8
sub x10, x3, x6
mov x2, x4 // pitch
@@ -1513,7 +1514,7 @@
sub x7, x10, #1
sub x9, x9, x3
- ldr x12, [sp, #152]
+ ldr x12, [sp, #88]
add x0, x0, x8, LSL #2
add x1, x1, x8, LSL #2
@@ -1535,7 +1536,7 @@
sub x13, xzr, x2
msub x15, x2, x6, x1
- madd x16, x2, x7, x1
+ madd x19, x2, x7, x1
ld1 {v0.8h,v1.8h}, [x12], #32
ld1 {v2.8h,v3.8h}, [x12], #32
@@ -1549,10 +1550,6 @@
1: ld1 {v8.1d - v11.1d}, [sp], #32
ld1 {v12.1d - v15.1d}, [sp], #32
- ldp x8,x9, [sp, #64]
- ldp x10,x11, [sp, #48]
- ldp x12,x13, [sp, #32]
- ldp x14,x15, [sp, #16]
- ldp x12,x30, [sp], #80
+ ldp x19,x30, [sp], #16
ret
END(rsdIntrinsicBlurU4_K)
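
The pattern of the Blur changes above is worth spelling out: the old jump tables stored absolute .xword addresses, which need load-time relocations inside the text section and so defeat position independence, whereas the new tables store signed 16-bit .hword distances from a local anchor label and rebuild the target with adr/ldrsh/add at run time. The move from x16 to x19 (with the matching save and restore of x19/x30) is presumably because x16 is an intra-procedure-call scratch register that linker-inserted veneers may clobber across the bl calls in these routines, while x19 is callee-saved. A rough C++ analogue of the offset-table idea (types and names are illustrative, not from the source):

    #include <cstdint>

    struct OffsetTable {
        int16_t   offsets[8];  // distance of each handler from 'anchor'
        uintptr_t anchor;      // address the offsets are measured from
        uintptr_t target(int i) const { return anchor + offsets[i]; }
    };
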
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
new file mode 100644
index 0000000..7a6d4c5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -0,0 +1,838 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro vmxx_f32 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1)
+ fmla \opd, \opa, \opb
+ .else
+ fmul \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1)
+ fadd \opd, \opa, \opb
+ .else
+ mov \stupidsyntax1, \stupidsyntax2
+ .endif
+ .endif
+.endm
+
+.macro vmxx_s16 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16)
+ smlal \opd, \opa, \opb
+ .else
+ smull \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vmxx2_s16 i, mask, opd, opa, opb
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16)
+ smlal2 \opd, \opa, \opb
+ .else
+ smull2 \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
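+
+/* The function table built by rsdIntrinsicColorMatrixSetup_*_K chains these
+ * fragments with tail branches: the load routine (x9) branches to column 0
+ * (x4), each column branches to the next (x5, x6, x7), column 3 branches to
+ * the store routine (x8), and the store routine decrements the count and
+ * branches back to the load routine until fewer than eight elements remain.
+ * Column entries whose coefficients are all zero are left NULL by the first
+ * setup pass and then overwritten with the following non-NULL entry (or the
+ * store routine), so unused columns are skipped entirely.
+ */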
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+.align 6
+colormatrix_int_col0_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5
+
+colormatrix_int_col0_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5
+
+.align 6
+colormatrix_int_col1_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6
+
+colormatrix_int_col1_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6
+
+.align 6
+colormatrix_int_col2_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7
+
+colormatrix_int_col2_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7
+
+.align 6
+colormatrix_int_col3_\i:
+ .if \i & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8
+
+colormatrix_int_col3_n\i:
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8
+
+.align 5
+colormatrix_float_col0_\i:
+ vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0]
+ vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
+ vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0]
+ vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5
+
+.align 4
+colormatrix_float_col0_n\i:
+ vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
+ vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5
+
+.align 5
+colormatrix_float_col1_\i:
+ vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6
+
+.align 4
+colormatrix_float_col1_n\i:
+ vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6
+
+.align 5
+colormatrix_float_col2_\i:
+ vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7
+
+.align 4
+colormatrix_float_col2_n\i:
+ vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7
+
+.align 5
+colormatrix_float_col3_\i:
+ vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8
+
+.align 4
+colormatrix_float_col3_n\i:
+ vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8
+
+.endr
+
+.align 6
+colormatrix_float_ldu4:
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v23.8h, v23.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl v15.4s, v23.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ uxtl2 v23.4s, v23.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v15.4s, v15.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ br x4
+
+.align 5
+colormatrix_int_ldu4:
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ uxtl v15.8h, v15.8b
+ br x4
+
+.align 6
+colormatrix_float_ldu3:
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ br x4
+
+colormatrix_int_ldu3:
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ br x4
+
+.align 5
+colormatrix_float_ldu1:
+ ld1 {v20.8b}, [x1], #8
+ uxtl v20.8h, v20.8b
+ uxtl v12.4s, v20.4h
+ uxtl2 v20.4s, v20.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v20.4s, v20.4s
+ br x4
+
+.align 6
+colormatrix_float_ldu2:
+ ld2 {v20.8b,v21.8b}, [x1], #16
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ br x4
+
+.align 4
+colormatrix_int_ldu2:
+ ld2 {v12.8b,v13.8b}, [x1], #16
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ br x4
+
+.align 6
+colormatrix_float_stu4:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v27.4s, v11.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ fcvtzs v31.4s, v19.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun v27.4h, v27.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ sqrshrun2 v27.8h, v31.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ uqxtn v27.8b, v27.8h
+ subs x2, x2, #8
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_int_stu4:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ uqxtn v15.8b, v11.8h
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu3:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ movi v27.8b, #0
+ subs x2, x2, #8
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+.align 4
+colormatrix_int_ldu1:
+ ld1 {v12.8b}, [x1], #8
+ uxtl v12.8h, v12.8b
+ br x4
+
+.align 5
+colormatrix_int_stu3:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ movi v15.8b, #0
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu2:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ subs x2, x2, #8
+ st2 {v24.8b,v25.8b}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_int_stu2:
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ subs x2, x2, #8
+ st2 {v12.8b,v13.8b}, [x0], #16
+ blo colormatrix_int_end
+ br x9
+
+.align 5
+colormatrix_int_stu1:
+ uqxtn v12.8b, v8.8h
+ subs x2, x2, #8
+ st1 {v12.8b}, [x0], #8
+ blo colormatrix_int_end
+ br x9
+
+colormatrix_float_ldf3:
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+ br x4
+
+.align 6
+colormatrix_float_stu1:
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ uqxtn v24.8b, v24.8h
+ subs x2, x2, #8
+ st1 {v24.8b}, [x0], #8
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_stf3:
+ movi v11.16b, #0
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ movi v19.16b, #0
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_float_stf4:
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf4:
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+ br x4
+
+.align 5
+colormatrix_float_stf2:
+ st2 {v8.4s, v9.4s}, [x0], #32
+ subs x2, x2, #8
+ st2 {v16.4s, v17.4s}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf2:
+ ld2 {v12.4s,v13.4s}, [x1], #32
+ ld2 {v20.4s,v21.4s}, [x1], #32
+ br x4
+
+.align 5
+colormatrix_float_stf1:
+ st1 {v8.4s}, [x0], #16
+ subs x2, x2, #8
+ st1 {v16.4s}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf1:
+ ld1 {v12.4s}, [x1], #16
+ ld1 {v20.4s}, [x1], #16
+ br x4
+
+
+/* size_t rsdIntrinsicColorMatrix_int_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * int16_t const *mult, // x4
+ * int32_t const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_int_K)
+ stp x8,x9, [sp, #-16]!
+ sub x7, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp]
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.8h,v1.8h}, [x4], #32
+ ld1 {v4.4s}, [x5], #16
+
+ ldp x4,x5, [x3],#16
+ ldp x6,x7, [x3],#16
+ ldp x8,x9, [x3],#16
+
+ dup v12.4s, v4.s[0]
+ dup v13.4s, v4.s[1]
+ dup v14.4s, v4.s[2]
+ dup v15.4s, v4.s[3]
+ sqshrun v8.4h, v12.4s, #8
+ sqshrun2 v8.8h, v12.4s, #8
+ sqshrun v9.4h, v13.4s, #8
+ sqshrun2 v9.8h, v13.4s, #8
+ sqshrun v10.4h, v14.4s, #8
+ sqshrun2 v10.8h, v14.4s, #8
+ sqshrun v11.4h, v15.4s, #8
+ sqshrun2 v11.8h, v15.4s, #8
+
+ subs x2, x2, #8
+ blo colormatrix_int_end
+ br x9
+
+colormatrix_int_end:
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ldp x8,x9, [sp], #16
+ add x0, x2, #8
+ ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ * fntab_t const *fns, // x0
+ * uint32_t mask, // x1
+ * int dt, // x2
+ * int st); // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
+ adr x4, 2f
+ ldrsh x2, [x4, x2, LSL #1]
+ add x2, x2, x4
+ adr x4, 3f
+ ldrsh x3, [x4, x3, LSL #1]
+ add x3, x3, x4
+ stp x2, x3, [x0, #32]
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+ mov x3, #4
+ adr x4, 4f
+1: ands x2, x1, #15
+ beq 9f
+ and x2, x1, #31
+ lsl x2, x2, #3
+ ldrsh x2, [x4, x2]
+ add x2, x2, x4
+9: str x2, [x0], #8
+ lsr x1, x1, #5
+ add x4, x4, #2
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+ ldr x2, [x0]
+ mov x3, #4
+1: ldr x1, [x0, #-8]!
+ cmp x1, #0
+ csel x2, x1, x2, ne
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+ .align 4
+2: .hword colormatrix_int_stu1-2b
+ .hword colormatrix_int_stu2-2b
+ .hword colormatrix_int_stu3-2b
+ .hword colormatrix_int_stu4-2b
+3: .hword colormatrix_int_ldu1-3b
+ .hword colormatrix_int_ldu2-3b
+ .hword colormatrix_int_ldu3-3b
+ .hword colormatrix_int_ldu4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .hword colormatrix_int_col0_\i-4b
+ .hword colormatrix_int_col1_\i-4b-2
+ .hword colormatrix_int_col2_\i-4b-4
+ .hword colormatrix_int_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .hword colormatrix_int_col0_n\i-4b
+ .hword colormatrix_int_col1_n\i-4b-2
+ .hword colormatrix_int_col2_n\i-4b-4
+ .hword colormatrix_int_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_int_K)
+
+
+/* size_t rsdIntrinsicColorMatrix_float_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * float const *mult, // x4
+ * float const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_float_K)
+ stp x8,x9, [sp, #-16]!
+ sub x7, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp]
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
+ ld1r {v4.4s}, [x5], #4
+ ld1r {v5.4s}, [x5], #4
+ ld1r {v6.4s}, [x5], #4
+ ld1r {v7.4s}, [x5], #4
+
+ ldp x4,x5, [x3], #16
+ ldp x6,x7, [x3], #16
+ ldp x8,x9, [x3], #16
+
+ mov v8.16b, v4.16b
+ mov v9.16b, v5.16b
+ mov v10.16b, v6.16b
+ mov v11.16b, v7.16b
+
+ mov v16.16b, v4.16b
+ mov v17.16b, v5.16b
+ mov v18.16b, v6.16b
+ mov v19.16b, v7.16b
+
+ subs x2, x2, #8
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_end:
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ldp x8,x9, [sp], #16
+ add x0, x2, #8
+ ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ * fntab_t const *fns, // x0
+ * uint32_t mask, // x1
+ * int dt, // x2
+ * int st); // x3
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
+ adr x4, 2f
+ ldrsh x2, [x4, x2, LSL #1]
+ add x2, x2, x4
+ adr x4, 3f
+ ldrsh x3, [x4, x3, LSL #1]
+ add x3, x3, x4
+ stp x2, x3, [x0, #32]
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+ mov x3, #4
+ adr x4, 4f
+1: ands x2, x1, #15
+ beq 9f
+ and x2, x1, #31
+ lsl x2, x2, #3
+ ldrsh x2, [x4, x2]
+ add x2, x2, x4
+9: str x2, [x0], #8
+ lsr x1, x1, #5
+ add x4, x4, #2
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+ ldr x2, [x0]
+ mov x3, #4
+1: ldr x1, [x0, #-8]!
+ cmp x1, #0
+ csel x2, x1, x2, ne
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+ .align 4
+2: .hword colormatrix_float_stu1-2b
+ .hword colormatrix_float_stu2-2b
+ .hword colormatrix_float_stu3-2b
+ .hword colormatrix_float_stu4-2b
+ .hword colormatrix_float_stf1-2b
+ .hword colormatrix_float_stf2-2b
+ .hword colormatrix_float_stf3-2b
+ .hword colormatrix_float_stf4-2b
+3: .hword colormatrix_float_ldu1-3b
+ .hword colormatrix_float_ldu2-3b
+ .hword colormatrix_float_ldu3-3b
+ .hword colormatrix_float_ldu4-3b
+ .hword colormatrix_float_ldf1-3b
+ .hword colormatrix_float_ldf2-3b
+ .hword colormatrix_float_ldf3-3b
+ .hword colormatrix_float_ldf4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .hword colormatrix_float_col0_\i-4b
+ .hword colormatrix_float_col1_\i-4b-2
+ .hword colormatrix_float_col2_\i-4b-4
+ .hword colormatrix_float_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .hword colormatrix_float_col0_n\i-4b
+ .hword colormatrix_float_col1_n\i-4b-2
+ .hword colormatrix_float_col2_n\i-4b-4
+ .hword colormatrix_float_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_float_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 0092d0e..ee10884 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -285,211 +285,3 @@
bx lr
END(rsdIntrinsicConvolve5x5_K)
-
-
-/* 3D LUT */
-
-/*
- r0 = dst
- r1 = src
- r2 = cube base pointer
- r3 = cube Y stride
- r4 = cube Z stride
- r5 = count
- xr10 = * constants
-
- d0 / q0 = weight 1 p1
- d1 = weight 2 p1
-
- d2 / q1 = weight 1 p2
- d3 = weight 2 p2
-
- d4 / q2 = src1
- d5 = src2
-
- d6 / q3 = baseCoord
- d7 = baseCoord
-
- d8 / q4 = coord1 p1
- d9 =
-
- d10 / q5 = coord1 p2
- d11 =
-
- d12 / q6 =
- d13 =
-
- d14 / q7 =
- d15 =
-
-
- d16 / q8 = x0 y0 z0
- d17 = x1 y0 z0
- d18 / q9 = x0 y1 z0
- d19 = x1 y1 z0
- d20 / q10 = x0 y0 z1
- d21 = x1 y0 z1
- d22 / q11 = x0 y1 z1
- d23 = x1 y1 z1
-
- d24 / q12 = alpha mash
- d25 = current pixel alpha
- d26 / q13 = 4, y stride
- d27 = z stride, 0
- d28 / q14 = 0x8000
- d29 = 0x7fff
- d30 / q15 = 0, 0, 0, 0xffff
-
-
- d31 = coordMult
-*/
-
-ENTRY(rsdIntrinsic3DLUT_K)
- push {r4-r8, r10, r11, lr}
- vpush {q4-q7}
-
- /* load Z stride in r4 */
- ldr r4, [sp, #32 + 64]
-
- /* Load count */
- ldr r5, [sp, #36 + 64]
-
- vmov.u16 d28, #0x8000
- vmov.u16 d29, #0x7fff
- vmov.u32 d24, #0xff000000
-
- /* load constants using r10 */
- ldr r10, [sp, #40 + 64]
- vld1.32 {d31}, [r10]!
- vld1.32 {d30}, [r10]!
-
- mov r6, #4
- vmov d26, r6, r3
- mov r6, #0
- vmov d27, r4, r6
-
- add r8, r3, r4
-
-
-
-1:
- vld1.8 {d4}, [r1]!
- vand.u8 d25, d4, d24
- vmovl.u8 q2, d4
-
-
- vmull.u16 q3, d4, d31
- vshr.u32 q4, q3, #15 // coord1 p1
- vmovn.u32 d1, q3
- vand.u16 d1, d29 // weight 2
- vsub.u16 d0, d28, d1 // weight 1
- vmul.u32 q4, q4, q13 // q4 = x*4, y*ystride, z*zstride, 0
-
- vmull.u16 q3, d5, d31
- vshr.u32 q5, q3, #15 // coord1 p2
- vmovn.u32 d3, q3
- vand.u16 d3, d29 // weight 2
- vsub.u16 d2, d28, d3 // weight 1
- vmul.u32 q5, q5, q13 // q5 = x*4, y*ystride, z*zstride, 0
-
- vpadd.u32 d8, d8, d9
- vpadd.u32 d9, d10, d11
- vpadd.u32 d8, d8, d9
- vmov r6, r7, d8 // base pointers
-
- add r6, r6, r2
- add r7, r7, r2
-
- vld1.8 {d16}, [r6]
- add r11, r6, r3
- vld1.8 {d18}, [r11]
- add r11, r6, r4
- vld1.8 {d20}, [r11]
- add r11, r6, r8
- vld1.8 {d22}, [r11]
-
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
- vmull.u16 q6, d16, d0[0]
- vmlal.u16 q6, d17, d1[0]
- vshrn.u32 d16, q6, #7
- vmull.u16 q6, d18, d0[0]
- vmlal.u16 q6, d19, d1[0]
- vshrn.u32 d18, q6, #7
- vmull.u16 q6, d20, d0[0]
- vmlal.u16 q6, d21, d1[0]
- vshrn.u32 d20, q6, #7
- vmull.u16 q6, d22, d0[0]
- vmlal.u16 q6, d23, d1[0]
- vshrn.u32 d22, q6, #7
-
- vmull.u16 q6, d16, d0[1]
- vmlal.u16 q6, d18, d1[1]
- vshrn.u32 d16, q6, #15
- vmull.u16 q6, d20, d0[1]
- vmlal.u16 q6, d22, d1[1]
- vshrn.u32 d18, q6, #15
-
- vmull.u16 q6, d16, d0[2]
- vmlal.u16 q6, d18, d1[2]
- vshrn.u32 d14, q6, #15
-
-
- vld1.8 {d16}, [r7]
- add r11, r7, r3
- vld1.8 {d18}, [r11]
- add r11, r7, r4
- vld1.8 {d20}, [r11]
- add r11, r7, r8
- vld1.8 {d22}, [r11]
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
- vmull.u16 q6, d16, d2[0]
- vmlal.u16 q6, d17, d3[0]
- vshrn.u32 d16, q6, #7
- vmull.u16 q6, d18, d2[0]
- vmlal.u16 q6, d19, d3[0]
- vshrn.u32 d18, q6, #7
- vmull.u16 q6, d20, d2[0]
- vmlal.u16 q6, d21, d3[0]
- vshrn.u32 d20, q6, #7
- vmull.u16 q6, d22, d2[0]
- vmlal.u16 q6, d23, d3[0]
- vshrn.u32 d22, q6, #7
-
- vmull.u16 q6, d16, d2[1]
- vmlal.u16 q6, d18, d3[1]
- vshrn.u32 d16, q6, #15
- vmull.u16 q6, d20, d2[1]
- vmlal.u16 q6, d22, d3[1]
- vshrn.u32 d18, q6, #15
-
- vmull.u16 q6, d16, d2[2]
- vmlal.u16 q6, d18, d3[2]
- vshrn.u32 d15, q6, #15
-
- vrshrn.u16 d14, q7, #8
-
- vbic.u8 d14, d14, d24 // mix in alpha
- vorr.u8 d14, d14, d25
- vst1.32 {d14}, [r0]!
-
-
- /* Are we done? */
- subs r5, r5, #1
- bne 1b
-
- /* Yup, bye */
- vpop {q4-q7}
- pop {r4-r8, r10, r11, lr}
- bx lr
-
-END(rsdIntrinsic3DLUT_K)
-
-
diff --git a/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
new file mode 100644
index 0000000..597154b
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_3DLUT.S
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+.eabi_attribute 25,1 @Tag_ABI_align8_preserved
+.arm
+
+.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
+
+ vmov.s32 r6, r7, \src
+
+ add r6, r6, r3
+ add r7, r7, r3
+
+ vld1.u8 d16, [r6], r4
+ vld1.u8 d17, [r7], r4
+
+ vld1.u8 d18, [r6], r5
+ vld1.u8 d19, [r7], r5
+
+ vdup.u8 d6, \yr0
+ vdup.u8 d7, \yr1
+ /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
+ vshll.u8 q12, d16, #8
+ vshll.u8 q13, d17, #8
+ vmlsl.u8 q12, d16, d6
+ vmlsl.u8 q13, d17, d7
+ vmlal.u8 q12, d18, d6
+ vmlal.u8 q13, d19, d7
+
+ vld1.u8 d18, [r6]
+ vld1.u8 d19, [r7]
+
+ sub r6, r6, r4
+ sub r7, r7, r4
+
+ vld1.u8 d16, [r6]
+ vld1.u8 d17, [r7]
+
+ /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q15, d17, #8
+ vmlsl.u8 q14, d16, d6
+ vmlsl.u8 q15, d17, d7
+ vmlal.u8 q14, d18, d6
+ vmlal.u8 q15, d19, d7
+
+ /* Z interpolate, lane 0 q12/q14 -> q10 */
+ vshll.u16 q8, d24, #8
+ vshll.u16 q9, d25, #8
+ vmlsl.u16 q8, d24, \zr0
+ vmlsl.u16 q9, d25, \zr0
+ vmlal.u16 q8, d28, \zr0
+ vmlal.u16 q9, d29, \zr0
+ vrshrn.u32 d20, q8, #8
+ vrshrn.u32 d21, q9, #8
+
+ /* Z interpolate, lane 1 q13/q15 -> q11 */
+ vshll.u16 q8, d26, #8
+ vshll.u16 q9, d27, #8
+ vmlsl.u16 q8, d26, \zr1
+ vmlsl.u16 q9, d27, \zr1
+ vmlal.u16 q8, d30, \zr1
+ vmlal.u16 q9, d31, \zr1
+ vrshrn.u32 d22, q8, #8
+ vrshrn.u32 d23, q9, #8
+
+ /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
+ vshll.u16 q8, d20, #8
+ vshll.u16 q9, d22, #8
+ vmlsl.u16 q8, d20, \xr0
+ vmlsl.u16 q9, d22, \xr1
+ vmlal.u16 q8, d21, \xr0
+ vmlal.u16 q9, d23, \xr1
+ vshrn.u32 d28, q8, #8
+ vshrn.u32 d29, q9, #8
+
+ /* pack lanes 0-1 -> d12 */
+ vqrshrn.u16 \dst, q14, #8
+.endm
+
+/* size_t rsdIntrinsic3DLUT_K(
+ * void *dst, // r0
+ * void const *in, // r1
+ * size_t count, // r2
+ * void const *lut, // r3
+ * int32_t pitchy, // [sp]
+ * int32_t pitchz, // [sp+#4]
+ * int dimx, // [sp+#8]
+ * int dimy, // [sp+#12]
+ * int dimz); // [sp+#16]
+ */
+ENTRY(rsdIntrinsic3DLUT_K)
+ push {r4,r5,r6,r7}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+ ldr r6, [sp, #24]
+ ldr r7, [sp, #28]
+ ldr r12, [sp, #32]
+ vpush {d8-d15}
+
+ vmov.u8 d8, #1
+ vmov.u16 d8[0], r6
+ vmov.u16 d8[1], r7
+ vmov.u16 d8[2], r12
+ vmov.s32 d9, r4, r5
+
+ b 2f
+
+ .align 6
+/* r0 = dst
+ * r1 = src
+ * r2 = count
+ * r3 = lut
+ * r4 = pitchy
+ * r5 = pitchz
+ * r6 = offset0
+ * r7 = offset1
+ */
+
+1: vld4.u8 {d0,d2,d4,d6}, [r1]!
+ vmov d10, d6
+/* q0,q1,q2,q5 source data
+ * q4 dimensions and pitches
+ * q3, scratch register for scalar access
+ */
+ vmov q3, q4
+ vmovl.u8 q0, d0
+ vmovl.u8 q1, d2
+ vmovl.u8 q2, d4
+ vmul.u16 q0, q0, d6[0]
+ vmul.u16 q1, q1, d6[1]
+ vmul.u16 q2, q2, d6[2]
+
+/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
+ * where we try to read from the limit of the array and the limit +1 to
+ * interpolate, even though the fractional component is zero. Strictly this is
+ * correct, except for the illegal access problem.
+ */
+ vsra.u16 q0, q0, #8
+ vsra.u16 q1, q1, #8
+ vsra.u16 q2, q2, #8
+
+ vshr.u16 q12, q0, #8
+ vshr.u16 q13, q1, #8
+ vshr.u16 q14, q2, #8
+
+ vbic.u16 q0, #0xff00
+ vmovn.u16 d2, q1
+ vbic.u16 q2, #0xff00
+
+/* q0,d2,q2 fractional offset
+ * q12,q13,q14 integer offset
+ */
+
+ vshll.u16 q6, d24, #2
+ vshll.u16 q7, d25, #2
+ vmovl.u16 q8, d26
+ vmovl.u16 q9, d27
+ vmovl.u16 q10, d28
+ vmovl.u16 q11, d29
+ vmla.s32 q6, q8, d9[0]
+ vmla.s32 q7, q9, d9[0]
+ vmla.s32 q6, q10, d9[1]
+ vmla.s32 q7, q11, d9[1]
+
+/* q6,q7 list of table offsets */
+
+ /* lanes 0 and 1 */
+ lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
+
+ /* lanes 2 and 3 */
+ lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
+
+ /* lanes 4 and 5 */
+ lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
+
+ /* lanes 6 and 7 */
+ lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
+
+ vuzp.u8 d12, d13
+ vuzp.u8 d14, d15
+ vuzp.u8 d12, d14
+ vuzp.u8 d13, d15
+
+ vmov.u8 d15, d10
+
+ vst4.u8 {d12,d13,d14,d15}, [r0]!
+
+2: subs r2, #8
+ bhs 1b
+ add r0, r2, #8
+ vpop {d8-d15}
+ pop {r4,r5,r6,r7}
+ bx lr
+END(rsdIntrinsic3DLUT_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 249ac58..34162ee 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -29,10 +29,10 @@
vpush {q4-q7}
vld1.16 {q2}, [r2]!
vld1.16 {q3}, [r2]!
- vld1.32 {q4}, [r2]!
- vld1.32 {q5}, [r2]!
- vld1.32 {q6}, [r2]!
- vld1.32 {q7}, [r2]!
+ vld1.32 {d8[],d9[]}, [r2]!
+ vld1.32 {d10[],d11[]}, [r2]!
+ vld1.32 {d12[],d13[]}, [r2]!
+ vld1.32 {d14[],d15[]}, [r2]!
veor q0, q0
veor q1, q1
veor q9, q9
@@ -43,15 +43,15 @@
SNIP_START(_N_ColorMatrix_prefix_f)
stmfd sp!, {r4, lr}
vpush {q4-q7}
- add r2, #96
+ add r2, #48
vld1.32 {q4}, [r2]!
vld1.32 {q5}, [r2]!
vld1.32 {q6}, [r2]!
vld1.32 {q7}, [r2]!
- vld1.32 {q8}, [r2]!
- vld1.32 {q9}, [r2]!
- vld1.32 {q10}, [r2]!
- vld1.32 {q11}, [r2]!
+ vld1.32 {d16[],d17[]}, [r2]!
+ vld1.32 {d18[],d19[]}, [r2]!
+ vld1.32 {d20[],d21[]}, [r2]!
+ vld1.32 {d22[],d23[]}, [r2]!
veor q1, q1
veor q2, q2
veor q3, q3
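
A note on the two NEON prefix changes above: the add/bias constants shrank from one replicated row per channel (int ipa[16] / float tmpFpa[16]) to a single value per channel (ipa[4] / tmpFpa[4]), so they are now loaded with the replicating ld1 {dN[],dM[]} forms, and the float prefix now advances r2 by 48 bytes rather than 96, matching the smaller gap between ip and tmpFp (32 + 16 bytes instead of 32 + 64). A sketch of the constant block the generated code indexes, using the field names from the C++ diff (the struct itself is illustrative; in the source these are consecutive class members):

    #include <cstddef>
    #include <cstdint>

    struct CmConstants {
        int16_t ip[16];     // offset  0: 32 bytes of integer coefficients
        int32_t ipa[4];     // offset 32: 16 bytes (was int[16] == 64 bytes)
        float   tmpFp[16];  // offset 48: float coefficients
        float   tmpFpa[4];  // float add values
    };
    static_assert(offsetof(CmConstants, tmpFp) == 48,
                  "float constants start 48 bytes after ip");
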