Add AArch64 assembly for ColorMatrix.
Change-Id: I2fcc57aceea08243d5132287f6de053b846c5fe7
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 9cb4847..5699777 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -42,6 +42,7 @@
#LOCAL_SRC_FILES_arm64 += \
# rsCpuIntrinsics_advsimd_Blend.S \
# rsCpuIntrinsics_advsimd_Blur.S \
+# rsCpuIntrinsics_advsimd_ColorMatrix.S \
# rsCpuIntrinsics_advsimd_YuvToRGB.S
ifeq ($(ARCH_ARM_HAVE_NEON),true)
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 87db9ba..5d4241a 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -125,6 +125,32 @@
} u;
} Key_t;
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD) // AArch64 hand-written kernels
+typedef struct { // jump table shared with the assembly; field order is its memory layout
+ void (*column[4])(void); // one stage per output column, loaded first by the kernels
+ void (*store)(void); // store stage; the Setup routines write it at byte offset 32
+ void (*load)(void); // load stage; the Setup routines write it at byte offset 40
+} FunctionTab_t;
+
+extern "C" size_t rsdIntrinsicColorMatrix_int_K( // returns the number of pixels left unprocessed
+ void *out, void const *in, size_t count, // count is in pixels; handled 8 at a time
+ FunctionTab_t const *fns, // table previously filled by rsdIntrinsicColorMatrixSetup_int_K
+ int16_t const *mult, int32_t const *add); // 16 coefficients and 4 addends are read
+
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K( // fills *fns for the integer kernel
+ FunctionTab_t *fns, // non-const: the assembly stores all six entries through this pointer
+ uint32_t mask, int dt, int st); // mask: 5 bits per column; dt/st: store/load table indices
+
+extern "C" size_t rsdIntrinsicColorMatrix_float_K( // returns the number of pixels left unprocessed
+ void *out, void const *in, size_t count, // count is in pixels; handled 8 at a time
+ FunctionTab_t const *fns, // table previously filled by rsdIntrinsicColorMatrixSetup_float_K
+ float const *mult, float const *add); // 16 coefficients and 4 addends are read
+
+extern "C" void rsdIntrinsicColorMatrixSetup_float_K( // fills *fns for the float kernel
+ FunctionTab_t *fns, // non-const: the assembly stores all six entries through this pointer
+ uint32_t mask, int dt, int st); // mask: 5 bits per column; dt/st: store/load table indices
+#endif
+
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
virtual void populateScript(Script *);
@@ -146,9 +172,12 @@
// The following four fields are read as constants
// by the SIMD assembly code.
short ip[16];
- int ipa[16];
+ int ipa[4];
float tmpFp[16];
- float tmpFpa[16];
+ float tmpFpa[4];
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ FunctionTab_t mFnTab;
+#endif
static void kernel(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
@@ -212,9 +241,9 @@
}
}
if (ipa[0] != 0) key.u.addMask |= 0x1;
- if (ipa[4] != 0) key.u.addMask |= 0x2;
- if (ipa[8] != 0) key.u.addMask |= 0x4;
- if (ipa[12] != 0) key.u.addMask |= 0x8;
+ if (ipa[1] != 0) key.u.addMask |= 0x2;
+ if (ipa[2] != 0) key.u.addMask |= 0x4;
+ if (ipa[3] != 0) key.u.addMask |= 0x8;
}
// Look for a dot product where the r,g,b colums are the same
@@ -257,13 +286,16 @@
case 3:
key.u.outVecSize = 2;
key.u.coeffMask &= ~0x8888;
+ key.u.addMask &= 7;
break;
case 2:
key.u.outVecSize = 1;
key.u.coeffMask &= ~0xCCCC;
+ key.u.addMask &= 3;
break;
default:
key.u.coeffMask &= ~0xEEEE;
+ key.u.addMask &= 1;
break;
}
@@ -278,7 +310,7 @@
return key;
}
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
#define DEF_SYM(x) \
extern "C" uint32_t _N_ColorMatrix_##x; \
@@ -408,7 +440,7 @@
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
-#if defined(ARCH_ARM_HAVE_NEON) && !defined(FAKE_ARM64_BUILD)
+#if defined(ARCH_ARM_HAVE_VFP) && !defined(FAKE_ARM64_BUILD)
mBufSize = 4096;
//StopWatch build_time("rs cm: build time");
mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
@@ -676,18 +708,12 @@
float add = 0.f;
if (fpMul > 254.f) add = 0.5f;
for(int ct=0; ct < 4; ct++) {
- tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
+ tmpFpa[ct] = fpa[ct] * addMul + add;
//ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
- tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
- tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
- tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
}
for(int ct=0; ct < 4; ct++) {
- ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
- ipa[ct * 4 + 1] = ipa[ct * 4];
- ipa[ct * 4 + 2] = ipa[ct * 4];
- ipa[ct * 4 + 3] = ipa[ct * 4];
+ ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
}
}
@@ -768,9 +794,9 @@
//ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
sum.x += add[0];
- sum.y += add[4];
- sum.z += add[8];
- sum.w += add[12];
+ sum.y += add[1];
+ sum.z += add[2];
+ sum.w += add[3];
//ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
@@ -826,12 +852,27 @@
//if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
if(x2 > x1) {
- int32_t len = (x2 - x1) >> 2;
- if((cp->mOptKernel != NULL) && (len > 0)) {
- cp->mOptKernel(out, in, cp->ip, len);
- x1 += len << 2;
- out += outstep * (len << 2);
- in += instep * (len << 2);
+ int32_t len = x2 - x1; // pixels remaining in this span
+ if (gArchUseSIMD) {
+ if((cp->mOptKernel != NULL) && (len >= 4)) {
+ cp->mOptKernel(out, in, cp->ip, len >> 2); // the count argument is in groups of 4 pixels
+ x1 += len & ~3; // advance past whole groups only, so the scalar loop below finishes the 0-3 pixel tail
+ out += outstep * (len & ~3);
+ in += instep * (len & ~3);
+ }
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ else { // no generated kernel (or fewer than 4 pixels): use the AArch64 assembly kernels
+ size_t done; // pixels actually consumed by the assembly kernel
+ if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
+ done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); // the kernel returns the leftover count
+ } else {
+ done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); // the kernel returns the leftover count
+ }
+ x1 += done; // whatever is left falls through to the scalar loop
+ out += outstep * done;
+ in += instep * done;
+ }
+#endif
}
while(x1 != x2) {
@@ -872,8 +913,29 @@
mOptKernel = NULL;
if (build(key)) {
mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
- mLastKey = key;
}
+#if defined(ARCH_ARM_HAVE_VFP) && defined(FAKE_ARM64_BUILD)
+ else { // build() produced no kernel: populate the table used by the AArch64 assembly kernels
+ int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); // store-stage index; float variants sit 4 entries further on
+ int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); // load-stage index; float variants sit 4 entries further on
+ uint32_t mm = 0; // packed mask: one 5-bit field per output column
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ uint32_t m = (key.u.coeffMask >> i) & 0x1111; // bits i, i+4, i+8, i+12 of coeffMask, now at 0, 4, 8, 12
+ m = ((m * 0x249) >> 9) & 15; // gather those four bits into bits 0..3 (the multiply's partial products never collide)
+ m |= ((key.u.addMask >> i) & 1) << 4; // bit 4 flags an addend for this column
+ mm |= m << (i * 5);
+ }
+
+ if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { // any float endpoint selects the float kernel
+ rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
+ } else {
+ rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
+ }
+ }
+#endif
+ mLastKey = key;
}
}
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
new file mode 100644
index 0000000..7a6d4c5
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -0,0 +1,838 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+
+.macro vmxx_f32 i, mask, opd, opa, opb /* emit one f32 multiply term only if bit `mask` of `i` is set */
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1) /* a lower bit already emitted a term: accumulate onto it */
+ fmla \opd, \opa, \opb
+ .else /* first term of the sum: plain multiply initialises opd */
+ fmul \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2 /* emit the addend step only if bit `mask` of `i` is set */
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1) /* some multiply term exists: add the addend to it */
+ fadd \opd, \opa, \opb
+ .else /* no multiply terms: result is just the addend; callers pass the same registers in .16b form for mov */
+ mov \stupidsyntax1, \stupidsyntax2
+ .endif
+ .endif
+.endm
+
+.macro vmxx_s16 i, mask, opd, opa, opb /* emit one widening s16 multiply (low half of opa) only if bit `mask` of `i` is set */
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16) /* a lower term exists, or bit 16 says opd was seeded with the addend: accumulate */
+ smlal \opd, \opa, \opb
+ .else /* first contribution: plain widening multiply */
+ smull \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+.macro vmxx2_s16 i, mask, opd, opa, opb /* as vmxx_s16, but uses the smlal2/smull2 forms (high half of opa) */
+ .if (\i) & \mask
+ .if (\i) & (\mask - 1 + 16) /* a lower term exists, or bit 16 says opd was seeded with the addend: accumulate */
+ smlal2 \opd, \opa, \opb
+ .else /* first contribution: plain widening multiply */
+ smull2 \opd, \opa, \opb
+ .endif
+ .endif
+.endm
+
+/* x0 = dst
+ * x1 = src
+ * x2 = count
+ * x3 = params
+ * x4 = column0_fn
+ * x5 = column1_fn
+ * x6 = column2_fn
+ * x7 = column3_fn
+ * x8 = store_fn
+ * x9 = load_fn
+ */
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 /* one copy of every column stage per 4-bit coefficient mask */
+
+.align 6
+colormatrix_int_col0_\i: /* output column 0, coefficient mask \i, no addend */
+ .if \i & 16 /* never true for i in 0..15; mirrors the n variant below */
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0] /* v6 = sums for the low four lanes of v12..v15 */
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0] /* v7 = sums for the high four lanes */
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8 /* shift right by 8 and narrow with unsigned saturation into v8 */
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5 /* chain to the next stage */
+
+colormatrix_int_col0_n\i: /* output column 0, mask value \i^31 (16..31): addend bit set */
+ .if (\i^31) & 16 /* seed both accumulators with addend lane 0 of v4 */
+ dup v6.4s, v4.s[0]
+ dup v7.4s, v4.s[0]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4]
+ sqshrun v8.4h, v6.4s, #8
+ sqshrun2 v8.8h, v7.4s, #8
+ br x5
+
+.align 6
+colormatrix_int_col1_\i: /* output column 1: coefficient lanes h[1]/h[5], result in v9 */
+ .if \i & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6 /* chain to the next stage */
+
+colormatrix_int_col1_n\i: /* output column 1 with addend lane 1 of v4 */
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[1]
+ dup v7.4s, v4.s[1]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5]
+ sqshrun v9.4h, v6.4s, #8
+ sqshrun2 v9.8h, v7.4s, #8
+ br x6
+
+.align 6
+colormatrix_int_col2_\i: /* output column 2: coefficient lanes h[2]/h[6], result in v10 */
+ .if \i & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7 /* chain to the next stage */
+
+colormatrix_int_col2_n\i: /* output column 2 with addend lane 2 of v4 */
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[2]
+ dup v7.4s, v4.s[2]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6]
+ sqshrun v10.4h, v6.4s, #8
+ sqshrun2 v10.8h, v7.4s, #8
+ br x7
+
+.align 6
+colormatrix_int_col3_\i: /* output column 3: coefficient lanes h[3]/h[7], result in v11 */
+ .if \i & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8 /* last column: chain to the store stage */
+
+colormatrix_int_col3_n\i: /* output column 3 with addend lane 3 of v4 */
+ .if (\i^31) & 16
+ dup v6.4s, v4.s[3]
+ dup v7.4s, v4.s[3]
+ .endif
+ vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3]
+ vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7]
+ vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3]
+ vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7]
+ vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3]
+ vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7]
+ vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3]
+ vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7]
+ sqshrun v11.4h, v6.4s, #8
+ sqshrun2 v11.8h, v7.4s, #8
+ br x8
+
+.align 5
+colormatrix_float_col0_\i: /* output column 0, coefficient mask \i, no addend */
+ vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0] /* v8 = result for the first four pixels (inputs v12..v15) */
+ vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b /* expands to nothing here: bit 16 is clear for i in 0..15 */
+ vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0] /* v16 = result for the second four pixels (inputs v20..v23) */
+ vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5 /* chain to the next stage */
+
+.align 4
+colormatrix_float_col0_n\i: /* output column 0, mask value \i^31 (16..31): addend v4 applied */
+ vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
+ vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0]
+ vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0]
+ vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0]
+ vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0]
+ vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
+ br x5
+
+.align 5
+colormatrix_float_col1_\i: /* output column 1: coefficient lane s[1], results in v9/v17 */
+ vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6 /* chain to the next stage */
+
+.align 4
+colormatrix_float_col1_n\i: /* output column 1 with addend v5 */
+ vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
+ vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1]
+ vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1]
+ vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1]
+ vmxx_f32 \i^31, 8, v17.4s, v23.4s, v3.s[1]
+ vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
+ br x6
+
+.align 5
+colormatrix_float_col2_\i: /* output column 2: coefficient lane s[2], results in v10/v18 */
+ vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7 /* chain to the next stage */
+
+.align 4
+colormatrix_float_col2_n\i: /* output column 2 with addend v6 */
+ vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
+ vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2]
+ vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2]
+ vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2]
+ vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2]
+ vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
+ br x7
+
+.align 5
+colormatrix_float_col3_\i: /* output column 3: coefficient lane s[3], results in v11/v19 */
+ vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8 /* last column: chain to the store stage */
+
+.align 4
+colormatrix_float_col3_n\i: /* output column 3 with addend v7 */
+ vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
+ vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3]
+ vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3]
+ vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3]
+ vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3]
+ vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
+ br x8
+
+.endr
+
+.align 6
+colormatrix_float_ldu4: /* load 8 four-channel u8 pixels and convert them to float for the float kernel */
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32 /* de-interleave: one register per channel; src advances 32 bytes */
+ uxtl v20.8h, v20.8b /* widen u8 -> u16 */
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v23.8h, v23.8b
+ uxtl v12.4s, v20.4h /* first four pixels widened to u32 in v12..v15 */
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl v15.4s, v23.4h
+ uxtl2 v20.4s, v20.8h /* second four pixels widened to u32 in v20..v23 */
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ uxtl2 v23.4s, v23.8h
+ ucvtf v12.4s, v12.4s /* unsigned integer -> float */
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v15.4s, v15.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ br x4 /* chain to the first column stage */
+
+.align 5
+colormatrix_int_ldu4: /* load 8 four-channel u8 pixels as 16-bit lanes for the integer kernel */
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ uxtl v15.8h, v15.8b
+ br x4
+
+.align 6
+colormatrix_float_ldu3: /* three-channel variant: still reads 4 bytes per pixel, but the fourth channel is not converted */
+ ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl v14.4s, v22.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ uxtl2 v22.4s, v22.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v14.4s, v14.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v22.4s, v22.4s
+ br x4
+
+colormatrix_int_ldu3: /* three-channel integer variant: reads 4 bytes per pixel and widens only three channels */
+ ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ uxtl v14.8h, v14.8b
+ br x4
+
+.align 5
+colormatrix_float_ldu1: /* single-channel u8 input: 8 bytes -> two float vectors (v12, v20) */
+ ld1 {v20.8b}, [x1], #8
+ uxtl v20.8h, v20.8b
+ uxtl v12.4s, v20.4h
+ uxtl2 v20.4s, v20.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v20.4s, v20.4s
+ br x4
+
+.align 6
+colormatrix_float_ldu2: /* two-channel u8 input: 16 bytes -> v12/v13 and v20/v21 as float */
+ ld2 {v20.8b,v21.8b}, [x1], #16
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v12.4s, v20.4h
+ uxtl v13.4s, v21.4h
+ uxtl2 v20.4s, v20.8h
+ uxtl2 v21.4s, v21.8h
+ ucvtf v12.4s, v12.4s
+ ucvtf v13.4s, v13.4s
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ br x4
+
+.align 4
+colormatrix_int_ldu2: /* two-channel u8 input widened to 16-bit lanes in v12/v13 */
+ ld2 {v12.8b,v13.8b}, [x1], #16
+ uxtl v12.8h, v12.8b
+ uxtl v13.8h, v13.8b
+ br x4
+
+.align 6
+colormatrix_float_stu4: /* convert the 8 float results per channel to u8 and store four interleaved channels */
+ fcvtzs v24.4s, v8.4s, #1 /* float -> signed fixed point with one fractional bit */
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v27.4s, v11.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ fcvtzs v31.4s, v19.4s, #1
+ sqrshrun v24.4h, v24.4s, #1 /* rounding shift drops the fractional bit; narrow to u16 with saturation */
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun v27.4h, v27.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1 /* second four pixels go into the upper halves */
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ sqrshrun2 v27.8h, v31.4s, #1
+ uqxtn v24.8b, v24.8h /* saturate u16 -> u8 */
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ uqxtn v27.8b, v27.8h
+ subs x2, x2, #8 /* reserve the next batch of 8 pixels before looping */
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end /* fewer than 8 pixels remain: leave the loop */
+ br x9 /* otherwise jump back to the load stage */
+
+.align 5
+colormatrix_int_stu4: /* narrow the four 16-bit result vectors to u8 and store them interleaved */
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ uqxtn v15.8b, v11.8h
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu3: /* three-channel u8 output: stored as 4 bytes per pixel with a zero fourth byte */
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v26.4s, v10.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ fcvtzs v30.4s, v18.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun v26.4h, v26.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ sqrshrun2 v26.8h, v30.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ uqxtn v26.8b, v26.8h
+ movi v27.8b, #0 /* padding channel */
+ subs x2, x2, #8
+ st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+.align 4
+colormatrix_int_ldu1: /* single-channel u8 input widened to 16-bit lanes in v12 */
+ ld1 {v12.8b}, [x1], #8
+ uxtl v12.8h, v12.8b
+ br x4
+
+.align 5
+colormatrix_int_stu3: /* three-channel integer output: zero fourth byte per pixel */
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ uqxtn v14.8b, v10.8h
+ movi v15.8b, #0 /* padding channel */
+ subs x2, x2, #8
+ st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
+ blo colormatrix_int_end
+ br x9
+
+.align 6
+colormatrix_float_stu2: /* two-channel u8 output from the float results */
+ fcvtzs v24.4s, v8.4s, #1
+ fcvtzs v25.4s, v9.4s, #1
+ fcvtzs v28.4s, v16.4s, #1
+ fcvtzs v29.4s, v17.4s, #1
+ sqrshrun v24.4h, v24.4s, #1
+ sqrshrun v25.4h, v25.4s, #1
+ sqrshrun2 v24.8h, v28.4s, #1
+ sqrshrun2 v25.8h, v29.4s, #1
+ uqxtn v24.8b, v24.8h
+ uqxtn v25.8b, v25.8h
+ subs x2, x2, #8
+ st2 {v24.8b,v25.8b}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_int_stu2: /* two-channel u8 output from the integer results */
+ uqxtn v12.8b, v8.8h
+ uqxtn v13.8b, v9.8h
+ subs x2, x2, #8
+ st2 {v12.8b,v13.8b}, [x0], #16
+ blo colormatrix_int_end
+ br x9
+
+.align 5
+colormatrix_int_stu1: /* single-channel u8 output from the integer results */
+ uqxtn v12.8b, v8.8h
+ subs x2, x2, #8
+ st1 {v12.8b}, [x0], #8
+ blo colormatrix_int_end
+ br x9
+
+colormatrix_float_ldf3: /* three-channel float input: same 16-byte-per-pixel loads as ldf4 below */
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64 /* first four pixels, de-interleaved by channel */
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64 /* second four pixels */
+ br x4 /* chain to the first column stage */
+
+.align 6
+colormatrix_float_stu1: /* single-channel u8 output from the float results */
+ fcvtzs v24.4s, v8.4s, #1 /* float -> fixed point with one fractional bit */
+ fcvtzs v28.4s, v16.4s, #1
+ sqrshrun v24.4h, v24.4s, #1 /* rounding shift, narrow to u16 with saturation */
+ sqrshrun2 v24.8h, v28.4s, #1
+ uqxtn v24.8b, v24.8h /* saturate u16 -> u8 */
+ subs x2, x2, #8
+ st1 {v24.8b}, [x0], #8
+ blo colormatrix_float_end /* fewer than 8 pixels remain: leave the loop */
+ br x9 /* otherwise jump back to the load stage */
+
+colormatrix_float_stf3: /* three-channel float output: stored as four floats per pixel with a zero fourth lane */
+ movi v11.16b, #0 /* padding channel, first four pixels */
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ movi v19.16b, #0 /* padding channel, second four pixels */
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+.align 5
+colormatrix_float_stf4: /* four-channel float output, interleaved, 8 pixels (128 bytes) */
+ st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
+ subs x2, x2, #8
+ st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf4: /* four-channel float input, 8 pixels (128 bytes) */
+ ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
+ ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
+ br x4
+
+.align 5
+colormatrix_float_stf2: /* two-channel float output, 8 pixels (64 bytes) */
+ st2 {v8.4s, v9.4s}, [x0], #32
+ subs x2, x2, #8
+ st2 {v16.4s, v17.4s}, [x0], #32
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf2: /* two-channel float input, 8 pixels (64 bytes) */
+ ld2 {v12.4s,v13.4s}, [x1], #32
+ ld2 {v20.4s,v21.4s}, [x1], #32
+ br x4
+
+.align 5
+colormatrix_float_stf1: /* single-channel float output, 8 pixels (32 bytes) */
+ st1 {v8.4s}, [x0], #16
+ subs x2, x2, #8
+ st1 {v16.4s}, [x0], #16
+ blo colormatrix_float_end
+ br x9
+
+colormatrix_float_ldf1: /* single-channel float input, 8 pixels (32 bytes) */
+ ld1 {v12.4s}, [x1], #16
+ ld1 {v20.4s}, [x1], #16
+ br x4
+
+
+/* size_t rsdIntrinsicColorMatrix_int_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * int16_t const *mult, // x4
+ * int32_t const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_int_K)
+ stp x8,x9, [sp, #-16]! /* x8/x9 are reused below for the store/load stage pointers */
+ sub x7, sp, #32 /* x7 = upper half of the 64-byte area reserved on the next line */
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp] /* save the low 64 bits of v8..v15 */
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.8h,v1.8h}, [x4], #32 /* v0/v1 = the 16 int16 coefficients */
+ ld1 {v4.4s}, [x5], #16 /* v4 = the four int32 addends */
+
+ ldp x4,x5, [x3],#16 /* x4..x7 = column stages 0..3 */
+ ldp x6,x7, [x3],#16
+ ldp x8,x9, [x3],#16 /* x8 = store stage, x9 = load stage */
+
+ dup v12.4s, v4.s[0] /* pre-fill v8..v11 with each addend, shifted and saturated like a column result; */
+ dup v13.4s, v4.s[1] /* presumably this is what gets stored for columns the setup routine skipped */
+ dup v14.4s, v4.s[2]
+ dup v15.4s, v4.s[3]
+ sqshrun v8.4h, v12.4s, #8
+ sqshrun2 v8.8h, v12.4s, #8
+ sqshrun v9.4h, v13.4s, #8
+ sqshrun2 v9.8h, v13.4s, #8
+ sqshrun v10.4h, v14.4s, #8
+ sqshrun2 v10.8h, v14.4s, #8
+ sqshrun v11.4h, v15.4s, #8
+ sqshrun2 v11.8h, v15.4s, #8
+
+ subs x2, x2, #8 /* pixels are handled 8 at a time */
+ blo colormatrix_int_end /* fewer than 8 pixels: nothing to do */
+ br x9 /* enter the load -> columns -> store chain */
+
+colormatrix_int_end: /* shared exit, also reached from the integer store stages */
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ldp x8,x9, [sp], #16
+ add x0, x2, #8 /* undo the last subtraction: return the count of unprocessed pixels */
+ ret
+END(rsdIntrinsicColorMatrix_int_K)
+
+/* void rsdIntrinsicColorMatrixSetup_int_K(
+ * fntab_t *fns, // x0 (written)
+ * uint32_t mask, // x1 (low 20 bits used)
+ * int dt, // w2 (sign-extended on use)
+ * int st); // w3 (sign-extended on use)
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
+ adr x4, 2f /* table of store-stage offsets */
+ ldrsh x2, [x4, w2, sxtw #1] /* dt is a 32-bit int: bits 63:32 of x2 are unspecified on entry, so extend w2 */
+ add x2, x2, x4 /* offsets are relative to the table base */
+ adr x4, 3f /* table of load-stage offsets */
+ ldrsh x3, [x4, w3, sxtw #1] /* same for st */
+ add x3, x3, x4
+ stp x2, x3, [x0, #32] /* fns->store, fns->load */
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+ mov x3, #4 /* four output columns */
+ adr x4, 4f /* column table: 4 halfwords per 5-bit mask value */
+1: ands x2, x1, #15 /* coefficient bits of this column */
+ beq 9f /* none set: x2 is 0 and is stored as NULL */
+ and x2, x1, #31 /* 5-bit index, including the addend bit */
+ lsl x2, x2, #3 /* 8 bytes per index */
+ ldrsh x2, [x4, x2]
+ add x2, x2, x4 /* entries are biased by -2 per column to match x4 advancing below */
+9: str x2, [x0], #8
+ lsr x1, x1, #5 /* next column's field */
+ add x4, x4, #2 /* next column's halfword within each table row */
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+ ldr x2, [x0] /* x0 now points at fns->store */
+ mov x3, #4
+1: ldr x1, [x0, #-8]! /* walk the column slots backwards */
+ cmp x1, #0
+ csel x2, x1, x2, ne /* keep the slot if set, else inherit the following stage */
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+ .align 4
+2: .hword colormatrix_int_stu1-2b
+ .hword colormatrix_int_stu2-2b
+ .hword colormatrix_int_stu3-2b
+ .hword colormatrix_int_stu4-2b
+3: .hword colormatrix_int_ldu1-3b
+ .hword colormatrix_int_ldu2-3b
+ .hword colormatrix_int_ldu3-3b
+ .hword colormatrix_int_ldu4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .hword colormatrix_int_col0_\i-4b
+ .hword colormatrix_int_col1_\i-4b-2
+ .hword colormatrix_int_col2_\i-4b-4
+ .hword colormatrix_int_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .hword colormatrix_int_col0_n\i-4b
+ .hword colormatrix_int_col1_n\i-4b-2
+ .hword colormatrix_int_col2_n\i-4b-4
+ .hword colormatrix_int_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_int_K)
+
+
+/* size_t rsdIntrinsicColorMatrix_float_K(
+ * void *out, // x0
+ * void const *in, // x1
+ * size_t count, // x2
+ * fntab_t const *fns, // x3
+ * float const *mult, // x4
+ * float const *add); // x5
+ */
+ENTRY(rsdIntrinsicColorMatrix_float_K)
+ stp x8,x9, [sp, #-16]! /* x8/x9 are reused below for the store/load stage pointers */
+ sub x7, sp, #32 /* x7 = upper half of the 64-byte area reserved on the next line */
+ sub sp, sp, #64
+ st1 {v8.1d-v11.1d}, [sp] /* save the low 64 bits of v8..v15 */
+ st1 {v12.1d-v15.1d}, [x7]
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64 /* v0..v3 = the 16 float coefficients */
+ ld1r {v4.4s}, [x5], #4 /* v4..v7 = each addend replicated across all lanes */
+ ld1r {v5.4s}, [x5], #4
+ ld1r {v6.4s}, [x5], #4
+ ld1r {v7.4s}, [x5], #4
+
+ ldp x4,x5, [x3], #16 /* x4..x7 = column stages 0..3 */
+ ldp x6,x7, [x3], #16
+ ldp x8,x9, [x3], #16 /* x8 = store stage, x9 = load stage */
+
+ mov v8.16b, v4.16b /* pre-fill both result sets with the addends; */
+ mov v9.16b, v5.16b /* presumably this is what gets stored for columns the setup routine skipped */
+ mov v10.16b, v6.16b
+ mov v11.16b, v7.16b
+
+ mov v16.16b, v4.16b
+ mov v17.16b, v5.16b
+ mov v18.16b, v6.16b
+ mov v19.16b, v7.16b
+
+ subs x2, x2, #8 /* pixels are handled 8 at a time */
+ blo colormatrix_float_end /* fewer than 8 pixels: nothing to do */
+ br x9 /* enter the load -> columns -> store chain */
+
+colormatrix_float_end: /* shared exit, also reached from the float store stages */
+ ld1 {v8.1d-v11.1d}, [sp], #32
+ ld1 {v12.1d-v15.1d}, [sp], #32
+ ldp x8,x9, [sp], #16
+ add x0, x2, #8 /* undo the last subtraction: return the count of unprocessed pixels */
+ ret
+END(rsdIntrinsicColorMatrix_float_K)
+
+/* void rsdIntrinsicColorMatrixSetup_float_K(
+ * fntab_t *fns, // x0 (written)
+ * uint32_t mask, // x1 (low 20 bits used)
+ * int dt, // w2 (sign-extended on use)
+ * int st); // w3 (sign-extended on use)
+ */
+ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
+ adr x4, 2f /* table of store-stage offsets: u8 variants, then float variants */
+ ldrsh x2, [x4, w2, sxtw #1] /* dt is a 32-bit int: bits 63:32 of x2 are unspecified on entry, so extend w2 */
+ add x2, x2, x4 /* offsets are relative to the table base */
+ adr x4, 3f /* table of load-stage offsets: u8 variants, then float variants */
+ ldrsh x3, [x4, w3, sxtw #1] /* same for st */
+ add x3, x3, x4
+ stp x2, x3, [x0, #32] /* fns->store, fns->load */
+
+/* For each column function, if the matrix is all zeroes then write NULL,
+ * otherwise look up the appropriate function and store that. */
+
+ mov x3, #4 /* four output columns */
+ adr x4, 4f /* column table: 4 halfwords per 5-bit mask value */
+1: ands x2, x1, #15 /* coefficient bits of this column */
+ beq 9f /* none set: x2 is 0 and is stored as NULL */
+ and x2, x1, #31 /* 5-bit index, including the addend bit */
+ lsl x2, x2, #3 /* 8 bytes per index */
+ ldrsh x2, [x4, x2]
+ add x2, x2, x4 /* entries are biased by -2 per column to match x4 advancing below */
+9: str x2, [x0], #8
+ lsr x1, x1, #5 /* next column's field */
+ add x4, x4, #2 /* next column's halfword within each table row */
+ subs x3, x3, #1
+ bne 1b
+
+/* For every NULL entry, copy the non-NULL entry that follows it, or the store
+ * function. */
+
+ ldr x2, [x0] /* x0 now points at fns->store */
+ mov x3, #4
+1: ldr x1, [x0, #-8]! /* walk the column slots backwards */
+ cmp x1, #0
+ csel x2, x1, x2, ne /* keep the slot if set, else inherit the following stage */
+ str x2, [x0]
+ subs x3, x3, #1
+ bne 1b
+ ret
+
+ .align 4
+2: .hword colormatrix_float_stu1-2b
+ .hword colormatrix_float_stu2-2b
+ .hword colormatrix_float_stu3-2b
+ .hword colormatrix_float_stu4-2b
+ .hword colormatrix_float_stf1-2b
+ .hword colormatrix_float_stf2-2b
+ .hword colormatrix_float_stf3-2b
+ .hword colormatrix_float_stf4-2b
+3: .hword colormatrix_float_ldu1-3b
+ .hword colormatrix_float_ldu2-3b
+ .hword colormatrix_float_ldu3-3b
+ .hword colormatrix_float_ldu4-3b
+ .hword colormatrix_float_ldf1-3b
+ .hword colormatrix_float_ldf2-3b
+ .hword colormatrix_float_ldf3-3b
+ .hword colormatrix_float_ldf4-3b
+4:
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .hword colormatrix_float_col0_\i-4b
+ .hword colormatrix_float_col1_\i-4b-2
+ .hword colormatrix_float_col2_\i-4b-4
+ .hword colormatrix_float_col3_\i-4b-6
+.endr
+.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ .hword colormatrix_float_col0_n\i-4b
+ .hword colormatrix_float_col1_n\i-4b-2
+ .hword colormatrix_float_col2_n\i-4b-4
+ .hword colormatrix_float_col3_n\i-4b-6
+.endr
+END(rsdIntrinsicColorMatrixSetup_float_K)
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
index 249ac58..34162ee 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -29,10 +29,10 @@
vpush {q4-q7}
vld1.16 {q2}, [r2]!
vld1.16 {q3}, [r2]!
- vld1.32 {q4}, [r2]!
- vld1.32 {q5}, [r2]!
- vld1.32 {q6}, [r2]!
- vld1.32 {q7}, [r2]!
+ vld1.32 {d8[],d9[]}, [r2]!
+ vld1.32 {d10[],d11[]}, [r2]!
+ vld1.32 {d12[],d13[]}, [r2]!
+ vld1.32 {d14[],d15[]}, [r2]!
veor q0, q0
veor q1, q1
veor q9, q9
@@ -43,15 +43,15 @@
SNIP_START(_N_ColorMatrix_prefix_f)
stmfd sp!, {r4, lr}
vpush {q4-q7}
- add r2, #96
+ add r2, #48
vld1.32 {q4}, [r2]!
vld1.32 {q5}, [r2]!
vld1.32 {q6}, [r2]!
vld1.32 {q7}, [r2]!
- vld1.32 {q8}, [r2]!
- vld1.32 {q9}, [r2]!
- vld1.32 {q10}, [r2]!
- vld1.32 {q11}, [r2]!
+ vld1.32 {d16[],d17[]}, [r2]!
+ vld1.32 {d18[],d19[]}, [r2]!
+ vld1.32 {d20[],d21[]}, [r2]!
+ vld1.32 {d22[],d23[]}, [r2]!
veor q1, q1
veor q2, q2
veor q3, q3