Color Matrix improvements.
Change-Id: I4594ea43a0a2b298a9ad66bd5e63d8b829d4f620
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index 5becdf1..b3de4f4 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -34,7 +34,8 @@
ifeq ($(ARCH_ARM_HAVE_NEON),true)
LOCAL_CFLAGS += -DARCH_ARM_HAVE_NEON
LOCAL_SRC_FILES+= \
- rsCpuIntrinsics_neon.S
+ rsCpuIntrinsics_neon.S \
+ rsCpuIntrinsics_neon_ColorMatrix.S
endif
ifeq ($(ARCH_ARM_HAVE_VFP),true)
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 3fc322c..4a57c89 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -14,9 +14,19 @@
* limitations under the License.
*/
+#include <sys/mman.h>
+#include <unistd.h>
#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"
+#include "linkloader/include/MemChunk.h"
+
+#include <sys/mman.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+//#include <utils/StopWatch.h>
+
using namespace android;
using namespace android::renderscript;
@@ -34,49 +44,425 @@
virtual ~RsdCpuScriptIntrinsicColorMatrix();
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
+ virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+ virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc);
+
protected:
float fp[16];
+ float fpa[4];
short ip[16];
+ int ipa[4];
- static void kernel4x4(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep);
- static void kernel3x3(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep);
- static void kernelDot(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep);
+ static void kernel(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep);
+
+ int64_t mLastKey;
+ unsigned char *mBuf;
+ size_t mBufSize;
+
+ int64_t computeKey(const Element *ein, const Element *eout);
+
+ bool build(int64_t key);
+
+ void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
+
};
}
}
-void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
- size_t dataLength) {
- rsAssert(slot == 0);
- memcpy (fp, data, dataLength);
- for(int ct=0; ct < 16; ct++) {
- ip[ct] = (short)(fp[ct] * 255.f + 0.5f);
+#define CM_IN_VEC_SIZE_MASK 0x00000003 // bits 0-1: input vector size minus one (see computeKey)
+#define CM_OUT_VEC_SIZE_MASK 0x0000000c // bits 2-3: output vector size minus one
+#define CM_IN_TYPE_SIZE_MASK 0x000000F0 // bits 4-7: computeKey sets bit 4 when the input is float32
+#define CM_OUT_TYPE_SIZE_MASK 0x00000F00 // bits 8-11: computeKey sets bit 8 when the output is float32
+#define CM_DOT_MASK 0x00001000 // set when the r, g and b entries of every matrix row are equal
+#define CM_ADD_MASK 0x00002000 // not set or tested anywhere in the visible code
+#define CM_COPY_ALPHA 0x00004000 // set when key bits 19, 23, 27 are clear and ip[15] == 256
+#define CM_MATRIX_MASK 0xFFFF0000 // bits 16-31: bit (16 + i) is set when coefficient i is non-zero
+
+
+// Computes the key that identifies which specialised kernel is required.
+//
+// Layout of the returned value (only the low 32 bits are ever set here):
+//   bits 0-1   input vector size minus one
+//   bits 2-3   output vector size minus one
+//   bit  4     input element type is float32
+//   bit  8     output element type is float32
+//   bit  12    CM_DOT_MASK, r/g/b entries of every row are identical
+//   bit  14    CM_COPY_ALPHA, alpha is passed through untouched
+//   bits 16-31 bit (16 + i) is set when matrix coefficient i is non-zero
+//
+// ein/eout are the input and output element descriptions; the matrix is read
+// from the fp[] / ip[] members filled in by setGlobalVar().
+int64_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
+ const Element *ein, const Element *eout) {
+
+ int64_t key = 0;
+
+ // Compute a unique code key for this operation
+
+ // Add to the key the input and output types
+ key |= (ein->getVectorSize() - 1);
+ key |= (eout->getVectorSize() - 1) << 2;
+
+ bool hasFloat = false;
+ if (ein->getType() == RS_TYPE_FLOAT_32) {
+ hasFloat = true;
+ key |= 1 << 4;
+ }
+ if (eout->getType() == RS_TYPE_FLOAT_32) {
+ hasFloat = true;
+ key |= 1 << 8;
}
- mRootPtr = &kernel4x4;
- if ((ip[3] == 0) && (ip[7] == 0) && (ip[11] == 0) &&
- (ip[12] == 0) && (ip[13] == 0) && (ip[14] == 0) && (ip[15] == 255)) {
- mRootPtr = &kernel3x3;
-
- if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
- (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
- (ip[8] == ip[9]) && (ip[8] == ip[10])) {
- mRootPtr = &kernelDot;
+ // Mask in the bits indicating which coefficients in the
+ // color matrix are needed.
+ // The shift is done on an unsigned value: for i == 15 the bit lands in
+ // position 31, which a signed int shift must not reach.
+ if (hasFloat) {
+ for (uint32_t i=0; i < 16; i++) {
+ if (fabs(fp[i]) != 0.f) {
+ key |= ((uint32_t)1 << (i + 16));
+ }
+ }
+ } else {
+ for (uint32_t i=0; i < 16; i++) {
+ if (ip[i] != 0) {
+ key |= ((uint32_t)1 << (i + 16));
+ }
}
}
+
+ // Look for a dot product where the r,g,b columns are the same
+ if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
+ (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
+ (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
+ (ip[12] == ip[13]) && (ip[12] == ip[14])) {
+
+ key |= CM_DOT_MASK;
+ }
+
+ // Is alpha a simple copy
+ // 0x08880000 selects key bits 19, 23 and 27, i.e. coefficients 3, 7 and 11
+ // (the r, g, b contributions to alpha); ip[15] == 256 is 1.0 in 8.8 fixed point.
+ if (!(key & 0x08880000) && (ip[15] == 256)) {
+ key |= CM_COPY_ALPHA;
+ }
+
+ //ALOGE("build key %08x, %08x", (int32_t)(key >> 32), (int32_t)key);
+ return key;
}
-extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, const short *coef, uint32_t count);
-extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, const short *coef, uint32_t count);
+#if defined(ARCH_ARM_HAVE_NEON)
+
+#define DEF_SYM(x) /* declares the three symbols SNIP_START/SNIP_END export for snippet x */ \
+ extern "C" uint32_t _N_ColorMatrix_##x; /* first word of the snippet */ \
+ extern "C" uint32_t _N_ColorMatrix_##x##_end; /* label just past the snippet */ \
+ extern "C" uint32_t _N_ColorMatrix_##x##_len; // word holding the snippet length in bytes
+
+DEF_SYM(prefix) // register save + coefficient load
+DEF_SYM(postfix1) // loop counter decrement
+DEF_SYM(postfix2) // register restore + return
+DEF_SYM(load_u8_4)
+DEF_SYM(load_u8_2)
+DEF_SYM(load_u8_1)
+DEF_SYM(store_u8_4)
+DEF_SYM(store_u8_2)
+DEF_SYM(store_u8_1)
+DEF_SYM(unpack_u8_4)
+DEF_SYM(unpack_u8_3)
+DEF_SYM(unpack_u8_2)
+DEF_SYM(unpack_u8_1)
+DEF_SYM(pack_u8_4)
+DEF_SYM(pack_u8_3)
+DEF_SYM(pack_u8_2)
+DEF_SYM(pack_u8_1)
+DEF_SYM(dot)
+DEF_SYM(add_0_u8) // add_* snippets are declared only: none is defined in the visible assembly or used by build()
+DEF_SYM(add_1_u8)
+DEF_SYM(add_2_u8)
+DEF_SYM(add_3_u8)
+#define ADD_CHUNK(x) \
+ memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
+ buf += _N_ColorMatrix_##x##_len
+
+
+/*
+ * Writes one ARM branch instruction at buf that jumps to target and returns
+ * the position just after it.
+ *
+ * condition goes into the top four bits of the instruction word.  The branch
+ * offset is expressed in words relative to buf + 8 and must fit in the 24-bit
+ * offset field, which the assert checks.
+ */
+static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
+    // Word offset from (buf + 8); negative for a backwards branch.
+    size_t wordOffset = (target - buf - 8) >> 2;
+    const size_t topByte = wordOffset & 0xff000000;
+    rsAssert((topByte == 0) || (topByte == 0xff000000));
+
+    uint32_t insn = (condition << 28);
+    insn |= 0xa << 24; // branch
+    insn |= 0xffffff & wordOffset;
+
+    uint32_t *slot = (uint32_t *)buf;
+    slot[0] = insn;
+    return buf + 4;
+}
+
+/*
+ * Packs three SIMD register numbers (each 0..31) and the q / sz flags into the
+ * bit positions used by the instruction words built in this file.
+ *
+ * Each register number is split into its low four bits and its fifth bit:
+ *   vd -> bits 12-15 and bit 22
+ *   vn -> bits 16-19 and bit 7
+ *   vm -> bits 0-3   and bit 5
+ * q sets bit 6 and sz sets bit 8.
+ */
+static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm, bool q, bool sz) {
+    rsAssert(vd < 32);
+    rsAssert(vm < 32);
+    rsAssert(vn < 32);
+
+    const uint32_t vdBits = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
+    const uint32_t vmBits = (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
+    const uint32_t vnBits = ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
+
+    uint32_t encoded = vdBits | vmBits | vnBits;
+    if (q) {
+        encoded |= 1 << 6;
+    }
+    if (sz) {
+        encoded |= 1 << 8;
+    }
+    return encoded;
+}
+
+/*
+ * Emits "vmlal.s16 Q<dest_q>, D<src_d1>, D<src_d2>[<src_d2_s>]" at buf and
+ * returns the position just after the 4-byte instruction.
+ *
+ * The Q register is passed to the encoder as D register number dest_q * 2 and
+ * the scalar lane index is folded into the vm field above the register number.
+ */
+static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
+    //vmlal.s16 Q#1, D#1, D#2[#]
+    const uint32_t scalarField = src_d2 | (src_d2_s << 3);
+    const uint32_t regBits = encodeSIMDRegs(dest_q << 1, src_d1, scalarField, false, false);
+    uint32_t *slot = (uint32_t *)buf;
+    slot[0] = 0xf2900240 | regBits;
+    return buf + 4;
+}
+
+/*
+ * Emits "vmull.s16 Q<dest_q>, D<src_d1>, D<src_d2>[<src_d2_s>]" at buf and
+ * returns the position just after the 4-byte instruction.
+ *
+ * Register and lane encoding is the same as in addVMLAL_S16; only the base
+ * opcode differs.
+ */
+static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
+    //vmull.s16 Q#1, D#1, D#2[#]
+    const uint32_t scalarField = src_d2 | (src_d2_s << 3);
+    const uint32_t regBits = encodeSIMDRegs(dest_q << 1, src_d1, scalarField, false, false);
+    uint32_t *slot = (uint32_t *)buf;
+    slot[0] = 0xf2900A40 | regBits;
+    return buf + 4;
+}
+#endif
+
+
+/*
+ * Assembles a NEON kernel specialised for `key` into a freshly mapped buffer.
+ *
+ * The kernel is stitched together from the pre-assembled _N_ColorMatrix_*
+ * snippets (copied with ADD_CHUNK) plus one VMULL/VMLAL instruction for every
+ * matrix coefficient whose bit (16..31) is set in `key`:
+ *
+ *   prefix | load | unpack | multiply-accumulate | pack | store | postfix1 |
+ *   branch-if-not-zero back to load | postfix2
+ *
+ * Returns true with mBuf pointing at the read+execute mapping; the caller
+ * converts mBuf into mOptKernel.  Returns false when the mapping cannot be
+ * created or made executable (no mapping is kept in that case), and always
+ * when built without ARCH_ARM_HAVE_NEON.
+ *
+ * NOTE(review): an output column whose four coefficient bits are all clear
+ * never gets its accumulator (q8..q11) written, yet the pack snippets still
+ * read it.  Likewise the dot path packs only q8, so a computed alpha (q11) is
+ * not written back.  Both look unintended; confirm before relying on them.
+ */
+bool RsdCpuScriptIntrinsicColorMatrix::build(int64_t key) {
+#if defined(ARCH_ARM_HAVE_NEON)
+    mBufSize = 4096;
+    //StopWatch build_time("rs cm: build time");
+    void *mapping = mmap(0, mBufSize, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (mapping == MAP_FAILED) {
+        // mmap() signals failure with MAP_FAILED, not NULL.  Leave mBuf NULL
+        // so nobody writes to, executes or unmaps the bogus address.
+        mBuf = NULL;
+        return false;
+    }
+    mBuf = (uint8_t *)mapping;
+
+    uint8_t *buf = mBuf;
+
+    const uint32_t inVecSize = (uint32_t)(key & CM_IN_VEC_SIZE_MASK);
+    const bool isDot = (key & CM_DOT_MASK) != 0;
+    const bool copyAlpha = (key & CM_COPY_ALPHA) != 0;
+
+    // Add the function prefix
+    // Store the address for the loop return
+    ADD_CHUNK(prefix);
+    uint8_t *loopTop = buf;
+
+    // Load the incoming r,g,b,a as needed
+    switch (inVecSize) {
+    case 3:
+        ADD_CHUNK(load_u8_4);
+        if (copyAlpha) {
+            ADD_CHUNK(unpack_u8_3);
+        } else {
+            ADD_CHUNK(unpack_u8_4);
+        }
+        break;
+    case 2:
+        ADD_CHUNK(load_u8_4);
+        ADD_CHUNK(unpack_u8_3);
+        break;
+    case 1:
+        ADD_CHUNK(load_u8_2);
+        ADD_CHUNK(unpack_u8_2);
+        break;
+    case 0:
+        ADD_CHUNK(load_u8_1);
+        ADD_CHUNK(unpack_u8_1);
+        break;
+    }
+
+    // Add multiply and accumulate
+    // use MULL to init the output register,
+    // use MLAL from there
+    //
+    // Coefficient (4 * inCh + outCh) multiplies input channel inCh, unpacked
+    // in d24/d26/d28/d30, by lane outCh of d4..d7 and accumulates into
+    // q8..q11.  Instructions are emitted input channel by input channel.
+    bool linit[4] = {false, false, false, false};
+    for (uint32_t inCh = 0; inCh < 4; inCh++) {
+        const uint32_t srcReg = 24 + (inCh << 1);
+        const uint32_t coefReg = 4 + inCh;
+        for (uint32_t outCh = 0; outCh < 4; outCh++) {
+            // A dot product only needs the first colour column; the dot
+            // snippet replicates it into the other two.
+            if (isDot && ((outCh == 1) || (outCh == 2))) {
+                continue;
+            }
+            // Alpha passes straight through, so alpha * m[15] is skipped.
+            if (copyAlpha && (inCh == 3) && (outCh == 3)) {
+                continue;
+            }
+            // 64-bit shift: `1 << 31` as an int would sign-extend when
+            // combined with the int64_t key.
+            const int64_t coefBit = (int64_t)1 << (16 + (inCh << 2) + outCh);
+            if (!(key & coefBit)) {
+                continue;
+            }
+            if (linit[outCh]) {
+                buf = addVMLAL_S16(buf, 8 + outCh, srcReg, coefReg, outCh);
+            } else {
+                buf = addVMULL_S16(buf, 8 + outCh, srcReg, coefReg, outCh);
+                linit[outCh] = true;
+            }
+        }
+    }
+
+    // If we have a dot product, perform the special pack.
+    if (isDot) {
+        ADD_CHUNK(pack_u8_1);
+        ADD_CHUNK(dot);
+    } else {
+        switch (inVecSize) {
+        case 3:
+            if (copyAlpha) {
+                // Alpha was neither unpacked nor accumulated; packing q11/q15
+                // would overwrite the loaded alpha in d3 with stale data.
+                ADD_CHUNK(pack_u8_3);
+            } else {
+                ADD_CHUNK(pack_u8_4);
+            }
+            break;
+        case 2:
+            ADD_CHUNK(pack_u8_3);
+            break;
+        case 1:
+            ADD_CHUNK(pack_u8_2);
+            break;
+        case 0:
+            ADD_CHUNK(pack_u8_1);
+            break;
+        }
+    }
+
+    // Write out result
+    switch (inVecSize) {
+    case 3:
+    case 2:
+        ADD_CHUNK(store_u8_4);
+        break;
+    case 1:
+        ADD_CHUNK(store_u8_2);
+        break;
+    case 0:
+        ADD_CHUNK(store_u8_1);
+        break;
+    }
+
+    // Loop, branch, and cleanup
+    ADD_CHUNK(postfix1);
+    // Condition 0x01 is "not equal": loop while the counter decremented in
+    // postfix1 has not reached zero.
+    buf = addBranch(buf, loopTop, 0x01);
+    ADD_CHUNK(postfix2);
+
+    // The generated code must fit in the mapping.
+    rsAssert((size_t)(buf - mBuf) <= mBufSize);
+
+    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
+    if (ret == -1) {
+        ALOGE("mprotect error %i", ret);
+        // The buffer can never be executed; release it right away.
+        munmap(mBuf, mBufSize);
+        mBuf = NULL;
+        return false;
+    }
+
+    cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
+    return true;
+#else
+    return false;
+#endif
+}
+
+/*
+ * Stores one exported script variable and refreshes its fixed-point copy.
+ *
+ *   slot 0: up to 16 floats -> fp[], mirrored into ip[] as value * 256
+ *   slot 1: up to 4 floats  -> fpa[], mirrored into ipa[] as value * 256
+ *
+ * At most sizeof(fp) / sizeof(fpa) bytes are copied, so an oversized
+ * dataLength can no longer write past the destination array.  Any other slot
+ * trips an assert.  mRootPtr is pointed at kernel() on every call.
+ *
+ * NOTE(review): "+ 0.5f" followed by truncation rounds to nearest only for
+ * non-negative values; negative coefficients round toward zero.
+ */
+void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
+                                                    size_t dataLength) {
+    switch(slot) {
+    case 0:
+        rsAssert(dataLength <= sizeof(fp));
+        memcpy(fp, data, (dataLength < sizeof(fp)) ? dataLength : sizeof(fp));
+        for(int ct=0; ct < 16; ct++) {
+            ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
+        }
+        break;
+    case 1:
+        rsAssert(dataLength <= sizeof(fpa));
+        memcpy(fpa, data, (dataLength < sizeof(fpa)) ? dataLength : sizeof(fpa));
+        for(int ct=0; ct < 4; ct++) {
+            ipa[ct] = (int)(fpa[ct] * 256.f + 0.5f);
+        }
+        break;
+    default:
+        rsAssert(0);
+        break;
+    }
+
+    mRootPtr = &kernel;
+}
+
static void One(const RsForEachStubParamStruct *p, uchar4 *out,
const uchar4 *py, const float* coeff) {
@@ -108,9 +494,9 @@
*out = convert_uchar4(sum);
}
-void RsdCpuScriptIntrinsicColorMatrix::kernel4x4(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep) {
+void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
+ uint32_t xstart, uint32_t xend,
+ uint32_t instep, uint32_t outstep) {
RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
uchar4 *out = (uchar4 *)p->out;
uchar4 *in = (uchar4 *)p->in;
@@ -118,15 +504,13 @@
uint32_t x2 = xend;
if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrix4x4_K(out, in, cp->ip, len);
+ if((cp->mOptKernel != NULL) && (len > 0)) {
+ cp->mOptKernel(out, in, cp->ip, len);
x1 += len << 2;
out += len << 2;
in += len << 2;
}
-#endif
while(x1 != x2) {
One(p, out++, in++, cp->fp);
@@ -135,79 +519,57 @@
}
}
-void RsdCpuScriptIntrinsicColorMatrix::kernel3x3(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep) {
- RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
- uchar4 *out = (uchar4 *)p->out;
- uchar4 *in = (uchar4 *)p->in;
- uint32_t x1 = xstart;
- uint32_t x2 = xend;
+void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
+ uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc) { // makes sure a kernel matching this launch's key exists
- if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrix3x3_K(out, in, cp->ip, len);
- x1 += len << 2;
- out += len << 2;
- in += len << 2;
- }
-#endif
-
- while(x1 != x2) {
- One(p, out++, in++, cp->fp);
- x1++;
+ int64_t key = computeKey(ain->mHal.state.type->getElement(),
+ aout->mHal.state.type->getElement()); // key depends on both element types and the current matrix
+ if ((mOptKernel == NULL) || (mLastKey != key)) { // rebuild only when there is no kernel or the key changed
+ if (mBuf) munmap(mBuf, mBufSize); // drop the previously generated code, if any
+ mBuf = NULL;
+ mOptKernel = NULL;
+ if (build(key)) { // on failure mOptKernel stays NULL and kernel() takes its per-pixel loop
+ mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; // generated code starts at the beginning of the buffer
+ mLastKey = key;
}
}
}
-void RsdCpuScriptIntrinsicColorMatrix::kernelDot(const RsForEachStubParamStruct *p,
- uint32_t xstart, uint32_t xend,
- uint32_t instep, uint32_t outstep) {
- RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
- uchar4 *out = (uchar4 *)p->out;
- uchar4 *in = (uchar4 *)p->in;
- uint32_t x1 = xstart;
- uint32_t x2 = xend;
+void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
+ uint32_t slot, const Allocation * ain, Allocation * aout,
+ const void * usr, uint32_t usrLen, const RsScriptCall *sc) { // empty body: the generated buffer is released in preLaunch or the destructor, not here
- if(x2 > x1) {
-#if defined(ARCH_ARM_HAVE_NEON)
- int32_t len = (x2 - x1) >> 2;
- if(len > 0) {
- rsdIntrinsicColorMatrixDot_K(out, in, cp->ip, len);
- x1 += len << 2;
- out += len << 2;
- in += len << 2;
- }
-#endif
- while(x1 != x2) {
- One(p, out++, in++, cp->fp);
- x1++;
- }
- }
}
-
RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
: RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
+ mLastKey = 0; // no kernel has been built yet
+ mBuf = NULL; // no generated-code mapping to release
+ mBufSize = 0;
+ mOptKernel = NULL; // preLaunch builds the kernel on first use
const static float defaultMatrix[] = {
1.f, 0.f, 0.f, 0.f,
0.f, 1.f, 0.f, 0.f,
0.f, 0.f, 1.f, 0.f,
0.f, 0.f, 0.f, 1.f
};
+ const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; // initial value for variable slot 1
setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
+ setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); // fills fpa[] / ipa[]
}
RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
+ if (mBuf) munmap(mBuf, mBufSize); // release the generated-code mapping created by build()
+ mBuf = NULL;
+ mOptKernel = NULL; // pointed into the mapping that was just released
}
void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
- s->mHal.info.exportedVariableCount = 1;
+ s->mHal.info.exportedVariableCount = 2; // slots 0 and 1, the two cases handled by setGlobalVar
}
RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index c8dc9bf..52fd565 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -104,173 +104,6 @@
bx lr
END(rsdIntrinsicConvolve3x3_K)
-/*
- r0 = dst
- r1 = src
- r2 = matrix
- r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrix4x4_K)
- stmfd sp!, {r4, lr}
- vpush {q4-q7}
-
- vld1.16 {q2}, [r2]!
- vld1.16 {q3}, [r2]!
-
-1:
- vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
- vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
- vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
- vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
- vmovl.u8 q12, d0 /* R */
- vmovl.u8 q13, d1 /* G */
- vmovl.u8 q14, d2 /* B */
- vmovl.u8 q15, d3 /* A */
-
- vmull.s16 q8, d24, d4[0]
- vmull.s16 q9, d24, d4[1]
- vmull.s16 q10, d24, d4[2]
- vmull.s16 q11, d24, d4[3]
-
- vmlal.s16 q8, d26, d5[0]
- vmlal.s16 q9, d26, d5[1]
- vmlal.s16 q10, d26, d5[2]
- vmlal.s16 q11, d26, d5[3]
-
- vmlal.s16 q8, d28, d6[0]
- vmlal.s16 q9, d28, d6[1]
- vmlal.s16 q10, d28, d6[2]
- vmlal.s16 q11, d28, d6[3]
-
- vmlal.s16 q8, d30, d7[0]
- vmlal.s16 q9, d30, d7[1]
- vmlal.s16 q10, d30, d7[2]
- vmlal.s16 q11, d30, d7[3]
-
- vshrn.i32 d24, q8, #8
- vshrn.i32 d26, q9, #8
- vshrn.i32 d28, q10, #8
- vshrn.i32 d30, q11, #8
-
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vqmovun.s16 d2, q14
- vqmovun.s16 d3, q15
-
- vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
- vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
- vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
- vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
- subs r3, r3, #1
- bne 1b
-
- vpop {q4-q7}
- ldmfd sp!, {r4, lr}
- bx lr
-END(rsdIntrinsicColorMatrix4x4_K)
-
-/*
- r0 = dst
- r1 = src
- r2 = matrix
- r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrix3x3_K)
- stmfd sp!, {r4, lr}
- vpush {q4-q7}
-
- vld1.16 {q2}, [r2]!
- vld1.16 {q3}, [r2]!
-
-1:
- vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
- vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
- vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
- vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
- vmovl.u8 q12, d0
- vmovl.u8 q13, d1
- vmovl.u8 q14, d2
-
- vmull.s16 q8, d24, d4[0]
- vmull.s16 q9, d24, d4[1]
- vmull.s16 q10, d24, d4[2]
-
- vmlal.s16 q8, d26, d5[0]
- vmlal.s16 q9, d26, d5[1]
- vmlal.s16 q10, d26, d5[2]
-
- vmlal.s16 q8, d28, d6[0]
- vmlal.s16 q9, d28, d6[1]
- vmlal.s16 q10, d28, d6[2]
-
- vshrn.i32 d24, q8, #8
- vshrn.i32 d26, q9, #8
- vshrn.i32 d28, q10, #8
-
- vqmovun.s16 d0, q12
- vqmovun.s16 d1, q13
- vqmovun.s16 d2, q14
-
- vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
- vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
- vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
- vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
- subs r3, r3, #1
- bne 1b
-
- vpop {q4-q7}
- ldmfd sp!, {r4, lr}
- bx lr
-END(rsdIntrinsicColorMatrix3x3_K)
-
-/*
- r0 = dst
- r1 = src
- r2 = matrix
- r3 = length
-*/
-ENTRY(rsdIntrinsicColorMatrixDot_K)
- stmfd sp!, {r4, lr}
- vpush {q4-q7}
-
- vld1.16 {q2}, [r2]!
- vld1.16 {q3}, [r2]!
-
-1:
- vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
- vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
- vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
- vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
-
- vmovl.u8 q12, d0
- vmovl.u8 q13, d1
- vmovl.u8 q14, d2
-
- vmull.s16 q8, d24, d4[0]
- vmlal.s16 q8, d26, d5[0]
- vmlal.s16 q8, d28, d6[0]
- vshrn.i32 d24, q8, #8
- vqmovun.s16 d0, q12
- vmov.u8 d1, d0
- vmov.u8 d2, d0
-
- vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
- vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
- vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
- vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
-
- subs r3, r3, #1
- bne 1b
-
- vpop {q4-q7}
- ldmfd sp!, {r4, lr}
- bx lr
-END(rsdIntrinsicColorMatrixDot_K)
-
/*
static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
diff --git a/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
new file mode 100644
index 0000000..ce8c033
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_neon_ColorMatrix.S
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define SNIP_START(x) /* opens a code snippet: exports its start label */ \
+ .globl x; x:
+
+#define SNIP_END(x) /* closes a snippet: exports x_end and a word x_len holding its size */ \
+ .globl x##_end; x##_end: \
+ .globl x##_len; x##_len: \
+ .word x##_end-x /* snippet length in bytes; ADD_CHUNK in the C++ file copies this many bytes */
+
+SNIP_START(_N_ColorMatrix_prefix)
+ stmfd sp!, {r4, lr} /* save r4 and the return address */
+ vpush {q4-q7} /* save q4-q7; restored by postfix2 */
+ vld1.16 {q2}, [r2]! /* coefficients 0-7 into d4/d5; r2 is the coef argument */
+ vld1.16 {q3}, [r2]! /* coefficients 8-15 into d6/d7 */
+ vld1.16 {d8}, [r2]! /* four more 16-bit values past the matrix; no visible snippet reads d8 */
+SNIP_END(_N_ColorMatrix_prefix)
+
+SNIP_START(_N_ColorMatrix_postfix1)
+ subs r3, r3, #1 /* decrement the loop counter and set flags; build() appends the branch back with addBranch() */
+ #bne 1b
+SNIP_END(_N_ColorMatrix_postfix1)
+
+SNIP_START(_N_ColorMatrix_postfix2)
+
+ #mov r0, #0
+ #ldr r0, [r0]
+
+ vpop {q4-q7} /* undo the vpush from the prefix */
+ ldmfd sp!, {r4, lr} /* undo the stmfd from the prefix */
+ bx lr /* return to the caller */
+SNIP_END(_N_ColorMatrix_postfix2)
+
+/*
+ * Load / store snippets.  Every loop iteration of the generated kernel handles
+ * four pixels: r1 is the source pointer, r0 the destination pointer, and both
+ * are post-incremented by the number of bytes those four pixels occupy.
+ */
+
+/* 4 channels: lane i of d0..d3 receives the four bytes of pixel i (16 bytes). */
+SNIP_START(_N_ColorMatrix_load_u8_4)
+    vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
+    vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
+    vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
+    vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_4)
+
+/* 2 channels: lane i of d0/d1 receives the two bytes of pixel i (8 bytes). */
+SNIP_START(_N_ColorMatrix_load_u8_2)
+    vld2.8 {d0[0],d1[0]}, [r1]!
+    vld2.8 {d0[1],d1[1]}, [r1]!
+    vld2.8 {d0[2],d1[2]}, [r1]!
+    vld2.8 {d0[3],d1[3]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_2)
+
+/*
+ * 1 channel: four pixels are exactly one 32-bit lane.  Loading the whole D
+ * register would read 8 bytes and advance r1 by 8 while only 4 pixels are
+ * consumed per iteration, so only lane 0 is loaded.
+ */
+SNIP_START(_N_ColorMatrix_load_u8_1)
+    vld1.32 {d0[0]}, [r1]!
+SNIP_END(_N_ColorMatrix_load_u8_1)
+
+/* 4 channels: writes lane i of d0..d3 as the four bytes of pixel i. */
+SNIP_START(_N_ColorMatrix_store_u8_4)
+    vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
+    vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
+    vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
+    vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_4)
+
+/* 2 channels: writes lane i of d0/d1 as the two bytes of pixel i. */
+SNIP_START(_N_ColorMatrix_store_u8_2)
+    vst2.8 {d0[0],d1[0]}, [r0]!
+    vst2.8 {d0[1],d1[1]}, [r0]!
+    vst2.8 {d0[2],d1[2]}, [r0]!
+    vst2.8 {d0[3],d1[3]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_2)
+
+/*
+ * 1 channel: store only the four result bytes in lane 0, matching the
+ * 4-byte load above, instead of 8 bytes from the whole D register.
+ */
+SNIP_START(_N_ColorMatrix_store_u8_1)
+    vst1.32 {d0[0]}, [r0]!
+SNIP_END(_N_ColorMatrix_store_u8_1)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_4)
+ vmovl.u8 q12, d0 /* R: widen the 8-bit lanes of d0 to 16 bits; d24 feeds the multiplies */
+ vmovl.u8 q13, d1 /* G: widened into q13 (d26) */
+ vmovl.u8 q14, d2 /* B: widened into q14 (d28) */
+ vmovl.u8 q15, d3 /* A: widened into q15 (d30) */
+SNIP_END(_N_ColorMatrix_unpack_u8_4)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_3)
+ vmovl.u8 q12, d0 /* R */
+ vmovl.u8 q13, d1 /* G */
+ vmovl.u8 q14, d2 /* B; d3 is left as loaded */
+SNIP_END(_N_ColorMatrix_unpack_u8_3)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_2)
+ vmovl.u8 q12, d0 /* R: first channel */
+ vmovl.u8 q13, d1 /* G: second channel */
+SNIP_END(_N_ColorMatrix_unpack_u8_2)
+
+SNIP_START(_N_ColorMatrix_unpack_u8_1)
+ vmovl.u8 q12, d0 /* R: the only channel */
+SNIP_END(_N_ColorMatrix_unpack_u8_1)
+
+SNIP_START(_N_ColorMatrix_pack_u8_4)
+ vshrn.i32 d24, q8, #8 /* accumulator >> 8 (coefficients are scaled by 256), narrowed to 16 bits */
+ vshrn.i32 d26, q9, #8
+ vshrn.i32 d28, q10, #8
+ vshrn.i32 d30, q11, #8
+ vqmovun.s16 d0, q12 /* saturate signed 16-bit to unsigned 8-bit; result lanes 0-3 of d0 */
+ vqmovun.s16 d1, q13
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+SNIP_END(_N_ColorMatrix_pack_u8_4)
+
+SNIP_START(_N_ColorMatrix_pack_u8_3)
+ vshrn.i32 d24, q8, #8 /* same as pack_u8_4 for the first three outputs */
+ vshrn.i32 d26, q9, #8
+ vshrn.i32 d28, q10, #8
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vqmovun.s16 d2, q14 /* d3 is not written */
+SNIP_END(_N_ColorMatrix_pack_u8_3)
+
+SNIP_START(_N_ColorMatrix_pack_u8_2)
+ vshrn.i32 d24, q8, #8 /* first two outputs only */
+ vshrn.i32 d26, q9, #8
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+SNIP_END(_N_ColorMatrix_pack_u8_2)
+
+SNIP_START(_N_ColorMatrix_pack_u8_1)
+ vshrn.i32 d24, q8, #8 /* first output only; also used by the dot-product path */
+ vqmovun.s16 d0, q12
+SNIP_END(_N_ColorMatrix_pack_u8_1)
+
+SNIP_START(_N_ColorMatrix_dot)
+ vmov.u8 d1, d0 /* copy the single packed result into the second output channel */
+ vmov.u8 d2, d0 /* and into the third; d3 is not touched */
+SNIP_END(_N_ColorMatrix_dot)
+