Merge "Fix uchar blur performance."
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 1229f79..cdeac0b 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -56,6 +56,9 @@
void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
+ memset(fp, 0, sizeof(fp));
+ memset(ip, 0, sizeof(ip));
+
// Compute gaussian weights for the blur
// e is the euler's number
float e = 2.718281828459045f;
@@ -144,6 +147,7 @@
extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
+extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
static void OneVFU4(float4 *out,
const uchar *ptrIn, int iStride, const float* gPtr, int ct,
@@ -174,22 +178,40 @@
out->xyzw = blurredPixel;
x1++;
out++;
+ ptrIn++;
}
}
static void OneVFU1(float *out,
const uchar *ptrIn, int iStride, const float* gPtr, int ct, int len) {
+ while(len && (((int)ptrIn) & 0x3)) {
+ const uchar *pi = ptrIn;
+ float blurredPixel = 0;
+ const float* gp = gPtr;
+
+ for (int r = 0; r < ct; r++) {
+ float pf = (float)pi[0];
+ blurredPixel += pf * gp[0];
+ pi += iStride;
+ gp++;
+ }
+ out[0] = blurredPixel;
+ len--;
+ out++;
+ ptrIn++;
+ }
+
#if defined(ARCH_ARM_HAVE_NEON)
{
int t = len >> 2;
t &= ~1;
if(t) {
- rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, len);
+ rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t << 2);
+ len -= t << 2;
+ ptrIn += t << 2;
+ out += t << 2;
}
- len -= t << 2;
- ptrIn += t << 2;
- out += t << 2;
}
#endif
@@ -207,6 +229,7 @@
out[0] = blurredPixel;
len--;
out++;
+ ptrIn++;
}
}
@@ -244,6 +267,7 @@
void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
uint32_t xstart, uint32_t xend,
uint32_t instep, uint32_t outstep) {
+ // fixme
float buf[4 * 2048];
RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
if (!cp->alloc.get()) {
@@ -321,16 +345,20 @@
}
x1 = xstart;
- while ((x1 < (uint32_t)cp->iradius) && (x1 < x2)) {
+ while ((x1 < x2) &&
+ ((x1 < (uint32_t)cp->iradius) || (((int)out) & 0x3))) {
OneHU1(p, out, x1, buf, cp->fp, cp->iradius);
out++;
x1++;
}
-#if 0//defined(ARCH_ARM_HAVE_NEON)
+#if defined(ARCH_ARM_HAVE_NEON)
if ((x1 + cp->iradius) < x2) {
- rsdIntrinsicBlurHFU4_K(out, ((float4 *)buf) - cp->iradius, cp->fp, cp->iradius * 2 + 1, x1, 0, x2 - cp->iradius);
- out += (x2 - cp->iradius) - x1;
- x1 = x2 - cp->iradius;
+ uint32_t len = x2 - (x1 + cp->iradius);
+ len >>= 2;
+ rsdIntrinsicBlurHFU1_K(out, ((float4 *)buf) - cp->iradius, cp->fp,
+ cp->iradius * 2 + 1, x1, x1+len);
+ out += len << 2;
+ x1 += len << 2;
}
#endif
while(x2 > x1) {
diff --git a/cpu_ref/rsCpuIntrinsics_neon.S b/cpu_ref/rsCpuIntrinsics_neon.S
index 53c116d..8e883d0 100644
--- a/cpu_ref/rsCpuIntrinsics_neon.S
+++ b/cpu_ref/rsCpuIntrinsics_neon.S
@@ -377,6 +377,42 @@
bx lr
END(rsdIntrinsicBlurHFU4_K)
+ENTRY(rsdIntrinsicBlurHFU1_K) /* Weighted sum of float inputs, 4 lanes per outer iteration, stored as 4 packed bytes. Per the C prototype: r0 = dst, r1 = pin, r2 = gptr, r3 = rct; x1 and ct arrive on the stack. */
+ push {r4-r8, r10, r11, lr} /* 8 core registers = 32 bytes; the body below only uses r4, r5, r7, r10, r11 */
+ vpush {q4-q7} /* 4 quad registers = 64 bytes; the body below only uses q0, q1 and d4 */
+
+ ldr r4, [sp, #32+64] /* r4 = first stack argument (x1); 32 + 64 skips the registers saved above */
+ ldr r5, [sp, #32+64 + 4] /* r5 = second stack argument (ct), the value r4 is compared against to end the outer loop */
+
+1: /* outer loop: one group of 4 output bytes per pass */
+ add r7, r1, r4, lsl #4 /* r7 = pin + r4 * 16 bytes. NOTE(review): the previous comment here read "ptrIn + x1 * 4", which does not match lsl #4 - confirm the intended stride against the caller */
+ mov r10, r2 /* r10 = running pointer into the weight table (gptr) */
+ mov r11, r3 /* r11 = remaining tap count (rct) */
+
+ veor q0, q0 /* clear the 4-lane float accumulator */
+
+2: /* inner loop: one weight (tap) per pass */
+ vld1.32 {q1}, [r7] /* load 4 consecutive 32-bit inputs starting at r7 */
+ add r7, r7, #4 /* slide the 4-wide input window forward by one 32-bit element */
+ vld1.32 {d4[0]}, [r10]! /* load one 32-bit weight into d4[0] and post-increment r10 */
+ vmla.f32 q0, q1, d4[0] /* q0 += q1 * weight (all 4 lanes scaled by the same weight) */
+ subs r11, r11, #1
+ bne 2b /* repeat until all rct taps are consumed; assumes rct >= 1 */
+
+ vcvt.s32.f32 q0, q0 /* float -> signed 32-bit integer in each lane */
+ vmovn.u32 d0, q0 /* narrow 4 x 32-bit to 4 x 16-bit in d0 (truncating, no saturation) */
+ vmovn.u16 d0, q0 /* narrow 16-bit to 8-bit; the low 4 bytes of d0 are the 4 results, the upper 4 come from stale d1 */
+
+ vst1.32 {d0[0]}, [r0]! /* store the 4 result bytes and advance dst by 4 */
+ add r4, r4, #1
+ cmp r4, r5
+ bne 1b /* exit test runs after the body and uses !=, so the loop runs at least once. NOTE(review): entry with x1 == ct would not terminate as intended - confirm the caller guards this */
+
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+END(rsdIntrinsicBlurHFU1_K)
+
/*
r0 = dst
r1 = Y