Refactor ARM Blur prefill logic.
Refactor the prefill logic for ARM (improving documentation along the way) to
fix some cases where data is read outside of the source image, and to minimise
the remaining cases that must fall back to the C implementation.
Change-Id: I3d06416b40c48dea06258e9f7bb5ddc246d7c710
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index cac10d8..9d51e68 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -297,7 +297,7 @@
uint32_t x2 = xend;
#if defined(ARCH_ARM_USE_INTRINSICS)
- if (gArchUseSIMD) {
+ if (gArchUseSIMD && info->dim.x >= 4) {
rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * info->current.y),
info->dim.x, info->dim.y,
stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
@@ -368,10 +368,15 @@
uint32_t x2 = xend;
#if defined(ARCH_ARM_USE_INTRINSICS)
- if (gArchUseSIMD) {
- rsdIntrinsicBlurU1_K(out, pin + stride * info->current.y, info->dim.x, info->dim.y,
- stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
- return;
+ if (gArchUseSIMD && info->dim.x >= 16) {
+ // The specialisation for r<=8 has an awkward prefill case, which is
+ // fiddly to resolve, where starting close to the right edge can cause
+ // a read beyond the end of input. So avoid that case here.
+ if (cp->mIradius > 8 || (info->dim.x - rsMax(0, (int32_t)x1 - 8)) >= 16) {
+ rsdIntrinsicBlurU1_K(out, pin + stride * info->current.y, info->dim.x, info->dim.y,
+ stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
+ return;
+ }
}
#endif