Refactor initialization of even/odd masks in parameters for SpCHW micro-kernels
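Replace the large switch statements on width % 4 and width % 8 with comparison-based arithmetic on (width - 1) & 3 and (width - 1) & 7 for the mask, mask_even and mask_odd arrays. Also move the min/max fields ahead of the mask arrays in the NEON and SSE parameter structs.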
PiperOrigin-RevId: 276163876
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 4df6b72..8828f3a 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -40,20 +40,20 @@
} scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
struct {
+ float min;
+ float max;
XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels
XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
- float min;
- float max;
} neon;
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
struct {
+ XNN_ALIGN(16) float max[4];
+ XNN_ALIGN(16) float min[4];
XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels
XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels
XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels
- XNN_ALIGN(16) float max[4];
- XNN_ALIGN(16) float min[4];
} sse;
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
};
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index e22ac53..72648ac 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -429,234 +429,50 @@
float output_max)
{
union xnn_f32_spchw_params params;
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- switch (width % 4) {
- case 0:
- params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[1] = 0;
- params.sse.mask[2] = 0;
- params.sse.mask[3] = 0;
- break;
- case 2:
- params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[2] = 0;
- params.sse.mask[3] = 0;
- break;
- case 3:
- params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask[3] = 0;
- break;
- }
- switch (width % 8) {
- case 0:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = 0;
- params.sse.mask_even[2] = 0;
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = 0;
- params.sse.mask_odd[1] = 0;
- params.sse.mask_odd[2] = 0;
- params.sse.mask_odd[3] = 0;
- break;
- case 2:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = 0;
- params.sse.mask_even[2] = 0;
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = 0;
- params.sse.mask_odd[2] = 0;
- params.sse.mask_odd[3] = 0;
- break;
- case 3:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = 0;
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = 0;
- params.sse.mask_odd[2] = 0;
- params.sse.mask_odd[3] = 0;
- break;
- case 4:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = 0;
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[2] = 0;
- params.sse.mask_odd[3] = 0;
- break;
- case 5:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[2] = 0;
- params.sse.mask_odd[3] = 0;
- break;
- case 6:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[3] = 0;
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[3] = 0;
- break;
- case 7:
- params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.sse.mask_odd[3] = 0;
- break;
- }
- for (uint32_t i = 0; i < 4; i++) {
- params.sse.max[i] = output_max;
- params.sse.min[i] = output_min;
- }
-#elif XNN_ARCH_ARM || XNN_ARCH_ARM64
- switch (width % 4) {
- case 0:
- params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[1] = 0;
- params.neon.mask[2] = 0;
- params.neon.mask[3] = 0;
- break;
- case 2:
- params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[2] = 0;
- params.neon.mask[3] = 0;
- break;
- case 3:
- params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask[3] = 0;
- break;
- }
- switch (width % 8) {
- case 0:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = 0;
- params.neon.mask_even[2] = 0;
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = 0;
- params.neon.mask_odd[1] = 0;
- params.neon.mask_odd[2] = 0;
- params.neon.mask_odd[3] = 0;
- break;
- case 2:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = 0;
- params.neon.mask_even[2] = 0;
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = 0;
- params.neon.mask_odd[2] = 0;
- params.neon.mask_odd[3] = 0;
- break;
- case 3:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = 0;
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = 0;
- params.neon.mask_odd[2] = 0;
- params.neon.mask_odd[3] = 0;
- break;
- case 4:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = 0;
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[2] = 0;
- params.neon.mask_odd[3] = 0;
- break;
- case 5:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[2] = 0;
- params.neon.mask_odd[3] = 0;
- break;
- case 6:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[3] = 0;
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[3] = 0;
- break;
- case 7:
- params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params.neon.mask_odd[3] = 0;
- break;
- }
- params.neon.max = output_max;
- params.neon.min = output_min;
-#else
- params.scalar.max = output_max;
- params.scalar.min = output_min;
-#endif
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse.max[i] = output_max;
+ params.sse.min[i] = output_min;
+ }
+
+ const uint32_t w4 = (width - 1) & 3;
+ params.sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask[1] = -(uint32_t) (w4 >= 1);
+ params.sse.mask[2] = -(uint32_t) (w4 >= 2);
+ params.sse.mask[3] = -(uint32_t) (w4 >= 3);
+
+ const uint32_t w8 = (width - 1) & 7;
+ params.sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.sse.mask_even[1] = -(uint32_t) (w8 >= 2);
+ params.sse.mask_even[2] = -(uint32_t) (w8 >= 4);
+ params.sse.mask_even[3] = -(uint32_t) (w8 >= 6);
+ params.sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
+ params.sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
+ params.sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
+ params.sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
+ #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
+ params.neon.max = output_max;
+ params.neon.min = output_min;
+
+ const uint32_t w4 = (width - 1) & 3;
+ params.neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask[1] = -(uint32_t) (w4 >= 1);
+ params.neon.mask[2] = -(uint32_t) (w4 >= 2);
+ params.neon.mask[3] = -(uint32_t) (w4 >= 3);
+
+ const uint32_t w8 = (width - 1) & 7;
+ params.neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params.neon.mask_even[1] = -(uint32_t) (w8 >= 2);
+ params.neon.mask_even[2] = -(uint32_t) (w8 >= 4);
+ params.neon.mask_even[3] = -(uint32_t) (w8 >= 6);
+ params.neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
+ params.neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
+ params.neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
+ params.neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
+ #else
+ params.scalar.max = output_max;
+ params.scalar.min = output_min;
+ #endif
return params;
}
@@ -665,223 +481,37 @@
uint32_t width)
{
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- switch (width % 4) {
- case 0:
- params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[1] = 0;
- params->sse.mask[2] = 0;
- params->sse.mask[3] = 0;
- break;
- case 2:
- params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[2] = 0;
- params->sse.mask[3] = 0;
- break;
- case 3:
- params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask[3] = 0;
- break;
- }
- switch (width % 8) {
- case 0:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = 0;
- params->sse.mask_even[2] = 0;
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = 0;
- params->sse.mask_odd[1] = 0;
- params->sse.mask_odd[2] = 0;
- params->sse.mask_odd[3] = 0;
- break;
- case 2:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = 0;
- params->sse.mask_even[2] = 0;
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = 0;
- params->sse.mask_odd[2] = 0;
- params->sse.mask_odd[3] = 0;
- break;
- case 3:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = 0;
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = 0;
- params->sse.mask_odd[2] = 0;
- params->sse.mask_odd[3] = 0;
- break;
- case 4:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = 0;
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[2] = 0;
- params->sse.mask_odd[3] = 0;
- break;
- case 5:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[2] = 0;
- params->sse.mask_odd[3] = 0;
- break;
- case 6:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[3] = 0;
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[3] = 0;
- break;
- case 7:
- params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->sse.mask_odd[3] = 0;
- break;
- }
+ const uint32_t w4 = (width - 1) & 3;
+ params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask[1] = -(uint32_t) (w4 >= 1);
+ params->sse.mask[2] = -(uint32_t) (w4 >= 2);
+ params->sse.mask[3] = -(uint32_t) (w4 >= 3);
+
+ const uint32_t w8 = (width - 1) & 7;
+ params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->sse.mask_even[1] = -(uint32_t) (w8 >= 2);
+ params->sse.mask_even[2] = -(uint32_t) (w8 >= 4);
+ params->sse.mask_even[3] = -(uint32_t) (w8 >= 6);
+ params->sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
+ params->sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
+ params->sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
+ params->sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
#elif XNN_ARCH_ARM || XNN_ARCH_ARM64
- switch (width % 4) {
- case 0:
- params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[1] = 0;
- params->neon.mask[2] = 0;
- params->neon.mask[3] = 0;
- break;
- case 2:
- params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[2] = 0;
- params->neon.mask[3] = 0;
- break;
- case 3:
- params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask[3] = 0;
- break;
- }
- switch (width % 8) {
- case 0:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[3] = UINT32_C(0xFFFFFFFF);
- break;
- case 1:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = 0;
- params->neon.mask_even[2] = 0;
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = 0;
- params->neon.mask_odd[1] = 0;
- params->neon.mask_odd[2] = 0;
- params->neon.mask_odd[3] = 0;
- break;
- case 2:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = 0;
- params->neon.mask_even[2] = 0;
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = 0;
- params->neon.mask_odd[2] = 0;
- params->neon.mask_odd[3] = 0;
- break;
- case 3:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = 0;
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = 0;
- params->neon.mask_odd[2] = 0;
- params->neon.mask_odd[3] = 0;
- break;
- case 4:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = 0;
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[2] = 0;
- params->neon.mask_odd[3] = 0;
- break;
- case 5:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[2] = 0;
- params->neon.mask_odd[3] = 0;
- break;
- case 6:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[3] = 0;
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[3] = 0;
- break;
- case 7:
- params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_even[3] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[0] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[1] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[2] = UINT32_C(0xFFFFFFFF);
- params->neon.mask_odd[3] = 0;
- break;
- }
+ const uint32_t w4 = (width - 1) & 3;
+ params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask[1] = -(uint32_t) (w4 >= 1);
+ params->neon.mask[2] = -(uint32_t) (w4 >= 2);
+ params->neon.mask[3] = -(uint32_t) (w4 >= 3);
+
+ const uint32_t w8 = (width - 1) & 7;
+ params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
+ params->neon.mask_even[1] = -(uint32_t) (w8 >= 2);
+ params->neon.mask_even[2] = -(uint32_t) (w8 >= 4);
+ params->neon.mask_even[3] = -(uint32_t) (w8 >= 6);
+ params->neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
+ params->neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
+ params->neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
+ params->neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
#endif
}