DWCONV microkernels in WAsm SIMD intrinsics
PiperOrigin-RevId: 316687431
diff --git a/src/f32-igemm/wasmsimd-splat.c.in b/src/f32-igemm/wasmsimd-splat.c.in
index a4edeaf..56891d8 100644
--- a/src/f32-igemm/wasmsimd-splat.c.in
+++ b/src/f32-igemm/wasmsimd-splat.c.in
@@ -113,14 +113,6 @@
p -= ${MR} * sizeof(void*);
} while (p != 0);
- const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
- $for N in range(0, NR, 4):
- $for M in range(MR):
- $if X86:
- vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vacc${M}x${ABC[N:N+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[N:N+4]}, vmax));
- $else:
- vacc${M}x${ABC[N:N+4]} = wasm_f32x4_min(vacc${M}x${ABC[N:N+4]}, vmax);
-
const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
$for N in range(0, NR, 4):
$for M in range(MR):
@@ -129,6 +121,14 @@
$else:
vacc${M}x${ABC[N:N+4]} = wasm_f32x4_max(vacc${M}x${ABC[N:N+4]}, vmin);
+ const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+ $for N in range(0, NR, 4):
+ $for M in range(MR):
+ $if X86:
+ vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vacc${M}x${ABC[N:N+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[N:N+4]}, vmax));
+ $else:
+ vacc${M}x${ABC[N:N+4]} = wasm_f32x4_min(vacc${M}x${ABC[N:N+4]}, vmax);
+
if XNN_LIKELY(nc >= ${NR}) {
$for M in reversed(range(MR)):
wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]});