DWCONV microkernels in WAsm SIMD intrinsics

PiperOrigin-RevId: 316687431
diff --git a/src/f32-igemm/wasmsimd-splat.c.in b/src/f32-igemm/wasmsimd-splat.c.in
index a4edeaf..56891d8 100644
--- a/src/f32-igemm/wasmsimd-splat.c.in
+++ b/src/f32-igemm/wasmsimd-splat.c.in
@@ -113,14 +113,6 @@
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
 
-    const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
-    $for N in range(0, NR, 4):
-      $for M in range(MR):
-        $if X86:
-          vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vacc${M}x${ABC[N:N+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[N:N+4]}, vmax));
-        $else:
-          vacc${M}x${ABC[N:N+4]} = wasm_f32x4_min(vacc${M}x${ABC[N:N+4]}, vmax);
-
     const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
     $for N in range(0, NR, 4):
       $for M in range(MR):
@@ -129,6 +121,14 @@
         $else:
           vacc${M}x${ABC[N:N+4]} = wasm_f32x4_max(vacc${M}x${ABC[N:N+4]}, vmin);
 
+    const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
+    $for N in range(0, NR, 4):
+      $for M in range(MR):
+        $if X86:
+          vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vacc${M}x${ABC[N:N+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[N:N+4]}, vmax));
+        $else:
+          vacc${M}x${ABC[N:N+4]} = wasm_f32x4_min(vacc${M}x${ABC[N:N+4]}, vmax);
+
     if XNN_LIKELY(nc >= ${NR}) {
       $for M in reversed(range(MR)):
         wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]});