Optimize FP32 requantization in WAsm SIMD QS8/QC8/QU8 GEMM/IGEMM/DWCONV

PiperOrigin-RevId: 414178544
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
index b8b8559..09c73a8 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -105,15 +105,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -121,6 +118,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
index e01a418..9b8639e 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -103,15 +103,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -119,6 +116,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 6e4ad26..25f5ada 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -80,15 +80,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -96,6 +93,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 2f8fb3e..63bdbf1 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -80,15 +80,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -96,6 +93,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
index 87434c3..eb139fd 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
@@ -92,15 +92,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -108,6 +105,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
index fdeb345..e4d1865 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
@@ -88,15 +88,12 @@
     w = (const void*) ((const float*) w + 4);
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
 
@@ -104,6 +101,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
index dccfdc3..1f72985 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -132,18 +132,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -152,6 +148,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
index fec9666..da4a3b0 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -130,18 +130,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -150,6 +146,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 65fd553..a1d0731 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -101,18 +101,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -121,6 +117,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index fc3fdc8..e95f115 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -101,18 +101,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -121,6 +117,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
index bb5457c..d9b2b90 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
@@ -121,18 +121,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -141,6 +137,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
index 4fec0f9..024a8da 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
@@ -117,18 +117,14 @@
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -137,6 +133,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
index f5b064d..6848319 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -159,21 +159,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -184,6 +179,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 1fbeb7c..89a7179 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -157,21 +157,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -182,6 +177,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 9255131..300be44 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -122,21 +122,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -147,6 +142,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 9ec32b1..182e2f7 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -122,21 +122,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -147,6 +142,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
index 8ab3e1b..8019253 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c
@@ -150,21 +150,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -175,6 +170,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
index d8e5fa6..c6823f0 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c
@@ -146,21 +146,16 @@
     vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -171,6 +166,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
index dab7127..3a59537 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -186,24 +186,18 @@
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
     vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc3x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc3x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
     vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -215,6 +209,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 439f78c..1ff47ad 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -184,24 +184,18 @@
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
     vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc3x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc3x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
     vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -213,6 +207,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 06a6c74..cd1a279 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -143,24 +143,18 @@
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
     vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc3x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc3x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
     vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -172,6 +166,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 622b192..042399c 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -143,24 +143,18 @@
     vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
     vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123);
 
-    const v128_t voutput_min_less_zero_point = wasm_v128_load(params->wasmsimd.output_min_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmax(voutput_min_less_zero_point, vacc3x0123);
-
-    const v128_t voutput_max_less_zero_point = wasm_v128_load(params->wasmsimd.output_max_less_zero_point);
-    vacc0x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc0x0123);
-    vacc1x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc1x0123);
-    vacc2x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc2x0123);
-    vacc3x0123 = wasm_f32x4_pmin(voutput_max_less_zero_point, vacc3x0123);
-
     const v128_t vmagic_bias = wasm_v128_load(params->wasmsimd.magic_bias);
     vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
     vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
     vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
     vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias);
 
+    const v128_t vmagic_min = wasm_v128_load(params->wasmsimd.magic_min);
+    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
+    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
+    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
+    vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min);
+
     const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load(params->wasmsimd.magic_bias_less_output_zero_point);
     vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
     vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
@@ -172,6 +166,9 @@
 
     v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc23x0123);
 
+    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+    vout = wasm_i8x16_min(vout, voutput_max);
+
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
       *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);