Leverage f32x4.pmin and f32x4.pmax WAsm SIMD instructions

Warning: this change makes XNNPACK binaries for WebAssembly SIMD incompatible with Chrome versions earlier than 87
PiperOrigin-RevId: 397776648
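
For context, a minimal sketch (not part of the patch) of the clamp rewrite applied throughout the generated kernels below; the helper names clamp_bitselect and clamp_pminmax are illustrative only:

    #include <wasm_simd128.h>

    // Old form: clamp the accumulator with an explicit compare plus bitselect
    // (two instructions per bound).
    static inline v128_t clamp_bitselect(v128_t vacc, v128_t vmin, v128_t vmax) {
      v128_t vout = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
      return wasm_v128_bitselect(vmin, vout, wasm_f32x4_lt(vout, vmin));
    }

    // New form: f32x4.pmin(a, b) is lane-wise b < a ? b : a and
    // f32x4.pmax(a, b) is lane-wise a < b ? b : a, so passing the bound as the
    // first operand returns the accumulator unless it exceeds the bound.
    // Each clamp becomes a single instruction, and the operand order maps
    // directly onto MINPS/MAXPS on x86.
    static inline v128_t clamp_pminmax(v128_t vacc, v128_t vmin, v128_t vmax) {
      v128_t vout = wasm_f32x4_pmin(vmax, vacc);
      return wasm_f32x4_pmax(vmin, vout);
    }
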
diff --git a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined-x2.c b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined-x2.c
index dacb452..6b84621 100644
--- a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined-x2.c
+++ b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined-x2.c
@@ -92,14 +92,14 @@
           viCDEF = wasm_v128_load(input + 12);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -132,10 +132,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -162,8 +162,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -189,8 +189,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -216,8 +216,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined.c b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined.c
index dab595b..68b88ed 100644
--- a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined.c
+++ b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-pipelined.c
@@ -68,14 +68,14 @@
           viCDEF = wasm_v128_load(input + 12);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -108,10 +108,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -138,8 +138,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -165,8 +165,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -192,8 +192,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x2.c b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x2.c
index c137a59..824d48a 100644
--- a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x2.c
+++ b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x2.c
@@ -98,14 +98,14 @@
           vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -138,10 +138,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -168,8 +168,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -195,8 +195,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -222,8 +222,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x4.c b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x4.c
index d1a0be6..679e725 100644
--- a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86-x4.c
@@ -138,14 +138,14 @@
           vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -178,10 +178,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -208,8 +208,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -235,8 +235,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -262,8 +262,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86.c b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86.c
index 9bc0957..839dc62 100644
--- a/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86.c
+++ b/src/f32-spmm/gen/16x1-minmax-wasmsimd-x86.c
@@ -58,14 +58,14 @@
           vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -98,10 +98,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -128,8 +128,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -155,8 +155,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -182,8 +182,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined-x2.c b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined-x2.c
index 7f0a107..0b1a3d6 100644
--- a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined-x2.c
+++ b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined-x2.c
@@ -124,22 +124,22 @@
           viSTUV = wasm_v128_load(input + 28);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      v128_t voutGHIJ = wasm_v128_bitselect(vaccGHIJ, vmax, wasm_f32x4_le(vaccGHIJ, vmax));
-      v128_t voutKLMN = wasm_v128_bitselect(vaccKLMN, vmax, wasm_f32x4_le(vaccKLMN, vmax));
-      v128_t voutOPQR = wasm_v128_bitselect(vaccOPQR, vmax, wasm_f32x4_le(vaccOPQR, vmax));
-      v128_t voutSTUV = wasm_v128_bitselect(vaccSTUV, vmax, wasm_f32x4_le(vaccSTUV, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
-      voutGHIJ = wasm_v128_bitselect(vmin, voutGHIJ, wasm_f32x4_lt(voutGHIJ, vmin));
-      voutKLMN = wasm_v128_bitselect(vmin, voutKLMN, wasm_f32x4_lt(voutKLMN, vmin));
-      voutOPQR = wasm_v128_bitselect(vmin, voutOPQR, wasm_f32x4_lt(voutOPQR, vmin));
-      voutSTUV = wasm_v128_bitselect(vmin, voutSTUV, wasm_f32x4_lt(voutSTUV, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
+      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
+      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
+      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
+      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
+      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
+      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
+      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -182,14 +182,14 @@
             vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-        v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-        vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-        voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -221,10 +221,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -251,8 +251,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -278,8 +278,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -305,8 +305,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined.c b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined.c
index 9762ff3..5cba2bb 100644
--- a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined.c
+++ b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-pipelined.c
@@ -84,22 +84,22 @@
           viSTUV = wasm_v128_load(input + 28);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      v128_t voutGHIJ = wasm_v128_bitselect(vaccGHIJ, vmax, wasm_f32x4_le(vaccGHIJ, vmax));
-      v128_t voutKLMN = wasm_v128_bitselect(vaccKLMN, vmax, wasm_f32x4_le(vaccKLMN, vmax));
-      v128_t voutOPQR = wasm_v128_bitselect(vaccOPQR, vmax, wasm_f32x4_le(vaccOPQR, vmax));
-      v128_t voutSTUV = wasm_v128_bitselect(vaccSTUV, vmax, wasm_f32x4_le(vaccSTUV, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
-      voutGHIJ = wasm_v128_bitselect(vmin, voutGHIJ, wasm_f32x4_lt(voutGHIJ, vmin));
-      voutKLMN = wasm_v128_bitselect(vmin, voutKLMN, wasm_f32x4_lt(voutKLMN, vmin));
-      voutOPQR = wasm_v128_bitselect(vmin, voutOPQR, wasm_f32x4_lt(voutOPQR, vmin));
-      voutSTUV = wasm_v128_bitselect(vmin, voutSTUV, wasm_f32x4_lt(voutSTUV, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
+      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
+      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
+      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
+      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
+      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
+      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
+      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -142,14 +142,14 @@
             vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-        v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-        vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-        voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -181,10 +181,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -211,8 +211,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -238,8 +238,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -265,8 +265,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x2.c b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x2.c
index af6e29e..48b1f76 100644
--- a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x2.c
+++ b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x2.c
@@ -138,22 +138,22 @@
           vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      v128_t voutGHIJ = wasm_v128_bitselect(vaccGHIJ, vmax, wasm_f32x4_le(vaccGHIJ, vmax));
-      v128_t voutKLMN = wasm_v128_bitselect(vaccKLMN, vmax, wasm_f32x4_le(vaccKLMN, vmax));
-      v128_t voutOPQR = wasm_v128_bitselect(vaccOPQR, vmax, wasm_f32x4_le(vaccOPQR, vmax));
-      v128_t voutSTUV = wasm_v128_bitselect(vaccSTUV, vmax, wasm_f32x4_le(vaccSTUV, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
-      voutGHIJ = wasm_v128_bitselect(vmin, voutGHIJ, wasm_f32x4_lt(voutGHIJ, vmin));
-      voutKLMN = wasm_v128_bitselect(vmin, voutKLMN, wasm_f32x4_lt(voutKLMN, vmin));
-      voutOPQR = wasm_v128_bitselect(vmin, voutOPQR, wasm_f32x4_lt(voutOPQR, vmin));
-      voutSTUV = wasm_v128_bitselect(vmin, voutSTUV, wasm_f32x4_lt(voutSTUV, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
+      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
+      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
+      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
+      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
+      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
+      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
+      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -196,14 +196,14 @@
             vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-        v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-        vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-        voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -235,10 +235,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -265,8 +265,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -292,8 +292,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -319,8 +319,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x4.c b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x4.c
index 0c265b5..13b92f1 100644
--- a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86-x4.c
@@ -210,22 +210,22 @@
           vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      v128_t voutGHIJ = wasm_v128_bitselect(vaccGHIJ, vmax, wasm_f32x4_le(vaccGHIJ, vmax));
-      v128_t voutKLMN = wasm_v128_bitselect(vaccKLMN, vmax, wasm_f32x4_le(vaccKLMN, vmax));
-      v128_t voutOPQR = wasm_v128_bitselect(vaccOPQR, vmax, wasm_f32x4_le(vaccOPQR, vmax));
-      v128_t voutSTUV = wasm_v128_bitselect(vaccSTUV, vmax, wasm_f32x4_le(vaccSTUV, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
-      voutGHIJ = wasm_v128_bitselect(vmin, voutGHIJ, wasm_f32x4_lt(voutGHIJ, vmin));
-      voutKLMN = wasm_v128_bitselect(vmin, voutKLMN, wasm_f32x4_lt(voutKLMN, vmin));
-      voutOPQR = wasm_v128_bitselect(vmin, voutOPQR, wasm_f32x4_lt(voutOPQR, vmin));
-      voutSTUV = wasm_v128_bitselect(vmin, voutSTUV, wasm_f32x4_lt(voutSTUV, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
+      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
+      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
+      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
+      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
+      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
+      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
+      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -268,14 +268,14 @@
             vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-        v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-        vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-        voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -307,10 +307,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -337,8 +337,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -364,8 +364,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -391,8 +391,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86.c b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86.c
index acdeee4..59c4d95 100644
--- a/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86.c
+++ b/src/f32-spmm/gen/32x1-minmax-wasmsimd-x86.c
@@ -70,22 +70,22 @@
           vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-      v128_t voutGHIJ = wasm_v128_bitselect(vaccGHIJ, vmax, wasm_f32x4_le(vaccGHIJ, vmax));
-      v128_t voutKLMN = wasm_v128_bitselect(vaccKLMN, vmax, wasm_f32x4_le(vaccKLMN, vmax));
-      v128_t voutOPQR = wasm_v128_bitselect(vaccOPQR, vmax, wasm_f32x4_le(vaccOPQR, vmax));
-      v128_t voutSTUV = wasm_v128_bitselect(vaccSTUV, vmax, wasm_f32x4_le(vaccSTUV, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
-      voutGHIJ = wasm_v128_bitselect(vmin, voutGHIJ, wasm_f32x4_lt(voutGHIJ, vmin));
-      voutKLMN = wasm_v128_bitselect(vmin, voutKLMN, wasm_f32x4_lt(voutKLMN, vmin));
-      voutOPQR = wasm_v128_bitselect(vmin, voutOPQR, wasm_f32x4_lt(voutOPQR, vmin));
-      voutSTUV = wasm_v128_bitselect(vmin, voutSTUV, wasm_f32x4_lt(voutSTUV, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
+      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
+      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
+      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
+      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
+      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
+      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
+      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       wasm_v128_store(output + 8, vout89AB);
@@ -128,14 +128,14 @@
             vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
-        v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
-        vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
-        voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
+        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
+        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
+        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -167,10 +167,10 @@
             vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
         wasm_v128_store(output, vout0123);
 
         wasm_v128_store(output + 4, vout4567);
@@ -197,8 +197,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -224,8 +224,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -251,8 +251,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined-x2.c b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined-x2.c
index f3a6ffe..fcb826e 100644
--- a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined-x2.c
+++ b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined-x2.c
@@ -68,8 +68,8 @@
           vi0123 = wasm_v128_load(input + 0);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
       wasm_v128_store(output, vout0123);
       output = (float*restrict) ((uintptr_t) output + output_stride);
     } while (--n != 0);
@@ -96,8 +96,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -123,8 +123,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined.c b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined.c
index 709af30..4a131ad 100644
--- a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined.c
+++ b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-pipelined.c
@@ -56,8 +56,8 @@
           vi0123 = wasm_v128_load(input + 0);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
       wasm_v128_store(output, vout0123);
       output = (float*restrict) ((uintptr_t) output + output_stride);
     } while (--n != 0);
@@ -84,8 +84,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -111,8 +111,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x2.c b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x2.c
index 7b25d36..ec43dc4 100644
--- a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x2.c
+++ b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x2.c
@@ -68,8 +68,8 @@
           vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
       wasm_v128_store(output, vout0123);
       output = (float*restrict) ((uintptr_t) output + output_stride);
     } while (--n != 0);
@@ -96,8 +96,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -123,8 +123,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x4.c b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x4.c
index 14a2917..aaff9e9 100644
--- a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86-x4.c
@@ -84,8 +84,8 @@
           vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
       wasm_v128_store(output, vout0123);
       output = (float*restrict) ((uintptr_t) output + output_stride);
     } while (--n != 0);
@@ -112,8 +112,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -139,8 +139,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86.c b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86.c
index a2f8442..5f59238 100644
--- a/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86.c
+++ b/src/f32-spmm/gen/4x1-minmax-wasmsimd-x86.c
@@ -49,8 +49,8 @@
           vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
       wasm_v128_store(output, vout0123);
       output = (float*restrict) ((uintptr_t) output + output_stride);
     } while (--n != 0);
@@ -77,8 +77,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -104,8 +104,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined-x2.c b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined-x2.c
index 7f6b2a7..184ff2c 100644
--- a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined-x2.c
+++ b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined-x2.c
@@ -76,10 +76,10 @@
           vi4567 = wasm_v128_load(input + 4);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -107,8 +107,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -134,8 +134,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -161,8 +161,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined.c b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined.c
index 1799671..58fcfad 100644
--- a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined.c
+++ b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-pipelined.c
@@ -60,10 +60,10 @@
           vi4567 = wasm_v128_load(input + 4);
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -91,8 +91,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -118,8 +118,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -145,8 +145,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x2.c b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x2.c
index 135bb0b..af43803 100644
--- a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x2.c
+++ b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x2.c
@@ -78,10 +78,10 @@
           vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -109,8 +109,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -136,8 +136,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -163,8 +163,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x4.c b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x4.c
index 526641e..fb50997 100644
--- a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86-x4.c
@@ -102,10 +102,10 @@
           vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -133,8 +133,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -160,8 +160,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -187,8 +187,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86.c b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86.c
index cd130c0..10bbe69 100644
--- a/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86.c
+++ b/src/f32-spmm/gen/8x1-minmax-wasmsimd-x86.c
@@ -52,10 +52,10 @@
           vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
         } while (--nnz != 0);
       }
-      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
-      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
-      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
+      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
+      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
+      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
       wasm_v128_store(output, vout0123);
       wasm_v128_store(output + 4, vout4567);
       output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -83,8 +83,8 @@
             vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
-        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
+        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
+        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
         wasm_v128_store(output, vout0123);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -110,8 +110,8 @@
             vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
           } while (--nnz != 0);
         }
-        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
-        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
+        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
+        vout01 = wasm_f32x4_pmax(vmin, vout01);
         *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
@@ -137,8 +137,8 @@
             vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
           } while (--nnz != 0);
         }
-        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
-        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
+        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
+        vout0 = wasm_f32x4_pmax(vmin, vout0);
         *output = wasm_f32x4_extract_lane(vout0, 0);
 
         output = (float*restrict) ((uintptr_t) output + output_stride);
diff --git a/src/f32-spmm/wasmsimd-pipelined.c.in b/src/f32-spmm/wasmsimd-pipelined.c.in
index 139e628..dfc2bb6 100644
--- a/src/f32-spmm/wasmsimd-pipelined.c.in
+++ b/src/f32-spmm/wasmsimd-pipelined.c.in
@@ -72,9 +72,9 @@
       }
       $if X86:
         $for M in range(0, MR, 4):
-          v128_t vout${ABC[M:M+4]} = wasm_v128_bitselect(vacc${ABC[M:M+4]}, vmax, wasm_f32x4_le(vacc${ABC[M:M+4]}, vmax));
+          v128_t vout${ABC[M:M+4]} = wasm_f32x4_pmin(vmax, vacc${ABC[M:M+4]});
         $for M in range(0, MR, 4):
-          vout${ABC[M:M+4]} = wasm_v128_bitselect(vmin, vout${ABC[M:M+4]}, wasm_f32x4_lt(vout${ABC[M:M+4]}, vmin));
+          vout${ABC[M:M+4]} = wasm_f32x4_pmax(vmin, vout${ABC[M:M+4]});
       $else:
         $for M in range(0, MR, 4):
           v128_t vout${ABC[M:M+4]} = wasm_f32x4_min(vacc${ABC[M:M+4]}, vmax);
@@ -133,17 +133,17 @@
           }
           $if SUBMR == 1:
             $if X86:
-              v128_t vout${ABC[0]} = wasm_v128_bitselect(vacc${ABC[0]}, vmax, wasm_f32x4_le(vacc${ABC[0]}, vmax));
-              vout${ABC[0]} = wasm_v128_bitselect(vmin, vout${ABC[0]}, wasm_f32x4_lt(vout${ABC[0]}, vmin));
+              v128_t vout${ABC[0]} = wasm_f32x4_pmin(vmax, vacc${ABC[0]});
+              vout${ABC[0]} = wasm_f32x4_pmax(vmin, vout${ABC[0]});
             $else:
               v128_t vout${ABC[0]} = wasm_f32x4_min(vacc${ABC[0]}, vmax);
               vout${ABC[0]} = wasm_f32x4_max(vout${ABC[0]}, vmin);
           $else:
             $if X86:
               $for M in range(0, SUBMR, 4):
-                v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_v128_bitselect(vacc${ABC[M:min(M+4,SUBMR)]}, vmax, wasm_f32x4_le(vacc${ABC[M:min(M+4,SUBMR)]}, vmax));
+                v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_pmin(vmax, vacc${ABC[M:min(M+4,SUBMR)]});
               $for M in range(0, SUBMR, 4):
-                vout${ABC[M:min(M+4,SUBMR)]} = wasm_v128_bitselect(vmin, vout${ABC[M:min(M+4,SUBMR)]}, wasm_f32x4_lt(vout${ABC[M:min(M+4,SUBMR)]}, vmin));
+                vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_pmax(vmin, vout${ABC[M:min(M+4,SUBMR)]});
             $else:
               $for M in range(0, SUBMR, 4):
                 v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_min(vacc${ABC[M:min(M+4,SUBMR)]}, vmax);
diff --git a/src/f32-spmm/wasmsimd.c.in b/src/f32-spmm/wasmsimd.c.in
index 83f4bc5..0e750c6 100644
--- a/src/f32-spmm/wasmsimd.c.in
+++ b/src/f32-spmm/wasmsimd.c.in
@@ -84,9 +84,9 @@
       }
       $if X86:
         $for M in range(0, MR, 4):
-          v128_t vout${ABC[M:M+4]} = wasm_v128_bitselect(vacc${ABC[M:M+4]}, vmax, wasm_f32x4_le(vacc${ABC[M:M+4]}, vmax));
+          v128_t vout${ABC[M:M+4]} = wasm_f32x4_pmin(vmax, vacc${ABC[M:M+4]});
         $for M in range(0, MR, 4):
-          vout${ABC[M:M+4]} = wasm_v128_bitselect(vmin, vout${ABC[M:M+4]}, wasm_f32x4_lt(vout${ABC[M:M+4]}, vmin));
+          vout${ABC[M:M+4]} = wasm_f32x4_pmax(vmin, vout${ABC[M:M+4]});
       $else:
         $for M in range(0, MR, 4):
           v128_t vout${ABC[M:M+4]} = wasm_f32x4_min(vacc${ABC[M:M+4]}, vmax);
@@ -145,17 +145,17 @@
           }
           $if SUBMR == 1:
             $if X86:
-              v128_t vout${ABC[0]} = wasm_v128_bitselect(vacc${ABC[0]}, vmax, wasm_f32x4_le(vacc${ABC[0]}, vmax));
-              vout${ABC[0]} = wasm_v128_bitselect(vmin, vout${ABC[0]}, wasm_f32x4_lt(vout${ABC[0]}, vmin));
+              v128_t vout${ABC[0]} = wasm_f32x4_pmin(vmax, vacc${ABC[0]});
+              vout${ABC[0]} = wasm_f32x4_pmax(vmin, vout${ABC[0]});
             $else:
               v128_t vout${ABC[0]} = wasm_f32x4_min(vacc${ABC[0]}, vmax);
               vout${ABC[0]} = wasm_f32x4_max(vout${ABC[0]}, vmin);
           $else:
             $if X86:
               $for M in range(0, SUBMR, 4):
-                v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_v128_bitselect(vacc${ABC[M:min(M+4,SUBMR)]}, vmax, wasm_f32x4_le(vacc${ABC[M:min(M+4,SUBMR)]}, vmax));
+                v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_pmin(vmax, vacc${ABC[M:min(M+4,SUBMR)]});
               $for M in range(0, SUBMR, 4):
-                vout${ABC[M:min(M+4,SUBMR)]} = wasm_v128_bitselect(vmin, vout${ABC[M:min(M+4,SUBMR)]}, wasm_f32x4_lt(vout${ABC[M:min(M+4,SUBMR)]}, vmin));
+                vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_pmax(vmin, vout${ABC[M:min(M+4,SUBMR)]});
             $else:
               $for M in range(0, SUBMR, 4):
                 v128_t vout${ABC[M:min(M+4,SUBMR)]} = wasm_f32x4_min(vacc${ABC[M:min(M+4,SUBMR)]}, vmax);
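Note on the replacement (an explanatory sketch, not part of the original commit): f32x4.pmin and f32x4.pmax are the WebAssembly "pseudo-minimum/maximum" operations, defined as pmin(a, b) = (b < a) ? b : a and pmax(a, b) = (a < b) ? b : a. When the comparison is false, including when the accumulator lane is NaN, they return the first operand, so calling them with the clamp bound first and the accumulator second reproduces the lane results of the old compare-plus-bitselect sequence while lowering to a single MINPS/MAXPS instruction on x86 instead of a compare plus a select. A minimal sketch of the clamp pattern, assuming a toolchain whose wasm_simd128.h exposes wasm_f32x4_pmin/wasm_f32x4_pmax (the helper name clamp_f32x4 below is illustrative, not an XNNPACK function):

    #include <wasm_simd128.h>

    // Clamp vacc into [vmin, vmax] the way the generated kernels above do.
    static inline v128_t clamp_f32x4(v128_t vacc, v128_t vmin, v128_t vmax) {
      // Was: wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
      // a NaN accumulator lane still comes out as vmax.
      v128_t vout = wasm_f32x4_pmin(vmax, vacc);
      // Was: wasm_v128_bitselect(vmin, vout, wasm_f32x4_lt(vout, vmin));
      return wasm_f32x4_pmax(vmin, vout);
    }

The non-x86 branches of the templates are unchanged: they keep wasm_f32x4_min/wasm_f32x4_max, whose IEEE NaN-propagating semantics map well onto ARM NEON but need extra instructions on x86.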