Leverage Load-Zero WAsm SIMD instructions in Chrome M88 microkernels

PiperOrigin-RevId: 394786669
diff --git a/BUILD.bazel b/BUILD.bazel
index c01fc68..5beda2f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7237,6 +7237,19 @@
     }) + select({
         ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"],
         "//conditions:default": [],
+    }) + select({
+        ":xnn_wasmsimd_version_m87": [
+            "-DXNN_WASMSIMD_VERSION=87",
+        ],
+        ":xnn_wasmsimd_version_m88": [
+            "-DXNN_WASMSIMD_VERSION=88",
+        ],
+        ":xnn_wasmsimd_version_m91": [
+            "-DXNN_WASMSIMD_VERSION=91",
+        ],
+        "//conditions:default": [
+            "-DXNN_WASMSIMD_VERSION=87",
+        ],
     }),
     gcc_copts = xnnpack_gcc_std_copts(),
     includes = ["include"],
@@ -7279,6 +7292,19 @@
     }) + select({
         ":xnn_enable_hmp_explicit_false": ["-DXNN_MAX_UARCH_TYPES=1"],
         "//conditions:default": [],
+    }) + select({
+        ":xnn_wasmsimd_version_m87": [
+            "-DXNN_WASMSIMD_VERSION=87",
+        ],
+        ":xnn_wasmsimd_version_m88": [
+            "-DXNN_WASMSIMD_VERSION=88",
+        ],
+        ":xnn_wasmsimd_version_m91": [
+            "-DXNN_WASMSIMD_VERSION=91",
+        ],
+        "//conditions:default": [
+            "-DXNN_WASMSIMD_VERSION=87",
+        ],
     }),
     gcc_copts = xnnpack_gcc_std_copts(),
     includes = ["include"],
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 3fca643..0bf43dd 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 13613fa..e717299 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index fb89308..e2560b5 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 8de0218..ddd5271 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index a1c9ddb..31e17da 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 7bbf946..83f9eb0 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 856ed50..5a29ce5 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 1fc1a47..6277d7d 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index b988401..a74a636 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -44,10 +44,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index c01a0e0..2f01d06 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -44,10 +44,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index d0feefe..6cedd07 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -48,10 +48,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 0b56798..4406da6 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -48,10 +48,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 51f60ea..bd8100c 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -52,10 +52,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index a9c4392..6b495ea 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -52,10 +52,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 0a5452e..d97d795 100644
--- a/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -56,10 +56,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 4d194ce..10408ba 100644
--- a/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -56,10 +56,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/MRx4c8-wasmsimd-dot16x2.c.in b/src/qs8-gemm/MRx4c8-wasmsimd-dot16x2.c.in
index 9862120..b9c5e10 100644
--- a/src/qs8-gemm/MRx4c8-wasmsimd-dot16x2.c.in
+++ b/src/qs8-gemm/MRx4c8-wasmsimd-dot16x2.c.in
@@ -68,8 +68,9 @@
   $if DATATYPE == "QU8":
     const v128_t vb_zero_point = wasm_v128_load(params->${PARAMS_STRUCT}.kernel_zero_point);
   do {
-    $for N in range(4):
-      v128_t vacc0x${N} = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[${N}]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    $for N in range(1, 4):
+      v128_t vacc0x${N} = wasm_v128_load32_zero((const int32_t*) w + ${N});
     $for M in range(1, MR):
       $for N in range(4):
         v128_t vacc${M}x${N} = vacc0x${N};
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 7530549..395fa7a 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 7d20d2c..6241399 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
index 5d8d595..48e2b75 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index d227c94..b06810c 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index d3e7c2d..d3dd02a 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
index 66a441b..f90e133 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 4c03608..f3fddd0 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 5cd9494..990c040 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
index cf48326..71767b5 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 2a027a0..e7522dd 100644
--- a/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index cb11e02..be3915e 100644
--- a/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c b/src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
index 4af900f..20b4d4a 100644
--- a/src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
+++ b/src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/MRx4c8-wasmsimd-dot16x2.c.in b/src/qs8-igemm/MRx4c8-wasmsimd-dot16x2.c.in
index 52dc461..889c9bd 100644
--- a/src/qs8-igemm/MRx4c8-wasmsimd-dot16x2.c.in
+++ b/src/qs8-igemm/MRx4c8-wasmsimd-dot16x2.c.in
@@ -68,8 +68,9 @@
   $if DATATYPE == "QU8":
     const v128_t vb_zero_point = wasm_v128_load(params->${PARAMS_STRUCT}.kernel_zero_point);
   do {
-    $for N in range(4):
-      v128_t vacc0x${N} = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[${N}]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    $for N in range(1, 4):
+      v128_t vacc0x${N} = wasm_v128_load32_zero((const int32_t*) w + ${N});
     $for M in range(1, MR):
       $for N in range(4):
         v128_t vacc${M}x${N} = vacc0x${N};
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 6f47b0a..5757b5a 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -44,10 +44,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index c51145f..5b6a843 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -44,10 +44,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 81337b6..8ea62bc 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -48,10 +48,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 62bd488..7d78733 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -48,10 +48,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index d27e9f7..f27f1f9 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -52,10 +52,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index a1e1899..7e479cc 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -52,10 +52,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 77a4afc..b7280a7 100644
--- a/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -56,10 +56,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 0ff4ca4..99b0786 100644
--- a/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -56,10 +56,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 8c5c26b..a437ccc 100644
--- a/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -42,10 +42,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 1c64e02..78a2088 100644
--- a/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -42,10 +42,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 87d1ca8..a6096f2 100644
--- a/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -48,10 +48,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index d083395..5c72fdc 100644
--- a/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -48,10 +48,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 21f8226..2460347 100644
--- a/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -54,10 +54,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 336ebc8..06d9581 100644
--- a/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -54,10 +54,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index ed14160..01f9319 100644
--- a/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -60,10 +60,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 2c4c1d1..4d1ca5f 100644
--- a/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -60,10 +60,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 0953ae6..eae8c40 100644
--- a/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -45,10 +45,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index aa617df..f6e8066 100644
--- a/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -45,10 +45,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t p = ks;
diff --git a/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 58b7923..382f514 100644
--- a/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -49,10 +49,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 2e21c91..5272a54 100644
--- a/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -49,10 +49,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 6455c02..ac1e141 100644
--- a/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -53,10 +53,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index aa0efe0..3a1f902 100644
--- a/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -53,10 +53,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 70f67f1..2c1902f 100644
--- a/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -57,10 +57,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 5d75bd7..9c3d7b3 100644
--- a/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -57,10 +57,10 @@
 
   const v128_t vb_zero_point = wasm_v128_load(params->fp32_wasmsimd.kernel_zero_point);
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;