Leverage Load-Zero WAsm SIMD instructions (wasm_v128_load32_zero) in Chrome M88 QC8 GEMM microkernels

PiperOrigin-RevId: 394786669
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 3fca643..0bf43dd 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 13613fa..e717299 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -41,10 +41,10 @@
   int8_t* c0 = c;
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     w = (const void*) ((const int32_t*) w + 4);
 
     size_t k = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index fb89308..e2560b5 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 8de0218..ddd5271 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -47,10 +47,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index a1c9ddb..31e17da 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 7bbf946..83f9eb0 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -53,10 +53,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
index 856ed50..5a29ce5 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;
diff --git a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
index 1fc1a47..6277d7d 100644
--- a/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
+++ b/src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c
@@ -59,10 +59,10 @@
   }
 
   do {
-    v128_t vacc0x0 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[0]);
-    v128_t vacc0x1 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[1]);
-    v128_t vacc0x2 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[2]);
-    v128_t vacc0x3 = wasm_f32x4_replace_lane(wasm_f32x4_const_splat(0.0f), 0, ((const float*) w)[3]);
+    v128_t vacc0x0 = wasm_v128_load32_zero(w);
+    v128_t vacc0x1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vacc0x2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vacc0x3 = wasm_v128_load32_zero((const int32_t*) w + 3);
     v128_t vacc1x0 = vacc0x0;
     v128_t vacc1x1 = vacc0x1;
     v128_t vacc1x2 = vacc0x2;