QC8/QS8/QU8 GEMM/IGEMM WAsm SIMD microkernels using i32x4.dot_i16x8_s instruction

PiperOrigin-RevId: 394098332
diff --git a/BUILD.bazel b/BUILD.bazel
index 53062b8..19cae49 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1759,18 +1759,50 @@
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16-add16.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qc8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qc8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-wasmsimd-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-wasmsimd-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-wasmsimd-mul16-add16.c",
@@ -1789,21 +1821,61 @@
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c8-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c16-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c24-acc2.c",
+    "src/qs8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
+    "src/qs8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "src/qs8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
+    "src/qs8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qs8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-requantization/fp32-wasmsimd.c",
     "src/qs8-requantization/gemmlowp-wasmsimd.c",
     "src/qs8-vadd/gen/minmax-wasmsimd-x8.c",
@@ -1824,18 +1896,50 @@
     "src/qu8-dwconv/gen/up16x25-minmax-fp32-wasmsimd-mul16.c",
     "src/qu8-dwconv/gen/up24x9-minmax-fp32-wasmsimd-mul16.c",
     "src/qu8-dwconv/gen/up24x25-minmax-fp32-wasmsimd-mul16.c",
+    "src/qu8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-igemm/gen/1x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-igemm/gen/2x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul32-ld64.c",
     "src/qu8-igemm/gen/3x4c8-minmax-fp32-wasmsimd-mul32-ld128.c",
+    "src/qu8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "src/qu8-igemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qu8-requantization/fp32-wasmsimd.c",
     "src/qu8-requantization/gemmlowp-wasmsimd.c",
     "src/qu8-vadd/gen/minmax-wasmsimd-x8.c",