XW (eXtended Weights) optimization for QS8 GEMM microkernels

PiperOrigin-RevId: 324734460
diff --git a/BUILD.bazel b/BUILD.bazel
index d250c83..dd1ad11 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1645,12 +1645,17 @@
     "src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-sse2.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-sse2.c",
     "src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-sse2-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-sse2-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-sse2.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-sse2.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-sse2.c",
     "src/qs8-igemm/gen/1x4c2-minmax-sse2-ld64.c",
     "src/qs8-igemm/gen/4x4c2-minmax-sse2-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-sse2-ld128.c",
@@ -1699,12 +1704,17 @@
     "src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-ssse3.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-ssse3.c",
     "src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-ssse3-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-ssse3.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-ssse3.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-ssse3.c",
     "src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld64.c",
     "src/qs8-igemm/gen/4x4c2-minmax-ssse3-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-ssse3-ld128.c",
@@ -1744,12 +1754,17 @@
     "src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-sse41.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-sse41.c",
     "src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-sse41-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-sse41-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-sse41.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-sse41.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-sse41.c",
     "src/qs8-igemm/gen/1x4c2-minmax-sse41-ld64.c",
     "src/qs8-igemm/gen/4x4c2-minmax-sse41-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-sse41-ld128.c",
@@ -1878,12 +1893,17 @@
     "src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c",
     "src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-xop.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-xop.c",
     "src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-xop-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-xop-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-xw-minmax-xop.c",
+    "src/qs8-gemm/gen/2x4c8-xw-minmax-xop.c",
+    "src/qs8-gemm/gen/3x4c8-xw-minmax-xop.c",
     "src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c",
     "src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c",
@@ -2060,6 +2080,9 @@
     "src/qs8-gemm/gen/1x8c8-minmax-avx2.c",
     "src/qs8-gemm/gen/2x8c8-minmax-avx2.c",
     "src/qs8-gemm/gen/3x8c8-minmax-avx2.c",
+    "src/qs8-gemm/gen/1x8c8-xw-minmax-avx2.c",
+    "src/qs8-gemm/gen/2x8c8-xw-minmax-avx2.c",
+    "src/qs8-gemm/gen/3x8c8-xw-minmax-avx2.c",
     "src/qs8-igemm/gen/1x8c8-minmax-avx2.c",
     "src/qs8-igemm/gen/2x8c8-minmax-avx2.c",
     "src/qs8-igemm/gen/3x8c8-minmax-avx2.c",