XOP versions of QS8 GEMM/IGEMM microkernels

PiperOrigin-RevId: 324541139
diff --git a/BUILD.bazel b/BUILD.bazel
index 2355ff6..59e96de 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1860,6 +1860,25 @@
     "src/f32-vunary/gen/vsqr-avx-x16.c",
 ]
 
+XOP_UKERNELS = [  # QS8 GEMM/IGEMM XOP microkernel sources; passed as x86_srcs to the xop_ukernels targets
+    "src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c",  # qs8-gemm variants
+    "src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c",
+    "src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c",
+    "src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c",
+    "src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c",
+    "src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c",
+    "src/qs8-igemm/gen/1x4c2-minmax-xop-ld64.c",  # qs8-igemm variants: same eight file names as the qs8-gemm entries
+    "src/qs8-igemm/gen/4x4c2-minmax-xop-ld64.c",
+    "src/qs8-igemm/gen/1x4c2-minmax-xop-ld128.c",
+    "src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c",
+    "src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c",
+    "src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c",
+]
+
 FMA3_UKERNELS = [
     "src/f32-dwconv/gen/up16x4-minmax-fma3-acc2.c",
     "src/f32-dwconv/gen/up16x4-minmax-fma3.c",
@@ -2888,6 +2907,42 @@
 )
 
 xnnpack_cc_library(
+    name = "xop_ukernels",  # library target for the sources listed in XOP_UKERNELS
+    hdrs = INTERNAL_HDRS,
+    gcc_copts = xnnpack_gcc_std_copts(),
+    gcc_x86_copts = ["-mxop"],  # -mxop turns on XOP instruction support; presumably applied only to GCC-compatible x86 builds
+    msvc_copts = xnnpack_msvc_std_copts(),
+    msvc_x86_32_copts = ["/arch:AVX"],  # NOTE(review): MSVC is given AVX rather than an XOP option; presumably no XOP-specific /arch value exists -- confirm
+    msvc_x86_64_copts = ["/arch:AVX"],  # same flag as the 32-bit MSVC setting
+    x86_srcs = XOP_UKERNELS,  # sources are supplied through x86_srcs only
+    deps = [
+        ":tables",
+        "@FP16",
+        "@pthreadpool",
+    ],
+)
+
+xnnpack_cc_library(
+    name = "xop_ukernels_test_mode",  # same sources, headers and deps as xop_ukernels, plus the copts below
+    hdrs = INTERNAL_HDRS,
+    copts = [
+        "-UNDEBUG",  # undefine NDEBUG so assert() stays active
+        "-DXNN_TEST_MODE=1",  # define XNN_TEST_MODE as 1 for these translation units
+    ],
+    gcc_copts = xnnpack_gcc_std_copts(),
+    gcc_x86_copts = ["-mxop"],  # -mxop turns on XOP instruction support; presumably applied only to GCC-compatible x86 builds
+    msvc_copts = xnnpack_msvc_std_copts(),
+    msvc_x86_32_copts = ["/arch:AVX"],  # NOTE(review): MSVC is given AVX rather than an XOP option; presumably no XOP-specific /arch value exists -- confirm
+    msvc_x86_64_copts = ["/arch:AVX"],  # same flag as the 32-bit MSVC setting
+    x86_srcs = XOP_UKERNELS,  # sources are supplied through x86_srcs only
+    deps = [
+        ":tables",
+        "@FP16",
+        "@pthreadpool",
+    ],
+)
+
+xnnpack_cc_library(
     name = "fma3_ukernels",
     hdrs = INTERNAL_HDRS,
     gcc_copts = xnnpack_gcc_std_copts(),
@@ -3067,6 +3122,7 @@
         ":ssse3_ukernels",
         ":sse41_ukernels",
         ":avx_ukernels",
+        ":xop_ukernels",
         ":fma3_ukernels",
         ":avx2_ukernels",
         ":avx512f_ukernels",
@@ -3106,6 +3162,7 @@
         ":ssse3_ukernels_test_mode",
         ":sse41_ukernels_test_mode",
         ":avx_ukernels_test_mode",
+        ":xop_ukernels_test_mode",
         ":fma3_ukernels_test_mode",
         ":avx2_ukernels_test_mode",
         ":avx512f_ukernels_test_mode",
@@ -3147,6 +3204,7 @@
         ":ssse3_ukernels",
         ":sse41_ukernels",
         ":avx_ukernels",
+        ":xop_ukernels",
         ":fma3_ukernels",
         ":avx2_ukernels",
         ":avx512f_ukernels",