QC8 DWCONV microkernels for SSE/AVX/XOP/AVX512

PiperOrigin-RevId: 379431964
diff --git a/BUILD.bazel b/BUILD.bazel
index 46e0b84..30c2f80 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2624,6 +2624,12 @@
     "src/math/sigmoid-sse2-rr2-p5-div.c",
     "src/math/sigmoid-sse2-rr2-p5-nr1.c",
     "src/math/sigmoid-sse2-rr2-p5-nr2.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-sse2-mul16.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
@@ -2887,6 +2893,18 @@
     "src/math/roundne-sse41.c",
     "src/math/roundu-sse41.c",
     "src/math/roundz-sse41.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-sse41-mul32.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
@@ -3179,6 +3197,18 @@
     "src/math/sigmoid-avx-rr2-p5-div.c",
     "src/math/sigmoid-avx-rr2-p5-nr1.c",
     "src/math/sigmoid-avx-rr2-p5-nr2.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-avx-mul32.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
@@ -3313,6 +3343,12 @@
 ]
 
 XOP_UKERNELS = [
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
     "src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
@@ -3983,6 +4019,10 @@
 ]
 
 AVX512SKX_UKERNELS = [
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx512skx-mul32.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx512skx-mul32.c",
+    "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
+    "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
     "src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
     "src/qc8-gemm/gen/2x16c8-minmax-fp32-avx512skx.c",
     "src/qc8-gemm/gen/3x16c8-minmax-fp32-avx512skx.c",