AVX2 MUL16 QS8/QC8 DWCONV microkernels using VPUNPCK instructions to extend the product

PiperOrigin-RevId: 387428311
diff --git a/BUILD.bazel b/BUILD.bazel
index 006b297..3469c01 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4854,15 +4854,19 @@
     "src/math/sigmoid-avx2-rr2-p5-nr2fma.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx2-mul32.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx2-mul32.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-avx2-mul32.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-avx2-mul32.c",
-    "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16.c",
+    "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c",
-    "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16.c",
+    "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
     "src/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qc8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c",
@@ -4877,25 +4881,29 @@
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-avx2-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16-vpmovsx.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16-vpmovsx.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c",
     "src/qs8-dwconv/gen/up24x9-minmax-fp32-avx2-mul32.c",
     "src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c",
     "src/qs8-dwconv/gen/up24x25-minmax-fp32-avx2-mul32.c",
     "src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16-vpmovsx.c",
     "src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c",
     "src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c",
-    "src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c",
+    "src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16-vpmovsx.c",
     "src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c",
     "src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
     "src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c",