AVX2 MUL16 QS8/QC8 DWCONV microkernels using VPUNPCK instructions to extend the product

PiperOrigin-RevId: 387428311
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3fee85..61f3659 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4015,15 +4015,19 @@
   src/math/sigmoid-avx2-rr2-p5-nr2fma.c
   src/qc8-dwconv/gen/up8x9-minmax-fp32-avx2-mul32.c
   src/qc8-dwconv/gen/up8x25-minmax-fp32-avx2-mul32.c
-  src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16.c
+  src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpunpck.c
   src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
-  src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16.c
+  src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpunpck.c
   src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
   src/qc8-dwconv/gen/up24x9-minmax-fp32-avx2-mul32.c
   src/qc8-dwconv/gen/up24x25-minmax-fp32-avx2-mul32.c
-  src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16.c
+  src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpunpck.c
   src/qc8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c
-  src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16.c
+  src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c
   src/qc8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
   src/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qc8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
@@ -4038,25 +4042,29 @@
   src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx2-mul32.c
   src/qs8-dwconv/gen/up8x25-minmax-fp32-avx2-mul32.c
   src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx2-mul32.c
-  src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16.c
+  src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul16-vpunpck.c
   src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c
-  src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16.c
+  src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul16-vpmovsx.c
   src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx2-mul32.c
-  src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16.c
+  src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul16-vpunpck.c
   src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c
-  src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16.c
+  src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul16-vpmovsx.c
   src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx2-mul32.c
   src/qs8-dwconv/gen/up24x9-minmax-fp32-avx2-mul32.c
   src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-avx2-mul32.c
   src/qs8-dwconv/gen/up24x25-minmax-fp32-avx2-mul32.c
   src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx2-mul32.c
-  src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16.c
+  src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul16-vpunpck.c
   src/qs8-dwconv/gen/up32x9-minmax-fp32-avx2-mul32.c
-  src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16.c
+  src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul16-vpmovsx.c
   src/qs8-dwconv/gen/up32x9-minmax-gemmlowp-avx2-mul32.c
-  src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16.c
+  src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpmovsx.c
+  src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul16-vpunpck.c
   src/qs8-dwconv/gen/up32x25-minmax-fp32-avx2-mul32.c
-  src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16.c
+  src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul16-vpmovsx.c
   src/qs8-dwconv/gen/up32x25-minmax-gemmlowp-avx2-mul32.c
   src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c