Accumulate once in 16 bits before extending to 32 bits in SSE2/SSE4.1/AVX/XOP MUL16 QS8/QC8 DWCONV microkernels

PiperOrigin-RevId: 386989841
diff --git a/BUILD.bazel b/BUILD.bazel
index c784f40..b60fe6f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3017,9 +3017,13 @@
     "src/math/sigmoid-sse2-rr2-p5-nr1.c",
     "src/math/sigmoid-sse2-rr2-p5-nr2.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16-add16.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse2-mul16.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-sse2-mul16.c",
     "src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
@@ -3051,12 +3055,16 @@
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c",
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse2-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse2-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse2-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse2-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse2-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse2-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-fp32-sse2-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-sse2-mul16.c",
@@ -3277,12 +3285,16 @@
     "src/math/roundu-sse41.c",
     "src/math/roundz-sse41.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul16.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-sse41-mul32.c",
@@ -3317,18 +3329,22 @@
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c",
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul32.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-sse41-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-sse41-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-sse41-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-sse41-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-sse41-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-sse41-mul32.c",
@@ -3605,12 +3621,16 @@
     "src/math/sigmoid-avx-rr2-p5-nr1.c",
     "src/math/sigmoid-avx-rr2-p5-nr2.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul16.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c",
@@ -3645,18 +3665,22 @@
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c",
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul16.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-avx-mul32.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-avx-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul16.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-avx-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-avx-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-avx-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-avx-mul32.c",
@@ -3766,9 +3790,13 @@
 ]
 
 XOP_UKERNELS = [
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul16-add16.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-xop-mul32.c",
@@ -3800,12 +3828,16 @@
     "src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c",
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c",
     "src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-xop-mul32.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-xop-mul32.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-xop-mul32.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-xop-mul32.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-xop-mul32.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-xop-mul32.c",
     "src/qs8-dwconv/gen/up24x9-minmax-fp32-xop-mul32.c",