Accumulate once in 16 bits before extending to 32 bits in SSE2/SSE4/AVX/XOP MUL16 QS8/QC8 DWCONV microkernels

PiperOrigin-RevId: 386989841
diff --git a/test/qc8-dwconv-minmax-fp32.yaml b/test/qc8-dwconv-minmax-fp32.yaml
index ee69154..ea0fa2a 100644
--- a/test/qc8-dwconv-minmax-fp32.yaml
+++ b/test/qc8-dwconv-minmax-fp32.yaml
@@ -49,18 +49,34 @@
   init: xnn_init_qs8_minmax_sse2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x9__sse2_mul16
   init: xnn_init_qs8_minmax_sse2_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16
+  init: xnn_init_qs8_minmax_sse2_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16
+  init: xnn_init_qs8_minmax_sse2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x9__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x9__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16
   init: xnn_init_qs8_minmax_avx2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16
@@ -159,18 +175,34 @@
   init: xnn_init_qs8_minmax_sse2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x25__sse2_mul16
   init: xnn_init_qs8_minmax_sse2_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16
+  init: xnn_init_qs8_minmax_sse2_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__sse2_mul16_add16
+  init: xnn_init_qs8_minmax_sse2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x25__sse41_mul16
   init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__sse41_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up24x25__avx_mul16
   init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__avx_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__xop_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
+- name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16
+  init: xnn_init_qs8_minmax_sse4_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul16
   init: xnn_init_qs8_minmax_avx2_params
 - name: xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx2_mul16