QS8 C4 Neon GEMM/IGEMM microkernels

- Based on the C2 microkernel, which uses DUP to isolate/duplicate input channels.
- Instead of 2 channels, DUP 4 channels and use PADAL to accumulate.
- Outside the loop, do a partial sum of 2 channels.

PiperOrigin-RevId: 407440032
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 2fca7db..5dd77a9 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,6 +3,30 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neon_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neon_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c4__neonv8_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neon_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_padal
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
@@ -57,18 +81,6 @@
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neon_mlal_padal_dup
-  init: xnn_init_qs8_conv_minmax_fp32_neon_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neon_mlal_padal_dup
-  init: xnn_init_qs8_conv_minmax_fp32_neon_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2__neonv8_mlal_padal_dup
-  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
-  k-block: 16
-- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2__neonv8_mlal_padal_dup
-  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
-  k-block: 16
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
   init: xnn_init_qs8_conv_minmax_fp32_neon_params
   k-block: 16