QU8 C4 NEON Dot Product GEMM/IGEMM microkernels for Cortex A55r1
- adapted from QS8 4x16 microkernel to QU8 4x8
- 2 dot products per vector: A * W and A * zero_point.
- unsigned dot products with 2 sets of accumulators.
- subtract the zero-point accumulators from the A * W accumulators outside the loop.
- 4x8 GEMM and IGEMM.
- RNDNU, FP32 and GEMMLOWP are supported, but only RNDNU is used.
PiperOrigin-RevId: 393028276
diff --git a/BUILD.bazel b/BUILD.bazel
index 715c655..0ba70ba 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5647,12 +5647,14 @@
"src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
"src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S",
"src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
+ "src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
"src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld64.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
"src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
+ "src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
"src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld64.S",
"src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",