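Add 4x16 QS8 NEON MLAL-lane assembly microkernels for Cortex-A53 (AArch64)
- GEMM and IGEMM variants
- each in prefetch (prfm) and non-prefetch versions
- also re-sorts several existing BUILD.bazel source lists alphabetically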
PiperOrigin-RevId: 374297582
diff --git a/BUILD.bazel b/BUILD.bazel
index 48ba91b..3279af0 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -504,20 +504,20 @@
"src/qs8-gavgpool/gen/7x-minmax-scalar-c2.c",
"src/qs8-gavgpool/gen/7x-minmax-scalar-c4.c",
"src/qs8-gemm/gen/1x2-minmax-scalar.c",
- "src/qs8-gemm/gen/2x2-minmax-scalar.c",
- "src/qs8-gemm/gen/3x2-minmax-scalar.c",
- "src/qs8-gemm/gen/4x2-minmax-scalar.c",
"src/qs8-gemm/gen/1x4-minmax-scalar.c",
+ "src/qs8-gemm/gen/2x2-minmax-scalar.c",
"src/qs8-gemm/gen/2x4-minmax-scalar.c",
+ "src/qs8-gemm/gen/3x2-minmax-scalar.c",
"src/qs8-gemm/gen/3x4-minmax-scalar.c",
+ "src/qs8-gemm/gen/4x2-minmax-scalar.c",
"src/qs8-gemm/gen/4x4-minmax-scalar.c",
"src/qs8-igemm/gen/1x2-minmax-scalar.c",
- "src/qs8-igemm/gen/2x2-minmax-scalar.c",
- "src/qs8-igemm/gen/3x2-minmax-scalar.c",
- "src/qs8-igemm/gen/4x2-minmax-scalar.c",
"src/qs8-igemm/gen/1x4-minmax-scalar.c",
+ "src/qs8-igemm/gen/2x2-minmax-scalar.c",
"src/qs8-igemm/gen/2x4-minmax-scalar.c",
+ "src/qs8-igemm/gen/3x2-minmax-scalar.c",
"src/qs8-igemm/gen/3x4-minmax-scalar.c",
+ "src/qs8-igemm/gen/4x2-minmax-scalar.c",
"src/qs8-igemm/gen/4x4-minmax-scalar.c",
"src/qs8-requantization/fp32-scalar-lrintf.c",
"src/qs8-requantization/fp32-scalar-magic.c",
@@ -748,13 +748,13 @@
"src/f32-vlrelu/gen/vlrelu-wasm-x1.c",
"src/f32-vlrelu/gen/vlrelu-wasm-x2.c",
"src/f32-vlrelu/gen/vlrelu-wasm-x4.c",
+ "src/f32-vmulcaddc/gen/c1-minmax-wasm-2x.c",
+ "src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c",
+ "src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c",
"src/f32-vrelu/gen/vrelu-wasm-x1.c",
"src/f32-vrelu/gen/vrelu-wasm-x2.c",
"src/f32-vrelu/gen/vrelu-wasm-x4.c",
"src/f32-vrelu/gen/vrelu-wasm-x8.c",
- "src/f32-vmulcaddc/gen/c1-minmax-wasm-2x.c",
- "src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c",
- "src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c",
]
WASMSIMD_UKERNELS = [
@@ -2326,10 +2326,10 @@
"src/f16-vclamp/gen/vclamp-neonfp16arith-x16.c",
"src/f16-vhswish/gen/vhswish-neonfp16arith-x8.c",
"src/f16-vhswish/gen/vhswish-neonfp16arith-x16.c",
- "src/f16-vrelu/gen/vrelu-neonfp16arith-x8.c",
- "src/f16-vrelu/gen/vrelu-neonfp16arith-x16.c",
"src/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c",
"src/f16-vmulcaddc/gen/c16-minmax-neonfp16arith-2x.c",
+ "src/f16-vrelu/gen/vrelu-neonfp16arith-x8.c",
+ "src/f16-vrelu/gen/vrelu-neonfp16arith-x16.c",
]
NEONDOT_UKERNELS = [
@@ -3768,25 +3768,29 @@
"src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S",
"src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S",
"src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
- "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
- "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
"src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
"src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
+ "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
+ "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
"src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
"src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
"src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
"src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
+ "src/qs8-gemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S",
+ "src/qs8-gemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
"src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S",
"src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S",
"src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S",
- "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
- "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
"src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
"src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
+ "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
+ "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
"src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
"src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
"src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
"src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
+ "src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S",
+ "src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
]
INTERNAL_MICROKERNEL_HDRS = [