QS8 4x8 lane GEMM AArch32 microkernel for Cortex A53
- q4 and q5 are used for weights; d12/d13 are additionally pushed.
- kc (r2) and lr are pushed onto the stack.
- Weights are loaded and expanded in the previous block.
PiperOrigin-RevId: 425813466
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f29b72e..f73b573 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2503,19 +2503,13 @@
src/u8-maxpool/9p8x-minmax-neon-c16.c
src/u8-rmax/neon.c
src/u8-vclamp/neon-x64.c
+ src/x8-transpose/gen/16x16-reuse-dec-zip-neon.c
+ src/x8-transpose/gen/16x16-reuse-mov-zip-neon.c
+ src/x8-transpose/gen/16x16-reuse-switch-zip-neon.c
src/x8-zip/x2-neon.c
src/x8-zip/x3-neon.c
src/x8-zip/x4-neon.c
src/x8-zip/xm-neon.c
- src/x32-packx/x4-neon-st4.c
- src/x32-unpool/neon.c
- src/x32-zip/x2-neon.c
- src/x32-zip/x3-neon.c
- src/x32-zip/x4-neon.c
- src/x32-zip/xm-neon.c
- src/x8-transpose/gen/16x16-reuse-dec-zip-neon.c
- src/x8-transpose/gen/16x16-reuse-mov-zip-neon.c
- src/x8-transpose/gen/16x16-reuse-switch-zip-neon.c
src/x16-transpose/gen/8x8-multi-dec-zip-neon.c
src/x16-transpose/gen/8x8-multi-mov-zip-neon.c
src/x16-transpose/gen/8x8-multi-switch-zip-neon.c
@@ -2523,6 +2517,7 @@
src/x16-transpose/gen/8x8-reuse-mov-zip-neon.c
src/x16-transpose/gen/8x8-reuse-multi-zip-neon.c
src/x16-transpose/gen/8x8-reuse-switch-zip-neon.c
+ src/x32-packx/x4-neon-st4.c
src/x32-transpose/gen/4x4-multi-dec-zip-neon.c
src/x32-transpose/gen/4x4-multi-mov-zip-neon.c
src/x32-transpose/gen/4x4-multi-multi-zip-neon.c
@@ -2531,6 +2526,11 @@
src/x32-transpose/gen/4x4-reuse-mov-zip-neon.c
src/x32-transpose/gen/4x4-reuse-multi-zip-neon.c
src/x32-transpose/gen/4x4-reuse-switch-zip-neon.c
+ src/x32-unpool/neon.c
+ src/x32-zip/x2-neon.c
+ src/x32-zip/x3-neon.c
+ src/x32-zip/x4-neon.c
+ src/x32-zip/xm-neon.c
src/xx-fill/neon-x64.c
src/xx-pad/neon.c)
@@ -5805,7 +5805,9 @@
src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S
src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S
src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S
+ src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S
src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S
+ src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S
src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S
src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S
src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S
@@ -5815,7 +5817,9 @@
src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S
src/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S
src/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S
+ src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
+ src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
src/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S
src/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
@@ -5823,7 +5827,9 @@
src/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S
src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
+ src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
+ src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S)