Fix conversion script for aarch64 assembly kernels and convert a single F32 GEMM as a test
PiperOrigin-RevId: 424212216
diff --git a/BUILD.bazel b/BUILD.bazel
index 7b10d47..f76f50e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7314,6 +7314,10 @@
"src/qs8-igemm/4x8c4-rndnu-aarch32-neondot-ld64.cc",
]
+JIT_AARCH64_SRCS = [
+ "src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc",
+]
+
INTERNAL_MICROKERNEL_HDRS = [
"src/xnnpack/allocator.h",
"src/xnnpack/argmaxpool.h",
@@ -9169,6 +9173,7 @@
"src/xnnpack/assembler.h",
],
aarch32_srcs = JIT_AARCH32_SRCS,
+ aarch64_srcs = JIT_AARCH64_SRCS,
msvc_copts = xnnpack_msvc_std_copts(),
deps = [
":logging_utils",
@@ -9189,6 +9194,7 @@
"src/xnnpack/assembler.h",
],
aarch32_srcs = JIT_AARCH32_SRCS,
+ aarch64_srcs = JIT_AARCH64_SRCS,
copts = [
"-UNDEBUG",
"-DXNN_TEST_MODE=1",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f85c746..44c534e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -283,6 +283,9 @@
src/qs8-igemm/4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc
src/qs8-igemm/4x8c4-rndnu-aarch32-neondot-ld64.cc)
+SET(JIT_AARCH64_SRCS
+ src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc)
+
SET(PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS
src/params-init.c
src/u8-lut32norm/scalar.c
@@ -6066,6 +6069,7 @@
LIST(APPEND PROD_MICROKERNEL_SRCS ${AARCH64_ASM_MICROKERNEL_SRCS})
LIST(APPEND ALL_MICROKERNEL_SRCS ${AARCH64_ASM_MICROKERNEL_SRCS})
ENDIF()
+ LIST(APPEND JIT_SRCS ${JIT_AARCH64_SRCS})
ENDIF()
IF(XNNPACK_TARGET_PROCESSOR MATCHES "^(i[3-6]86|x86_64|AMD64)$" OR IOS_ARCH MATCHES "^(i386|x86_64|AMD64)$")
LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_SSE_MICROKERNEL_SRCS})
diff --git a/scripts/convert-assembly-to-jit.py b/scripts/convert-assembly-to-jit.py
old mode 100644
new mode 100755
index d3b648d..51e19f5
--- a/scripts/convert-assembly-to-jit.py
+++ b/scripts/convert-assembly-to-jit.py
@@ -18,7 +18,7 @@
COMMENTS = SPACES + '((//\s+.+)|)$'
WB = r'!'
-REG_NO_GROUP = r'r\d+|s\d+|d\d+|q\d+|sp|lr|pc'
+REG_NO_GROUP = r'r\d+|s\d+|d\d+|q\d+|sp|lr|pc|x\d+|(?:v\d+\.(?:\d+)?(?:d|s|h|b))'
REG = r'(' + REG_NO_GROUP + ')'
IMM_NO_GROUP = r'\d+'
IMM = r'(' + IMM_NO_GROUP + ')'
@@ -26,7 +26,7 @@
REG_OR_IMM = r'(' + REG_LANE_NO_GROUP + '|' + REG_NO_GROUP + '|' + IMM_NO_GROUP + ')'
REGLIST_CONSEC = r'\{(\w+)-(\w+)\}' + SPACES
-REGLIST_INDIV = r'\{(\w+(?:,\s+\w+)*)\}' + SPACES
+REGLIST_INDIV = r'\{([\w.]+(?:,\s+[\w.]+)*)\}' + SPACES
REGLIST_INDIV_REPLICATE = r'\{(\w+(?:\[\])(,\s*\w+(?:\[\]))*)\}' + SPACES
REGLIST_INDEX = r'\{(' + REG_LANE_NO_GROUP + ')\}' + SPACES
@@ -35,14 +35,15 @@
MEMOP = r'\[' + SPACES + REG + '\]' + SPACES
MEMOP_MAYBE_WB = r'\[' + SPACES + REG + '\]' + f'({WB})?'
-MEMOP_OFFSET = r'\[' + REG + COMMA + '(\d+)\]' + SPACES
+MEMOP_OFFSET = r'\[' + REG + COMMA + '(-?\d+)\]' + SPACES
+MEMOP_OFFSET_MAYBE_WB = r'\[' + REG + COMMA + '(-?\d+)\]' + f'({WB})?' + SPACES
B_IMM = r'(\d+)(f|b)'
INSTR = SPACES + r'([A-Z0-9.]+)' + SPACES
# e.g. #ifndef __APPLE__
-IFDEF_RE = re.compile(r'\s*#(ifndef|endif|ifdef)')
+IFDEF_RE = re.compile(r'\s*#(ifndef|endif|ifdef).*')
# e.g. # Push 96 bytes
COMMENT_RE = re.compile(SPACES + r'((//|#)\s*.+)')
# e.g. 0:
@@ -57,18 +58,28 @@
INSTR_OP_RE = re.compile(INSTR + REG + COMMENTS)
# e.g. BLO 2f
INSTR_B_IMM = re.compile(INSTR + B_IMM + COMMENTS)
+# e.g. TBNZ x0, 4, 5f
+INSTR_B_REG_IMM_IMM = re.compile(INSTR + REG + COMMA + IMM + COMMA + B_IMM + COMMENTS)
# e.g. .p2align 3
P2ALIGN_RE = re.compile(SPACES + r'\.p2align\s+(\d+)')
# e.g. CMP r0, 2
INSTR_REG_IMM_RE = re.compile(INSTR + REG + COMMA + IMM + COMMENTS)
# e.g. LDR r0, [r12]
INSTR_REG_MEMOP_RE = re.compile(INSTR + REG + COMMA + MEMOP + COMMENTS)
+# e.g. LDR q0, [x4], 16
+INSTR_REG_MEMOP_IMM_RE = re.compile(INSTR + REG + COMMA + MEMOP + COMMA + IMM + COMMENTS)
# e.g. LDR r0, [sp, 112]
INSTR_REG_MEMOP_OFFSET_RE = re.compile(INSTR + REG + COMMA + MEMOP_OFFSET +
COMMENTS)
-# e.g. LDRD r6, r7, [sp, 104]
+# e.g. LDRD r6, r7, [sp]
+INSTR_REG_REG_MEMOP_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
+ MEMOP + COMMENTS)
+# e.g. LDRD r6, r7, [sp, 104] or STP d8, d9, [sp, -64]!
INSTR_REG_REG_MEMOP_OFFSET_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
- MEMOP_OFFSET + COMMENTS)
+ MEMOP_OFFSET_MAYBE_WB + COMMENTS)
+# e.g. LDP q20, q21, [x5], 32
+INSTR_REG_REG_MEMOP_IMM_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
+ MEMOP + COMMA + IMM + COMMENTS)
# e.g. PLD [r4, 64]
INSTR_MEMOP_OFFSET_RE = re.compile(INSTR + MEMOP_OFFSET + COMMENTS)
# e.g. movlo r12, r3, vdup.32 q0, d14[0]
@@ -97,6 +108,9 @@
# e.g. VLD1.32 {d0}, [r3]{!}
INSTR_REGLIST_INDIV_MEMOP = re.compile(INSTR + REGLIST_INDIV + COMMA +
MEMOP_MAYBE_WB + COMMENTS)
+# e.g. LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48
+INSTR_REGLIST_INDIV_MEMOP_IMM = re.compile(INSTR + REGLIST_INDIV + COMMA +
+ MEMOP + COMMA + IMM + COMMENTS)
# e.g. VST1.32 {d24-d25}, [r11]{!}
INSTR_REGLIST_CONSEC_MEMOP = re.compile(INSTR + REGLIST_CONSEC + COMMA +
MEMOP_MAYBE_WB + COMMENTS)
@@ -110,17 +124,26 @@
INSTR_REG_FPSCR = re.compile(INSTR + f'({APSR}|{REG_NO_GROUP})' + COMMA +
FPSCR + COMMENTS)
+# e.g. PRFM PLDL1KEEP, [x5]
+INSTR_PLD_MEMOP = re.compile(INSTR + f'(PLDL1KEEP)' + COMMA + MEMOP + COMMENTS)
+# e.g. PRFM PLDL1KEEP, [x5, 64]
+INSTR_PLD_MEMOP_OFFSET = re.compile(INSTR + f'(PLDL1KEEP)' + COMMA + MEMOP_OFFSET + COMMENTS)
+
+COND = r'([A-Z]+)'
+# e.g. CSEL x9, x3, x9, LO
+INSTR_REG_REG_REG_COND_RE = re.compile(INSTR + REG + COMMA + REG + COMMA + REG + COMMA + COND + COMMENTS)
+
def remove_brackets(s):
return s.replace('[', '').replace(']', '')
def fix_replicate_instruction(s):
- return re.sub(r'\.(\d+)', r'r.\1', s, 1)
+ return re.sub(r'_(\d+)', r'r_\1', s, 1)
def fix_instr_name(s):
- return s.replace('.', '_', 2).replace('and', 'and_', 1)
+ return s.lower().replace('.', '_', 2).replace('and', 'and_', 1)
def fix_comments(s):
@@ -140,10 +163,39 @@
return f'xnn_generate_{name}'
+def fix_regs(regs):
+ # Vector registers with datatype need to be method calls.
+ # e.g. v2.4s -> v2.v4s(), v2.s -> v2.s()
+ def repl(m):
+ if m.group(2):
+ return f'{m[1]}v{m[2]}{m[3]}()'
+ else:
+ return f'{m[1]}{m[3]}()'
+ return re.sub(r'(\w+\.)(\d+)?(\w+)', repl, regs)
+
+
IGNORE_LINES = [r'\s*\.\w+']
+AARCH32 = 'aarch32'
+AARCH64 = 'aarch64'
+GEMM = 'GEMM'
+IGEMM = 'IGEMM'
def main(input_file):
+ arch = None
+ kernel_type = GEMM
+
+ if 'aarch32' in input_file:
+ arch = AARCH32
+ elif 'aarch64' in input_file:
+ arch = AARCH64
+ else:
+ print('ERROR: unknown architecture')
+ sys.exit(1)
+
+ if 'igemm' in input_file:
+ kernel_type = IGEMM
+
# Whether we are in the copyright section.
in_copyright = False
# Whether we are in the microkernel function.
@@ -163,6 +215,7 @@
with open(input_file, 'r', encoding='utf-8') as f:
for line in f:
+ line = line.rstrip()
# Handle all lines before the microkernel instructions begin.
if not in_function:
@@ -172,8 +225,11 @@
elif 'BEGIN_FUNCTION' in line:
in_function = True
fn_name = line.split()[1]
- prologue.append(f'// Converted from: {input_file}')
- prologue.append('void Generator::generate() {')
+ prologue.append(f'// Converted from: {input_file[20:]}')
+ if kernel_type == GEMM:
+ prologue.append('void Generator::generate(size_t nc, size_t kc, void* params) {')
+ else:
+ prologue.append('void Generator::generate(size_t nc, size_t kc, size_t ks, void* params) {')
continue
elif 'Copyright ' in line:
in_autogen = False
@@ -183,164 +239,207 @@
1).rstrip())
continue
elif '#include <xnnpack/assembly.h>' in line:
- prologue.append('#include <xnnpack/aarch32-assembler.h>')
+ prologue.append(f'#include <xnnpack/{arch}-assembler.h>')
prologue.append('#include <xnnpack/allocator.h>')
- if 'igemm' in input_file:
- prologue.append('#include <xnnpack/igemm.h>')
- elif 'gemm' in input_file:
+ if kernel_type == GEMM:
prologue.append('#include <xnnpack/gemm.h>')
+ else:
+ prologue.append('#include <xnnpack/igemm.h>')
prologue.append('')
prologue.append('namespace xnnpack {')
- prologue.append('namespace aarch32 {')
+ prologue.append(f'namespace {arch} {{')
prologue.append('namespace {')
prologue.append('class Generator : public Assembler {')
prologue.append(' using Assembler::Assembler;')
prologue.append(' public:')
- prologue.append(' void generate();')
+ if kernel_type == GEMM:
+ prologue.append(' void generate(size_t nc, size_t kc, void* params);')
+ else:
+ prologue.append(' void generate(size_t nc, size_t kc, size_t ks, void* params);')
prologue.append('};')
continue
- elif any(re.match(p, line) for p in IGNORE_LINES):
+ elif any(re.fullmatch(p, line) for p in IGNORE_LINES):
continue
elif in_autogen:
continue
else:
- prologue.append(line.rstrip())
+ prologue.append(fix_comments(line.rstrip()))
continue
# We are now in the microkernel function body.
# Don't keep the ifdefs.
- m = re.match(IFDEF_RE, line)
+ m = re.fullmatch(IFDEF_RE, line)
if m:
continue
# But keep other comments.
- m = re.match(COMMENT_RE, line)
+ m = re.fullmatch(COMMENT_RE, line)
if m:
instructions.append(m[1])
continue
- m = re.match(LABEL, line)
+ m = re.fullmatch(LABEL, line)
if m:
labels.append(m[1])
instructions.append(f'bind(l{m[1]}){sc}')
continue
- m = re.match(INSTR_RE, line)
+ m = re.fullmatch(INSTR_RE, line)
if m:
- instructions.append(f'{m[1].lower()}(){sc} {m[2]}')
+ instructions.append(f'{fix_instr_name(m[1])}(){sc} {m[2]}')
continue
- m = re.match(INSTR_OP_RE, line)
+ m = re.fullmatch(INSTR_OP_RE, line)
if m:
- instructions.append(f'{m[1].lower()}({m[2]}){sc} {m[3]}')
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}){sc} {m[3]}')
continue
- m = re.match(INSTR_REGLIST_CONSEC_MEMOP_REG, line)
+ m = re.fullmatch(INSTR_REGLIST_CONSEC_MEMOP_REG, line)
if m:
instructions.append(
- f'{m[1].lower()}({{{m[2]}-{m[3]}}}, mem[{m[4]}], {m[5]}){sc} {m[6]}'
+ f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}, mem[{m[4]}], {m[5]}){sc} {m[6]}'
)
continue
- m = re.match(INSTR_REGLIST_INDIV_MEMOP_REG, line)
+ m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP_REG, line)
if m:
instructions.append(
- f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
+ f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
continue
- m = re.match(INSTR_REGLIST_CONSEC_RE, line)
+ m = re.fullmatch(INSTR_REGLIST_CONSEC_RE, line)
if m:
- instructions.append(f'{m[1].lower()}({{{m[2]}-{m[3]}}}){sc} {m[4]}')
+ instructions.append(f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}){sc} {m[4]}')
continue
- m = re.match(INSTR_REGLIST_LIST_RE, line)
+ m = re.fullmatch(INSTR_REGLIST_LIST_RE, line)
if m:
- instructions.append(f'{m[1].lower()}({{{m[2]}}}){sc} {m[3]}')
+ instructions.append(f'{fix_instr_name(m[1])}({{{m[2]}}}){sc} {m[3]}')
continue
- m = re.match(INSTR_MEMOP_OFFSET_RE, line)
+ m = re.fullmatch(INSTR_MEMOP_OFFSET_RE, line)
if m:
- instructions.append(f'{m[1].lower()}(mem[{m[2]}, {m[3]}]){sc} {m[4]}')
+ instructions.append(f'{fix_instr_name(m[1])}(mem[{m[2]}, {m[3]}]){sc} {m[4]}')
continue
- m = re.match(INSTR_REG_MEMOP_RE, line)
+ m = re.fullmatch(INSTR_REG_MEMOP_RE, line)
if m:
- instructions.append(f'{m[1].lower()}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
continue
- m = re.match(INSTR_REG_MEMOP_OFFSET_RE, line)
+ m = re.fullmatch(INSTR_REG_MEMOP_IMM_RE , line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
+ continue
+ m = re.fullmatch(INSTR_REG_MEMOP_OFFSET_RE, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
+ f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
continue
- m = re.match(INSTR_REG_REG_MEMOP_OFFSET_RE, line)
+ m = re.fullmatch(INSTR_REG_REG_MEMOP_RE, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]){sc} {m[6]}')
+ f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}]){sc} {m[5]}')
continue
- m = re.match(INSTR_REG_IMM_RE, line)
+ m = re.fullmatch(INSTR_REG_REG_MEMOP_OFFSET_RE, line)
if m:
- instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
+ if m[6]: # wb
+ instructions.append(
+ f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]++){sc} {m[7]}')
+ else: # no wb
+ instructions.append(
+ f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]){sc} {m[7]}')
continue
- m = re.match(INSTR_REG_REG_REG_RE, line)
+ m = re.fullmatch(INSTR_REG_REG_MEMOP_IMM_RE , line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {m[3]}, {m[4]}){sc} {m[5]}')
+ f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}], {m[5]}){sc} {m[6]}')
continue
- m = re.match(INSTR_REG_REG_REG_IMM_RE, line)
+ m = re.fullmatch(INSTR_REG_IMM_RE, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {m[3]}){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_REG_REG_REG_RE, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {m[3]}, {m[4]}, {m[5]}){sc} {m[6]}')
+ f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {fix_regs(m[3])}, {fix_regs(m[4])}){sc} {m[5]}')
continue
- m = re.match(INSTR_REG_REG_RE, line)
- if m:
- instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
- continue
- m = re.match(INSTR_REG_REGLIST_CONSECT, line)
+ m = re.fullmatch(INSTR_REG_REG_REG_IMM_RE, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {{{m[3]}-{m[4]}}}, false){sc} {m[5]}')
+ f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, {m[4]}, {m[5]}){sc} {m[6]}')
continue
- m = re.match(INSTR_REG_REGLIST_CONSECT_WB, line)
+ m = re.fullmatch(INSTR_REG_REG_RE, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {fix_regs(m[3])}){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_REG_REGLIST_CONSECT, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {{{m[3]}-{m[4]}}}, true){sc} {m[5]}')
+ f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}-{m[4]}}}, false){sc} {m[5]}')
continue
- m = re.match(INSTR_REG_REGLIST_INDIV_WB, line)
+ m = re.fullmatch(INSTR_REG_REGLIST_CONSECT_WB, line)
if m:
instructions.append(
- f'{m[1].lower()}({m[2]}, {{{m[3]}}}, true){sc} {m[4]}')
+ f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}-{m[4]}}}, true){sc} {m[5]}')
continue
- m = re.match(INSTR_B_IMM, line)
- if m:
- instructions.append(f'{m[1].lower()}(l{m[2]}){sc} {m[4]}')
- continue
- m = re.match(INSTR_REGLIST_INDIV_MEMOP, line)
+ m = re.fullmatch(INSTR_REG_REGLIST_INDIV_WB, line)
if m:
instructions.append(
- f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
+ f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}}}, true){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_B_IMM, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}(l{m[2]}){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_B_REG_IMM_IMM , line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, l{m[4]}){sc} {m[6]}')
+ continue
+ m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP, line)
+ if m:
+ instructions.append(
+ f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
)
continue
- m = re.match(INSTR_REGLIST_CONSEC_MEMOP, line)
+ m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP_IMM, line)
if m:
instructions.append(
- f'{m[1].lower()}({{{m[2]}-{m[3]}}}, mem[{m[4]}]{maybe_wb(m[5])}){sc} {m[6]}'
+ f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}'
)
continue
- m = re.match(INSTR_REGLIST_REPLICATE_MEMOP, line)
+ m = re.fullmatch(INSTR_REGLIST_CONSEC_MEMOP, line)
+ if m:
+ instructions.append(
+ f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}, mem[{m[4]}]{maybe_wb(m[5])}){sc} {m[6]}'
+ )
+ continue
+ m = re.fullmatch(INSTR_REGLIST_REPLICATE_MEMOP, line)
if m:
if m[5]:
instructions.append(
- f'{fix_replicate_instruction(m[1].lower())}({{{remove_brackets(m[2])}}}, mem[{m[4]}]++){sc} {m[6]}'
+ f'{fix_replicate_instruction(fix_instr_name(m[1]))}({{{remove_brackets(m[2])}}}, mem[{m[4]}]++){sc} {m[6]}'
)
else:
instructions.append(
- f'{fix_replicate_instruction(m[1].lower())}({{{remove_brackets(m[2])}}}, mem[{m[4]}]){sc} {m[6]}'
+ f'{fix_replicate_instruction(fix_instr_name(m[1]))}({{{remove_brackets(m[2])}}}, mem[{m[4]}]){sc} {m[6]}'
)
continue
- m = re.match(INSTR_REGLIST_INDEX_MEMOP, line)
+ m = re.fullmatch(INSTR_REGLIST_INDEX_MEMOP, line)
if m:
instructions.append(
- f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
+ f'{fix_instr_name(m[1])}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
)
continue
- m = re.match(P2ALIGN_RE, line)
+ m = re.fullmatch(P2ALIGN_RE, line)
if m:
instructions.append(f'align({1 << int(m[1])}){sc}')
continue
- m = re.match(INSTR_REG_FPSCR, line)
+ m = re.fullmatch(INSTR_REG_FPSCR, line)
if m:
- instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_PLD_MEMOP, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
+ continue
+ m = re.fullmatch(INSTR_PLD_MEMOP_OFFSET, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
+ continue
+ m = re.fullmatch(INSTR_REG_REG_REG_COND_RE, line)
+ if m:
+ instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, {m[4]}, k{m[5]}){sc} {m[6]}')
continue
# Keep empty lines for formatting
@@ -376,19 +475,25 @@
elif i.strip() == '':
print()
else:
- print(indent + fix_instr_name(i).rstrip())
+ print(indent + (i).rstrip())
print('}')
print('} // namespace')
- print('} // aarch32')
+ print(f'}} // {arch}')
print('} // xnnpack')
print('')
- print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code) {{')
- print(' using namespace xnnpack::aarch32;')
+ if kernel_type == GEMM:
+ print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {{')
+ else:
+ print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code, size_t nc, size_t kc, size_t ks, void* params) {{')
+ print(f' using namespace xnnpack::{arch};')
print(' Generator g(code);')
- print(' g.generate();')
+ if kernel_type == GEMM:
+ print(' g.generate(nc, kc, nullptr);')
+ else:
+ print(' g.generate(nc, kc, ks, nullptr);')
print(' g.finalize();')
- print(' if (g.error() != Error::kNoError) {')
+ print(' if (g.error() != xnnpack::Error::kNoError) {')
print(' return xnn_status_invalid_state;')
print(' }')
print(' return xnn_status_success;')
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc b/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc
new file mode 100644
index 0000000..5c0048e
--- /dev/null
+++ b/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc
@@ -0,0 +1,720 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/aarch64-assembler.h>
+#include <xnnpack/allocator.h>
+#include <xnnpack/gemm.h>
+
+namespace xnnpack {
+namespace aarch64 {
+namespace {
+class Generator : public Assembler {
+ using Assembler::Assembler;
+ public:
+ void generate(bool prefetch, size_t nc, size_t kc, void* params);
+};
+
+// void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(
+// size_t mr, x0
+// size_t nc, x1
+// size_t kc, x2 / x0
+// const uint8_t*restrict a, x3
+// size_t a_stride, x4
+// const void*restrict w, x5
+// uint8_t*restrict c, x6
+// size_t cm_stride, x7
+// size_t cn_stride, [sp] -> (x0)
+// const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) [sp + 8] -> x8
+
+// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+// A pointers
+// x3 a0
+// x9 a1
+// x10 a2
+// x11 a3
+// x12 a4
+// x4 a5
+
+// C pointers
+// x6 c0
+// x16 c1
+// x17 c2
+// x14 c3
+// x13 c4
+// x7 c5
+
+// Vector register usage
+// A0 v0 v6
+// A1 v1 v7
+// A2 v2 v8
+// A3 v3 v9
+// A4 v4 v10
+// A5 v5 v11
+// B v12 v13 v14 v15
+// B v16 v17 v18 v19
+// C v20 v21
+// C v22 v23
+// C v24 v25
+// C v26 v27
+// C v28 v29
+// C v30 v31
+// Clamp v6 v7
+
+// Converted from: src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
+void Generator::generate(bool prefetch, size_t nc, size_t kc, void* params) {
+ Label l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;
+
+
+ // Load params pointer
+ ldr(x8, mem[sp, 8]);
+
+ // Clamp A and C pointers / Save d8-d15 on stack
+ stp(d8, d9, mem[sp, -64]++);
+ cmp(x0, 2); // if mr < 2
+ add(x9, x3, x4); // a1 = a0 + a_stride
+ add(x16, x6, x7); // c1 = c0 + cm_stride
+ csel(x9, x3, x9, kLO); // a1 = a0
+ csel(x16, x6, x16, kLO); // c1 = c0
+
+ stp(d10, d11, mem[sp, 16]);
+ add(x10, x9, x4); // a2 = a1 + a_stride
+ add(x17, x16, x7); // c2 = c1 + cm_stride
+ // if mr <= 2
+ csel(x10, x9, x10, kLS); // a2 = a1
+ csel(x17, x16, x17, kLS); // c2 = c1
+
+ stp(d12, d13, mem[sp, 32]);
+ cmp(x0, 4); // if mr < 4
+ add(x11, x10, x4); // a3 = a2 + a_stride
+ add(x14, x17, x7); // c3 = c2 + cm_stride
+ csel(x11, x10, x11, kLO); // a3 = a2
+ csel(x14, x17, x14, kLO); // c3 = c2
+
+ stp(d14, d15, mem[sp, 48]);
+ add(x12, x11, x4); // a4 = a3 + a_stride
+ add(x13, x14, x7); // c4 = c3 + cm_stride
+ // if mr <= 4
+ csel(x12, x11, x12, kLS); // a4 = a3
+ csel(x13, x14, x13, kLS); // c4 = c3
+
+ cmp(x0, 6); // if mr < 6
+ add(x4, x12, x4); // a5 = a4 + a_stride
+ add(x7, x13, x7); // c5 = c4 + cm_stride
+ csel(x4, x12, x4, kLO); // a5 = a4
+ csel(x7, x13, x7, kLO); // c5 = c4
+
+ bind(l0);
+ // Load initial bias from w into accumulators
+ ldp(q20, q21, mem[x5], 32);
+ mov(v22.v16b(), v20.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 0]); // Prefetch B
+ }
+ mov(v23.v16b(), v21.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 64]);
+ }
+ mov(v24.v16b(), v20.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 128]);
+ }
+ mov(v25.v16b(), v21.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 192]);
+ }
+ mov(v26.v16b(), v20.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x3]); // Prefetch A
+ }
+ mov(v27.v16b(), v21.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x9]);
+ }
+ mov(v28.v16b(), v20.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x10]);
+ }
+ mov(v29.v16b(), v21.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x11]);
+ }
+ mov(v30.v16b(), v20.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x12]);
+ }
+ mov(v31.v16b(), v21.v16b());
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x4]);
+ }
+
+ // Are there at least 8 floats (32 bytes) for prologue + epilogue?
+ subs(x0, x2, 32); // k = kc - 32
+ b_lo(l4);
+
+ // Prologue - loads for main loop of 96 FMA
+ ldr(q0, mem[x3], 16);
+ ldr(q1, mem[x9], 16);
+ ldr(q2, mem[x10], 16);
+ ldr(q3, mem[x11], 16);
+ ldr(q4, mem[x12], 16);
+ ldr(q5, mem[x4], 16);
+ ldp(q12, q13, mem[x5], 32); // Fetch 3 B (4th deferred)
+ ldp(q14, q15, mem[x5], 32);
+ ldp(q16, q17, mem[x5], 32);
+
+ // Is there at least 8 floats (32 bytes) for main loop?
+ subs(x0, x0, 32);
+ b_lo(l2);
+
+ // Main loop - 8 floats of A (32 bytes)
+ // 96 FMA + 6 LDP A + 8 LDP B
+ bind(l1);
+ // First group of 4 A. 48 FMA.
+ fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+ ldp(q18, q19, mem[x5], 32); // Load last B
+ fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+
+ fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+ fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 128]); // Prefetch B
+ }
+ fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+ fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 256]);
+ }
+ fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+ fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+ ldr(q6, mem[x3], 16); // Load next 6 A
+ fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+ ldr(q7, mem[x9], 16);
+
+ fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+ fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+ fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+ ldr(q8, mem[x10], 16);
+ fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+ fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+ fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+ ldr(q9, mem[x11], 16);
+ fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+ fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+ fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+ ldr(q10, mem[x12], 16);
+ fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+ fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+ fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+ ldr(q11, mem[x4], 16);
+
+ fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+ fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+ fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+ ldp(q12, q13, mem[x5], 32); // Load 4 B
+ fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+ fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+ fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+ ldp(q14, q15, mem[x5], 32);
+ fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+ fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+ fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+ ldp(q16, q17, mem[x5], 32);
+ fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+ fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+ fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+ ldp(q18, q19, mem[x5], 32);
+
+ // Second group of 4 A. 48 FMA.
+ fmla(v20.v4s(), v12.v4s(), v6.s()[0]);
+ fmla(v22.v4s(), v12.v4s(), v7.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v8.s()[0]);
+ ldr(q0, mem[x3], 16); // Load next 6 A
+ fmla(v26.v4s(), v12.v4s(), v9.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v10.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v11.s()[0]);
+ ldr(q1, mem[x9], 16);
+ fmla(v21.v4s(), v13.v4s(), v6.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v7.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v8.s()[0]);
+ ldr(q2, mem[x10], 16);
+ fmla(v27.v4s(), v13.v4s(), v9.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v10.s()[0]);
+ fmla(v31.v4s(), v13.v4s(), v11.s()[0]);
+ ldr(q3, mem[x11], 16);
+
+ fmla(v20.v4s(), v14.v4s(), v6.s()[1]);
+ fmla(v22.v4s(), v14.v4s(), v7.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v8.s()[1]);
+ ldr(q4, mem[x12], 16);
+ fmla(v26.v4s(), v14.v4s(), v9.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v10.s()[1]);
+ fmla(v30.v4s(), v14.v4s(), v11.s()[1]);
+ ldr(q5, mem[x4], 16);
+ fmla(v21.v4s(), v15.v4s(), v6.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v7.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v8.s()[1]);
+ ldp(q12, q13, mem[x5], 32); // Load next 3 B (not last)
+ fmla(v27.v4s(), v15.v4s(), v9.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v10.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v11.s()[1]);
+ ldp(q14, q15, mem[x5], 32);
+
+ fmla(v20.v4s(), v16.v4s(), v6.s()[2]);
+ fmla(v22.v4s(), v16.v4s(), v7.s()[2]);
+ fmla(v24.v4s(), v16.v4s(), v8.s()[2]);
+ fmla(v26.v4s(), v16.v4s(), v9.s()[2]);
+ fmla(v28.v4s(), v16.v4s(), v10.s()[2]);
+ fmla(v30.v4s(), v16.v4s(), v11.s()[2]);
+ fmla(v21.v4s(), v17.v4s(), v6.s()[2]);
+ fmla(v23.v4s(), v17.v4s(), v7.s()[2]);
+ fmla(v25.v4s(), v17.v4s(), v8.s()[2]);
+ fmla(v27.v4s(), v17.v4s(), v9.s()[2]);
+ fmla(v29.v4s(), v17.v4s(), v10.s()[2]);
+ fmla(v31.v4s(), v17.v4s(), v11.s()[2]);
+ ldp(q16, q17, mem[x5], 32);
+
+ fmla(v20.v4s(), v18.v4s(), v6.s()[3]);
+ fmla(v22.v4s(), v18.v4s(), v7.s()[3]);
+ subs(x0, x0, 32);
+ fmla(v24.v4s(), v18.v4s(), v8.s()[3]);
+ fmla(v26.v4s(), v18.v4s(), v9.s()[3]);
+ fmla(v28.v4s(), v18.v4s(), v10.s()[3]);
+ fmla(v30.v4s(), v18.v4s(), v11.s()[3]);
+ fmla(v21.v4s(), v19.v4s(), v6.s()[3]);
+ fmla(v23.v4s(), v19.v4s(), v7.s()[3]);
+ fmla(v25.v4s(), v19.v4s(), v8.s()[3]);
+ fmla(v27.v4s(), v19.v4s(), v9.s()[3]);
+ fmla(v29.v4s(), v19.v4s(), v10.s()[3]);
+ fmla(v31.v4s(), v19.v4s(), v11.s()[3]);
+ b_hs(l1);
+
+ // Epilogue - 8 floats of A (32 bytes)
+ // 96 FMA + 6 LDP A + 8 LDP B
+ // First block same as main loop. Second block has no preloads.
+ bind(l2);
+ // First group of 4 A. 48 FMA.
+ fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+ ldp(q18, q19, mem[x5], 32); // Load last B
+ fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+
+ fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+ fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 128]); // Prefetch B
+ }
+ fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+ fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+ if (prefetch) {
+ prfm(kPLDL1KEEP, mem[x5, 256]);
+ }
+ fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+ fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+ ldr(q6, mem[x3], 16); // Load next 6 A
+ fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+ ldr(q7, mem[x9], 16);
+
+ fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+ fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+ fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+ ldr(q8, mem[x10], 16);
+ fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+ fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+ fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+ ldr(q9, mem[x11], 16);
+ fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+ fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+ fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+ ldr(q10, mem[x12], 16);
+ fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+ fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+ fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+ ldr(q11, mem[x4], 16);
+
+ fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+ fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+ fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+ ldp(q12, q13, mem[x5], 32); // Load 4 B
+ fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+ fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+ fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+ ldp(q14, q15, mem[x5], 32);
+ fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+ fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+ fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+ ldp(q16, q17, mem[x5], 32);
+ fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+ fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+ fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+ ldp(q18, q19, mem[x5], 32);
+
+ // Second group of 4 A. 48 FMA.
+ fmla(v20.v4s(), v12.v4s(), v6.s()[0]);
+ fmla(v22.v4s(), v12.v4s(), v7.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v8.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v9.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v10.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v11.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v6.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v7.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v8.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v9.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v10.s()[0]);
+ fmla(v31.v4s(), v13.v4s(), v11.s()[0]);
+
+ fmla(v20.v4s(), v14.v4s(), v6.s()[1]);
+ fmla(v22.v4s(), v14.v4s(), v7.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v8.s()[1]);
+ fmla(v26.v4s(), v14.v4s(), v9.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v10.s()[1]);
+ fmla(v30.v4s(), v14.v4s(), v11.s()[1]);
+ fmla(v21.v4s(), v15.v4s(), v6.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v7.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v8.s()[1]);
+ fmla(v27.v4s(), v15.v4s(), v9.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v10.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v11.s()[1]);
+
+ fmla(v20.v4s(), v16.v4s(), v6.s()[2]);
+ fmla(v22.v4s(), v16.v4s(), v7.s()[2]);
+ fmla(v24.v4s(), v16.v4s(), v8.s()[2]);
+ fmla(v26.v4s(), v16.v4s(), v9.s()[2]);
+ fmla(v28.v4s(), v16.v4s(), v10.s()[2]);
+ fmla(v30.v4s(), v16.v4s(), v11.s()[2]);
+ fmla(v21.v4s(), v17.v4s(), v6.s()[2]);
+ fmla(v23.v4s(), v17.v4s(), v7.s()[2]);
+ fmla(v25.v4s(), v17.v4s(), v8.s()[2]);
+ fmla(v27.v4s(), v17.v4s(), v9.s()[2]);
+ fmla(v29.v4s(), v17.v4s(), v10.s()[2]);
+ fmla(v31.v4s(), v17.v4s(), v11.s()[2]);
+
+ fmla(v20.v4s(), v18.v4s(), v6.s()[3]);
+ fmla(v22.v4s(), v18.v4s(), v7.s()[3]);
+ fmla(v24.v4s(), v18.v4s(), v8.s()[3]);
+ fmla(v26.v4s(), v18.v4s(), v9.s()[3]);
+ fmla(v28.v4s(), v18.v4s(), v10.s()[3]);
+ fmla(v30.v4s(), v18.v4s(), v11.s()[3]);
+ fmla(v21.v4s(), v19.v4s(), v6.s()[3]);
+ fmla(v23.v4s(), v19.v4s(), v7.s()[3]);
+
+ // Load min/max values
+ ld2r({v6.v4s(), v7.v4s()}, mem[x8]);
+
+ fmla(v25.v4s(), v19.v4s(), v8.s()[3]);
+ fmla(v27.v4s(), v19.v4s(), v9.s()[3]);
+ // Is there a remainder? - 4 floats of A (16 bytes) or less
+ tst(x0, 31);
+ fmla(v29.v4s(), v19.v4s(), v10.s()[3]);
+ fmla(v31.v4s(), v19.v4s(), v11.s()[3]);
+ b_ne(l4);
+
+ // Clamp
+ bind(l3);
+ fmax(v20.v4s(), v20.v4s(), v6.v4s());
+ // Load cn_stride
+ ldr(x0, mem[sp, 64]);
+ fmax(v21.v4s(), v21.v4s(), v6.v4s());
+ fmax(v22.v4s(), v22.v4s(), v6.v4s());
+ fmax(v23.v4s(), v23.v4s(), v6.v4s());
+ fmax(v24.v4s(), v24.v4s(), v6.v4s());
+ fmax(v25.v4s(), v25.v4s(), v6.v4s());
+ fmax(v26.v4s(), v26.v4s(), v6.v4s());
+ fmax(v27.v4s(), v27.v4s(), v6.v4s());
+ fmax(v28.v4s(), v28.v4s(), v6.v4s());
+ fmax(v29.v4s(), v29.v4s(), v6.v4s());
+ fmax(v30.v4s(), v30.v4s(), v6.v4s());
+ fmax(v31.v4s(), v31.v4s(), v6.v4s());
+ subs(x1, x1, 8);
+ fmin(v20.v4s(), v20.v4s(), v7.v4s());
+ fmin(v21.v4s(), v21.v4s(), v7.v4s());
+ fmin(v22.v4s(), v22.v4s(), v7.v4s());
+ fmin(v23.v4s(), v23.v4s(), v7.v4s());
+ fmin(v24.v4s(), v24.v4s(), v7.v4s());
+ fmin(v25.v4s(), v25.v4s(), v7.v4s());
+ fmin(v26.v4s(), v26.v4s(), v7.v4s());
+ fmin(v27.v4s(), v27.v4s(), v7.v4s());
+ fmin(v28.v4s(), v28.v4s(), v7.v4s());
+ fmin(v29.v4s(), v29.v4s(), v7.v4s());
+ fmin(v30.v4s(), v30.v4s(), v7.v4s());
+ fmin(v31.v4s(), v31.v4s(), v7.v4s());
+
+ // Store full 6 x 8
+ b_lo(l7);
+
+ stp(q20, q21, mem[x6]);
+ add(x6, x6, x0);
+ sub(x3, x3, x2); // a0 -= kc
+ stp(q22, q23, mem[x16]);
+ add(x16, x16, x0);
+ sub(x9, x9, x2); // a1 -= kc
+ stp(q24, q25, mem[x17]);
+ add(x17, x17, x0);
+ sub(x10, x10, x2); // a2 -= kc
+ stp(q26, q27, mem[x14]);
+ add(x14, x14, x0);
+ sub(x11, x11, x2); // a3 -= kc
+ stp(q28, q29, mem[x13]);
+ add(x13, x13, x0);
+ sub(x12, x12, x2); // a4 -= kc
+ stp(q30, q31, mem[x7]);
+ add(x7, x7, x0);
+ sub(x4, x4, x2); // a5 -= kc
+
+ b_hi(l0);
+
+ // Restore d8-d15 from stack
+ ldp(d14, d15, mem[sp, 48]);
+ ldp(d12, d13, mem[sp, 32]);
+ ldp(d10, d11, mem[sp, 16]);
+ ldp(d8, d9, mem[sp], 64);
+ ret();
+
+ bind(l4);
+ // Load min/max values
+ ld2r({v6.v4s(), v7.v4s()}, mem[x8]);
+
+ // Is there a remainder? - 4 floats of A (16 bytes)
+ tbz(x0, 4, l5);
+
+ // Remainder- 4 floats of A (16 bytes)
+ // Load A
+ ldr(q0, mem[x3], 16);
+ ldr(q1, mem[x9], 16);
+ ldr(q2, mem[x10], 16);
+ ldr(q3, mem[x11], 16);
+ ldr(q4, mem[x12], 16);
+ ldr(q5, mem[x4], 16);
+ // Load B
+ ldp(q12, q13, mem[x5], 32);
+ ldp(q14, q15, mem[x5], 32);
+ ldp(q16, q17, mem[x5], 32);
+ ldp(q18, q19, mem[x5], 32);
+
+ fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+ fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+ fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+
+ fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+ fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+ fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+ fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+ fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+ fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+
+ fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+ fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+ fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+ fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+ fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+ fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+ fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+ fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+ fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+ fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+ fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+ fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+
+ fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+ fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+ fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+ fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+ fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+ fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+ fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+ fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+ fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+ fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+ fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+ fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+
+ // Is there a remainder? - 2 floats of A (8 bytes)
+ bind(l5);
+ tbz(x0, 3, l6);
+
+ // Remainder- 2 floats of A (8 bytes)
+ // Load A
+ ldr(d0, mem[x3], 8);
+ ldr(d1, mem[x9], 8);
+ ldr(d2, mem[x10], 8);
+ ldr(d3, mem[x11], 8);
+ ldr(d4, mem[x12], 8);
+ ldr(d5, mem[x4], 8);
+ // Load B
+ ldp(q12, q13, mem[x5], 32);
+ ldp(q14, q15, mem[x5], 32);
+
+ fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+ fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+ fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+
+ fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+ fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+ fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+ fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+ fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+ fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+ fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+ fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+ fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+ fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+ fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+ fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+
+ // Is there a remainder? - 1 float of A (4 bytes)
+ bind(l6);
+ tbz(x0, 2, l3);
+
+ // Remainder- 1 float of A (4 bytes)
+ // Load A
+ ldr(s0, mem[x3], 4);
+ ldr(s1, mem[x9], 4);
+ ldr(s2, mem[x10], 4);
+ ldr(s3, mem[x11], 4);
+ ldr(s4, mem[x12], 4);
+ ldr(s5, mem[x4], 4);
+ // Load B
+ ldp(q12, q13, mem[x5], 32);
+
+ fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+ fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+ fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+ fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+ fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+ fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+ fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+ fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+ fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+ fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+ fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+ fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+ b(l3);
+
+ // Store odd width
+ bind(l7);
+ tbz(x1, 2, l8);
+ str(q20, mem[x6], 16);
+ mov(v20.v16b(), v21.v16b());
+ str(q22, mem[x16], 16);
+ mov(v22.v16b(), v23.v16b());
+ str(q24, mem[x17], 16);
+ mov(v24.v16b(), v25.v16b());
+ str(q26, mem[x14], 16);
+ mov(v26.v16b(), v27.v16b());
+ str(q28, mem[x13], 16);
+ mov(v28.v16b(), v29.v16b());
+ str(q30, mem[x7], 16);
+ mov(v30.v16b(), v31.v16b());
+ bind(l8);
+ tbz(x1, 1, l9);
+ str(d20, mem[x6], 8);
+ str(d22, mem[x16], 8);
+ dup(d20, v20.d()[1]);
+ dup(d22, v22.d()[1]);
+ str(d24, mem[x17], 8);
+ str(d26, mem[x14], 8);
+ dup(d24, v24.d()[1]);
+ dup(d26, v26.d()[1]);
+ str(d28, mem[x13], 8);
+ str(d30, mem[x7], 8);
+ dup(d28, v28.d()[1]);
+ dup(d30, v30.d()[1]);
+
+ bind(l9);
+ tbz(x1, 0, l10);
+ str(s20, mem[x6]);
+ str(s22, mem[x16]);
+ str(s24, mem[x17]);
+ str(s26, mem[x14]);
+ str(s28, mem[x13]);
+ str(s30, mem[x7]);
+ bind(l10);
+ // Restore d8-d15 from stack
+ ldp(d14, d15, mem[sp, 48]);
+ ldp(d12, d13, mem[sp, 32]);
+ ldp(d10, d11, mem[sp, 16]);
+ ldp(d8, d9, mem[sp], 64);
+ ret();
+
+
+}
+} // namespace
+} // namespace aarch64
+} // namespace xnnpack
+
+xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {
+ using namespace xnnpack::aarch64;
+ Generator g(code);
+ g.generate(false, nc, kc, nullptr);
+ g.finalize();
+ if (g.error() != xnnpack::Error::kNoError) {
+ return xnn_status_invalid_state;
+ }
+ return xnn_status_success;
+}
+
+xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {
+ using namespace xnnpack::aarch64;
+ Generator g(code);
+ g.generate(true, nc, kc, nullptr);
+ g.finalize();
+ if (g.error() != xnnpack::Error::kNoError) {
+ return xnn_status_invalid_state;
+ }
+ return xnn_status_success;
+}
diff --git a/src/xnnpack/aarch64-assembler.h b/src/xnnpack/aarch64-assembler.h
index 7cf7a49..d6bf34b 100644
--- a/src/xnnpack/aarch64-assembler.h
+++ b/src/xnnpack/aarch64-assembler.h
@@ -284,12 +284,10 @@
// - ldp(x0, x1, mem[rn], offset); // post-indexed
constexpr MemOperandHelper mem;
-enum class PrefetchOp {
+enum PrefetchOp {
kPLDL1KEEP = 0
};
-constexpr PrefetchOp PLDL1KEEP = PrefetchOp::kPLDL1KEEP;
-
enum Condition : uint32_t {
kEQ = 0x0,
kNE = 0x1,
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
index 663099d..f3b7767 100644
--- a/src/xnnpack/common.h
+++ b/src/xnnpack/common.h
@@ -110,7 +110,7 @@
#define XNN_PLATFORM_WINDOWS 0
#endif
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS
+#if (XNN_ARCH_ARM || XNN_ARCH_ARM64) && !XNN_PLATFORM_IOS
#define XNN_PLATFORM_JIT 1
#else
#define XNN_PLATFORM_JIT 0
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 28c9f82..bdf5005 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1625,6 +1625,9 @@
enum xnn_status xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
enum xnn_status xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
+enum xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
+enum xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/test/aarch64-assembler.cc b/test/aarch64-assembler.cc
index fbf8332..d52113b 100644
--- a/test/aarch64-assembler.cc
+++ b/test/aarch64-assembler.cc
@@ -53,9 +53,9 @@
EXPECT_ERROR(Error::kInvalidOperand, a.ldr(x8, mem[sp, 32768]));
EXPECT_ERROR(Error::kInvalidOperand, a.ldr(x8, MemOperand(sp, 16, AddressingMode::kPostIndex)));
- CHECK_ENCODING(0xF98000A0, a.prfm(PLDL1KEEP, mem[x5]));
- EXPECT_ERROR(Error::kInvalidOperand, a.prfm(PLDL1KEEP, mem[x5, -8]));
- EXPECT_ERROR(Error::kInvalidOperand, a.prfm(PLDL1KEEP, mem[x5, 32761]));
+ CHECK_ENCODING(0xF98000A0, a.prfm(kPLDL1KEEP, mem[x5]));
+ EXPECT_ERROR(Error::kInvalidOperand, a.prfm(kPLDL1KEEP, mem[x5, -8]));
+ EXPECT_ERROR(Error::kInvalidOperand, a.prfm(kPLDL1KEEP, mem[x5, 32761]));
CHECK_ENCODING(0xD65F03C0, a.ret());
diff --git a/test/f32-gemm-minmax-2.cc b/test/f32-gemm-minmax-2.cc
index e96dddd..a6d8f04 100644
--- a/test/f32-gemm-minmax-2.cc
+++ b/test/f32-gemm-minmax-2.cc
@@ -33589,3 +33589,504 @@
.Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, xnn_init_f32_minmax_scalar_params);
}
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+
+
+#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+#endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index 6f53017..26690e5 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -30588,3 +30588,504 @@
.Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+
+
+#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+ }
+ }
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+
+ TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+ }
+#endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
diff --git a/test/f32-gemm-minmax.yaml b/test/f32-gemm-minmax.yaml
index 1e10bcf..fc0a9bc 100644
--- a/test/f32-gemm-minmax.yaml
+++ b/test/f32-gemm-minmax.yaml
@@ -524,3 +524,11 @@
init: xnn_init_f32_minmax_scalar_params
k-block: 2
assembly: true
+- name: xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75
+ init: xnn_init_f32_minmax_scalar_params
+ k-block: 8
+ pipelined: true
+- name: xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75
+ init: xnn_init_f32_minmax_scalar_params
+ k-block: 8
+ pipelined: true
diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc
index 1bd23c8..d91dd55 100644
--- a/test/gemm-microkernel-tester.cc
+++ b/test/gemm-microkernel-tester.cc
@@ -1650,7 +1650,7 @@
jit_gemm_params p = (jit_gemm_params) {
.f32_minmax = params
};
- ASSERT_EQ(xnn_status_success, gemm_generator(&code_buffer,n(), k() * sizeof(float), &p));
+ ASSERT_EQ(xnn_status_success, gemm_generator(&code_buffer, n(), k() * sizeof(float), &p));
xnn_f32_gemm_minmax_ukernel_function gemm_minmax = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);
gemm_minmax(m(), n(), k() * sizeof(float),