Fix conversion script for aarch64 assembly kernels and convert a single F32 GEMM as a test

PiperOrigin-RevId: 424212216
diff --git a/BUILD.bazel b/BUILD.bazel
index 7b10d47..f76f50e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7314,6 +7314,10 @@
     "src/qs8-igemm/4x8c4-rndnu-aarch32-neondot-ld64.cc",
 ]
 
+JIT_AARCH64_SRCS = [
+    "src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc",
+]
+
 INTERNAL_MICROKERNEL_HDRS = [
     "src/xnnpack/allocator.h",
     "src/xnnpack/argmaxpool.h",
@@ -9169,6 +9173,7 @@
         "src/xnnpack/assembler.h",
     ],
     aarch32_srcs = JIT_AARCH32_SRCS,
+    aarch64_srcs = JIT_AARCH64_SRCS,
     msvc_copts = xnnpack_msvc_std_copts(),
     deps = [
         ":logging_utils",
@@ -9189,6 +9194,7 @@
         "src/xnnpack/assembler.h",
     ],
     aarch32_srcs = JIT_AARCH32_SRCS,
+    aarch64_srcs = JIT_AARCH64_SRCS,
     copts = [
         "-UNDEBUG",
         "-DXNN_TEST_MODE=1",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f85c746..44c534e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -283,6 +283,9 @@
   src/qs8-igemm/4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc
   src/qs8-igemm/4x8c4-rndnu-aarch32-neondot-ld64.cc)
 
+SET(JIT_AARCH64_SRCS
+  src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc)
+
 SET(PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS
   src/params-init.c
   src/u8-lut32norm/scalar.c
@@ -6066,6 +6069,7 @@
     LIST(APPEND PROD_MICROKERNEL_SRCS ${AARCH64_ASM_MICROKERNEL_SRCS})
     LIST(APPEND ALL_MICROKERNEL_SRCS ${AARCH64_ASM_MICROKERNEL_SRCS})
   ENDIF()
+  LIST(APPEND JIT_SRCS ${JIT_AARCH64_SRCS})
 ENDIF()
 IF(XNNPACK_TARGET_PROCESSOR MATCHES "^(i[3-6]86|x86_64|AMD64)$" OR IOS_ARCH MATCHES "^(i386|x86_64|AMD64)$")
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_SSE_MICROKERNEL_SRCS})
diff --git a/scripts/convert-assembly-to-jit.py b/scripts/convert-assembly-to-jit.py
old mode 100644
new mode 100755
index d3b648d..51e19f5
--- a/scripts/convert-assembly-to-jit.py
+++ b/scripts/convert-assembly-to-jit.py
@@ -18,7 +18,7 @@
 COMMENTS = SPACES + '((//\s+.+)|)$'
 WB = r'!'
 
-REG_NO_GROUP = r'r\d+|s\d+|d\d+|q\d+|sp|lr|pc'
+REG_NO_GROUP = r'r\d+|s\d+|d\d+|q\d+|sp|lr|pc|x\d+|(?:v\d+\.(?:\d+)?(?:d|s|h|b))'
 REG = r'(' + REG_NO_GROUP + ')'
 IMM_NO_GROUP = r'\d+'
 IMM = r'(' + IMM_NO_GROUP + ')'
@@ -26,7 +26,7 @@
 REG_OR_IMM = r'(' + REG_LANE_NO_GROUP + '|' + REG_NO_GROUP + '|' + IMM_NO_GROUP + ')'
 
 REGLIST_CONSEC = r'\{(\w+)-(\w+)\}' + SPACES
-REGLIST_INDIV = r'\{(\w+(?:,\s+\w+)*)\}' + SPACES
+REGLIST_INDIV = r'\{([\w.]+(?:,\s+[\w.]+)*)\}' + SPACES
 REGLIST_INDIV_REPLICATE = r'\{(\w+(?:\[\])(,\s*\w+(?:\[\]))*)\}' + SPACES
 REGLIST_INDEX = r'\{(' + REG_LANE_NO_GROUP + ')\}' + SPACES
 
@@ -35,14 +35,15 @@
 
 MEMOP = r'\[' + SPACES + REG + '\]' + SPACES
 MEMOP_MAYBE_WB = r'\[' + SPACES + REG + '\]' + f'({WB})?'
-MEMOP_OFFSET = r'\[' + REG + COMMA + '(\d+)\]' + SPACES
+MEMOP_OFFSET = r'\[' + REG + COMMA + '(-?\d+)\]' + SPACES
+MEMOP_OFFSET_MAYBE_WB = r'\[' + REG + COMMA + '(-?\d+)\]' + f'({WB})?' + SPACES
 
 B_IMM = r'(\d+)(f|b)'
 
 INSTR = SPACES + r'([A-Z0-9.]+)' + SPACES
 
 # e.g. #ifndef __APPLE__
-IFDEF_RE = re.compile(r'\s*#(ifndef|endif|ifdef)')
+IFDEF_RE = re.compile(r'\s*#(ifndef|endif|ifdef).*')
 # e.g. # Push 96 bytes
 COMMENT_RE = re.compile(SPACES + r'((//|#)\s*.+)')
 # e.g. 0:
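Two of the pattern changes above do most of the aarch64 work: `REG_NO_GROUP` now accepts `x`-registers and `v`-registers with an optional lane count plus datatype suffix, and the new `MEMOP_OFFSET_MAYBE_WB` allows negative immediates and captures the `!` writeback marker (rendered later as `mem[...]++`). A minimal sanity check, with the patterns re-declared locally and `COMMA` assumed to match a comma plus optional whitespace as in the script:

```python
import re

SPACES = r'\s*'
COMMA = r',' + SPACES   # assumed to mirror the script's definition
REG_NO_GROUP = r'r\d+|s\d+|d\d+|q\d+|sp|lr|pc|x\d+|(?:v\d+\.(?:\d+)?(?:d|s|h|b))'
REG = r'(' + REG_NO_GROUP + ')'
MEMOP_OFFSET_MAYBE_WB = r'\[' + REG + COMMA + r'(-?\d+)\]' + r'(!)?' + SPACES

assert re.fullmatch(REG, 'x9')       # aarch64 general-purpose register
assert re.fullmatch(REG, 'v20.4s')   # vector register, lane count + datatype
assert re.fullmatch(REG, 'v6.s')     # vector register, datatype only

m = re.match(MEMOP_OFFSET_MAYBE_WB, '[sp, -64]!')
assert m.group(1, 2, 3) == ('sp', '-64', '!')   # e.g. STP d8, d9, [sp, -64]!
```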
@@ -57,18 +58,28 @@
 INSTR_OP_RE = re.compile(INSTR + REG + COMMENTS)
 # e.g. BLO 2f
 INSTR_B_IMM = re.compile(INSTR + B_IMM + COMMENTS)
+# e.g. TBNZ x0, 4, 5f
+INSTR_B_REG_IMM_IMM = re.compile(INSTR + REG + COMMA + IMM + COMMA + B_IMM + COMMENTS)
 # e.g. .p2align 3
 P2ALIGN_RE = re.compile(SPACES + r'\.p2align\s+(\d+)')
 # e.g. CMP r0, 2
 INSTR_REG_IMM_RE = re.compile(INSTR + REG + COMMA + IMM + COMMENTS)
 # e.g. LDR r0, [r12]
 INSTR_REG_MEMOP_RE = re.compile(INSTR + REG + COMMA + MEMOP + COMMENTS)
+# e.g. LDR q0, [x4], 16
+INSTR_REG_MEMOP_IMM_RE = re.compile(INSTR + REG + COMMA + MEMOP + COMMA + IMM + COMMENTS)
 # e.g. LDR r0, [sp, 112]
 INSTR_REG_MEMOP_OFFSET_RE = re.compile(INSTR + REG + COMMA + MEMOP_OFFSET +
                                        COMMENTS)
-# e.g. LDRD r6, r7, [sp, 104]
+# e.g. LDRD r6, r7, [sp]
+INSTR_REG_REG_MEMOP_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
+                                           MEMOP + COMMENTS)
+# e.g. LDRD r6, r7, [sp, 104], STP d8, d9, [sp, -64]!
 INSTR_REG_REG_MEMOP_OFFSET_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
-                                           MEMOP_OFFSET + COMMENTS)
+                                           MEMOP_OFFSET_MAYBE_WB + COMMENTS)
+# e.g. LDP q20, q21, [x5], 32
+INSTR_REG_REG_MEMOP_IMM_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
+                                           MEMOP + COMMA + IMM + COMMENTS)
 # e.g. PLD [r4, 64]
 INSTR_MEMOP_OFFSET_RE = re.compile(INSTR + MEMOP_OFFSET + COMMENTS)
 # e.g. movlo r12, r3, vdup.32 q0, d14[0]
@@ -97,6 +108,9 @@
 # e.g. VLD1.32 {d0}, [r3]{!}
 INSTR_REGLIST_INDIV_MEMOP = re.compile(INSTR + REGLIST_INDIV + COMMA +
                                        MEMOP_MAYBE_WB + COMMENTS)
+# e.g. LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48
+INSTR_REGLIST_INDIV_MEMOP_IMM = re.compile(INSTR + REGLIST_INDIV + COMMA +
+                                       MEMOP + COMMA + IMM + COMMENTS)
 # e.g. VST1.32 {d24-d25}, [r11]{!}
 INSTR_REGLIST_CONSEC_MEMOP = re.compile(INSTR + REGLIST_CONSEC + COMMA +
                                         MEMOP_MAYBE_WB + COMMENTS)
@@ -110,17 +124,26 @@
 INSTR_REG_FPSCR = re.compile(INSTR + f'({APSR}|{REG_NO_GROUP})' + COMMA +
                              FPSCR + COMMENTS)
 
+# e.g. PRFM PLDL1KEEP, [x5]
+INSTR_PLD_MEMOP = re.compile(INSTR + '(PLDL1KEEP)' + COMMA + MEMOP + COMMENTS)
+# e.g. PRFM PLDL1KEEP, [x5, 64]
+INSTR_PLD_MEMOP_OFFSET = re.compile(INSTR + '(PLDL1KEEP)' + COMMA + MEMOP_OFFSET + COMMENTS)
+
+COND = r'([A-Z]+)'
+# e.g. CSEL x9, x3, x9, LO
+INSTR_REG_REG_REG_COND_RE = re.compile(INSTR + REG + COMMA + REG + COMMA + REG + COMMA + COND + COMMENTS)
+
 
 def remove_brackets(s):
   return s.replace('[', '').replace(']', '')
 
 
 def fix_replicate_instruction(s):
-  return re.sub(r'\.(\d+)', r'r.\1', s, 1)
+  return re.sub(r'_(\d+)', r'r_\1', s, 1)
 
 
 def fix_instr_name(s):
-  return s.replace('.', '_', 2).replace('and', 'and_', 1)
+  return s.lower().replace('.', '_', 2).replace('and', 'and_', 1)
 
 
 def fix_comments(s):
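`fix_instr_name` now lowercases the mnemonic itself, so every dispatch rule below can call it in place of `.lower()`, and `fix_replicate_instruction` is rewritten to run on the already-underscored name it now receives. A small illustration of the composed behavior (both helpers copied verbatim from the patch):

```python
import re

def fix_instr_name(s):
  return s.lower().replace('.', '_', 2).replace('and', 'and_', 1)

def fix_replicate_instruction(s):
  return re.sub(r'_(\d+)', r'r_\1', s, 1)

assert fix_instr_name('FMLA') == 'fmla'
assert fix_instr_name('B.LO') == 'b_lo'    # condition code becomes a suffix
assert fix_instr_name('AND') == 'and_'     # sidestep the C++ keyword
assert fix_replicate_instruction(fix_instr_name('VLD1.32')) == 'vld1r_32'
```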
@@ -140,10 +163,39 @@
   return f'xnn_generate_{name}'
 
 
+def fix_regs(regs):
+  # Vector registers with datatype need to be method calls.
+  # e.g. v2.4s -> v2.v4s(), v2.s -> v2.s()
+  def repl(m):
+    if m.group(2):
+      return f'{m[1]}v{m[2]}{m[3]}()'
+    else:
+      return f'{m[1]}{m[3]}()'
+  return re.sub(r'(\w+\.)(\d+)?(\w+)', repl, regs)
+
+
 IGNORE_LINES = [r'\s*\.\w+']
 
+AARCH32 = 'aarch32'
+AARCH64 = 'aarch64'
+GEMM = 'GEMM'
+IGEMM = 'IGEMM'
 
 def main(input_file):
+  arch = None
+  kernel_type = GEMM
+
+  if 'aarch32' in input_file:
+    arch = AARCH32
+  elif 'aarch64' in input_file:
+    arch = AARCH64
+  else:
+    print('ERROR: unknown architecture')
+    sys.exit(1)
+
+  if 'igemm' in input_file:
+    kernel_type = IGEMM
+
   # Whether we are in the copyright section.
   in_copyright = False
   # Whether we are in the microkernel function.
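The new `fix_regs` helper rewrites AArch64 vector operands into the assembler's method-call spelling (`v2.4s` becomes `v2.v4s()`, `v2.s` becomes `v2.s()`), which is the form the generated kernel uses. A standalone sketch, with the helper copied verbatim:

```python
import re

def fix_regs(regs):
  # Vector registers with a datatype become method calls; others pass through.
  def repl(m):
    if m.group(2):
      return f'{m[1]}v{m[2]}{m[3]}()'
    else:
      return f'{m[1]}{m[3]}()'
  return re.sub(r'(\w+\.)(\d+)?(\w+)', repl, regs)

assert fix_regs('v20.4s, v12.4s') == 'v20.v4s(), v12.v4s()'
assert fix_regs('v6.s') == 'v6.s()'
assert fix_regs('x0') == 'x0'   # plain registers are left alone
```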
@@ -163,6 +215,7 @@
 
   with open(input_file, 'r', encoding='utf-8') as f:
     for line in f:
+      line = line.rstrip()
 
       # Handle all lines before the microkernel instructions begin.
       if not in_function:
@@ -172,8 +225,11 @@
         elif 'BEGIN_FUNCTION' in line:
           in_function = True
           fn_name = line.split()[1]
-          prologue.append(f'// Converted from: {input_file}')
-          prologue.append('void Generator::generate() {')
+          prologue.append(f'// Converted from: {input_file[20:]}')  # drop the 20-char checkout prefix so the path is repo-relative
+          if kernel_type == GEMM:
+            prologue.append('void Generator::generate(size_t nc, size_t kc, void* params) {')
+          else:
+            prologue.append('void Generator::generate(size_t nc, size_t kc, size_t ks, void* params) {')
           continue
         elif 'Copyright ' in line:
           in_autogen = False
@@ -183,164 +239,207 @@
                      1).rstrip())
           continue
         elif '#include <xnnpack/assembly.h>' in line:
-          prologue.append('#include <xnnpack/aarch32-assembler.h>')
+          prologue.append(f'#include <xnnpack/{arch}-assembler.h>')
           prologue.append('#include <xnnpack/allocator.h>')
-          if 'igemm' in input_file:
-            prologue.append('#include <xnnpack/igemm.h>')
-          elif 'gemm' in input_file:
+          if kernel_type == GEMM:
             prologue.append('#include <xnnpack/gemm.h>')
+          else:
+            prologue.append('#include <xnnpack/igemm.h>')
           prologue.append('')
           prologue.append('namespace xnnpack {')
-          prologue.append('namespace aarch32 {')
+          prologue.append(f'namespace {arch} {{')
           prologue.append('namespace {')
           prologue.append('class Generator : public Assembler {')
           prologue.append('  using Assembler::Assembler;')
           prologue.append(' public:')
-          prologue.append('  void generate();')
+          if kernel_type == GEMM:
+            prologue.append('  void generate(size_t nc, size_t kc, void* params);')
+          else:
+            prologue.append('  void generate(size_t nc, size_t kc, size_t ks, void* params);')
           prologue.append('};')
           continue
-        elif any(re.match(p, line) for p in IGNORE_LINES):
+        elif any(re.fullmatch(p, line) for p in IGNORE_LINES):
           continue
         elif in_autogen:
           continue
         else:
-          prologue.append(line.rstrip())
+          prologue.append(fix_comments(line.rstrip()))
           continue
 
       # We are now in the microkernel function body.
       # Don't keep the ifdefs.
-      m = re.match(IFDEF_RE, line)
+      m = re.fullmatch(IFDEF_RE, line)
       if m:
         continue
       # But keep other comments.
-      m = re.match(COMMENT_RE, line)
+      m = re.fullmatch(COMMENT_RE, line)
       if m:
         instructions.append(m[1])
         continue
 
-      m = re.match(LABEL, line)
+      m = re.fullmatch(LABEL, line)
       if m:
         labels.append(m[1])
         instructions.append(f'bind(l{m[1]}){sc}')
         continue
-      m = re.match(INSTR_RE, line)
+      m = re.fullmatch(INSTR_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}(){sc} {m[2]}')
+        instructions.append(f'{fix_instr_name(m[1])}(){sc} {m[2]}')
         continue
-      m = re.match(INSTR_OP_RE, line)
+      m = re.fullmatch(INSTR_OP_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}({m[2]}){sc} {m[3]}')
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}){sc} {m[3]}')
         continue
-      m = re.match(INSTR_REGLIST_CONSEC_MEMOP_REG, line)
+      m = re.fullmatch(INSTR_REGLIST_CONSEC_MEMOP_REG, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({{{m[2]}-{m[3]}}}, mem[{m[4]}], {m[5]}){sc} {m[6]}'
+            f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}, mem[{m[4]}], {m[5]}){sc} {m[6]}'
         )
         continue
-      m = re.match(INSTR_REGLIST_INDIV_MEMOP_REG, line)
+      m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP_REG, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
+            f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
         continue
-      m = re.match(INSTR_REGLIST_CONSEC_RE, line)
+      m = re.fullmatch(INSTR_REGLIST_CONSEC_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}({{{m[2]}-{m[3]}}}){sc} {m[4]}')
+        instructions.append(f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}){sc} {m[4]}')
         continue
-      m = re.match(INSTR_REGLIST_LIST_RE, line)
+      m = re.fullmatch(INSTR_REGLIST_LIST_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}({{{m[2]}}}){sc} {m[3]}')
+        instructions.append(f'{fix_instr_name(m[1])}({{{m[2]}}}){sc} {m[3]}')
         continue
-      m = re.match(INSTR_MEMOP_OFFSET_RE, line)
+      m = re.fullmatch(INSTR_MEMOP_OFFSET_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}(mem[{m[2]}, {m[3]}]){sc} {m[4]}')
+        instructions.append(f'{fix_instr_name(m[1])}(mem[{m[2]}, {m[3]}]){sc} {m[4]}')
         continue
-      m = re.match(INSTR_REG_MEMOP_RE, line)
+      m = re.fullmatch(INSTR_REG_MEMOP_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
         continue
-      m = re.match(INSTR_REG_MEMOP_OFFSET_RE, line)
+      m = re.fullmatch(INSTR_REG_MEMOP_IMM_RE, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}], {m[4]}){sc} {m[5]}')
+        continue
+      m = re.fullmatch(INSTR_REG_MEMOP_OFFSET_RE, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
+            f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
         continue
-      m = re.match(INSTR_REG_REG_MEMOP_OFFSET_RE, line)
+      m = re.fullmatch(INSTR_REG_REG_MEMOP_RE, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]){sc} {m[6]}')
+            f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}]){sc} {m[5]}')
         continue
-      m = re.match(INSTR_REG_IMM_RE, line)
+      m = re.fullmatch(INSTR_REG_REG_MEMOP_OFFSET_RE, line)
       if m:
-        instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
+        if m[6]:  # writeback
+          instructions.append(
+              f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]++){sc} {m[7]}')
+        else:  # no writeback
+          instructions.append(
+              f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}, {m[5]}]){sc} {m[7]}')
         continue
-      m = re.match(INSTR_REG_REG_REG_RE, line)
+      m = re.fullmatch(INSTR_REG_REG_MEMOP_IMM_RE, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {m[3]}, {m[4]}){sc} {m[5]}')
+            f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}], {m[5]}){sc} {m[6]}')
         continue
-      m = re.match(INSTR_REG_REG_REG_IMM_RE, line)
+      m = re.fullmatch(INSTR_REG_IMM_RE, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {m[3]}){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_REG_REG_REG_RE, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {m[3]}, {m[4]}, {m[5]}){sc} {m[6]}')
+            f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {fix_regs(m[3])}, {fix_regs(m[4])}){sc} {m[5]}')
         continue
-      m = re.match(INSTR_REG_REG_RE, line)
-      if m:
-        instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
-        continue
-      m = re.match(INSTR_REG_REGLIST_CONSECT, line)
+      m = re.fullmatch(INSTR_REG_REG_REG_IMM_RE, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {{{m[3]}-{m[4]}}}, false){sc} {m[5]}')
+            f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, {m[4]}, {m[5]}){sc} {m[6]}')
         continue
-      m = re.match(INSTR_REG_REGLIST_CONSECT_WB, line)
+      m = re.fullmatch(INSTR_REG_REG_RE, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({fix_regs(m[2])}, {fix_regs(m[3])}){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_REG_REGLIST_CONSECT, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {{{m[3]}-{m[4]}}}, true){sc} {m[5]}')
+            f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}-{m[4]}}}, false){sc} {m[5]}')
         continue
-      m = re.match(INSTR_REG_REGLIST_INDIV_WB, line)
+      m = re.fullmatch(INSTR_REG_REGLIST_CONSECT_WB, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({m[2]}, {{{m[3]}}}, true){sc} {m[4]}')
+            f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}-{m[4]}}}, true){sc} {m[5]}')
         continue
-      m = re.match(INSTR_B_IMM, line)
-      if m:
-        instructions.append(f'{m[1].lower()}(l{m[2]}){sc} {m[4]}')
-        continue
-      m = re.match(INSTR_REGLIST_INDIV_MEMOP, line)
+      m = re.fullmatch(INSTR_REG_REGLIST_INDIV_WB, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
+            f'{fix_instr_name(m[1])}({m[2]}, {{{m[3]}}}, true){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_B_IMM, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}(l{m[2]}){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_B_REG_IMM_IMM, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, l{m[4]}){sc} {m[6]}')
+        continue
+      m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP, line)
+      if m:
+        instructions.append(
+            f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
         )
         continue
-      m = re.match(INSTR_REGLIST_CONSEC_MEMOP, line)
+      m = re.fullmatch(INSTR_REGLIST_INDIV_MEMOP_IMM, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({{{m[2]}-{m[3]}}}, mem[{m[4]}]{maybe_wb(m[5])}){sc} {m[6]}'
+            f'{fix_instr_name(m[1])}({{{fix_regs(m[2])}}}, mem[{m[3]}], {m[4]}){sc} {m[5]}'
         )
         continue
-      m = re.match(INSTR_REGLIST_REPLICATE_MEMOP, line)
+      m = re.fullmatch(INSTR_REGLIST_CONSEC_MEMOP, line)
+      if m:
+        instructions.append(
+            f'{fix_instr_name(m[1])}({{{m[2]}-{m[3]}}}, mem[{m[4]}]{maybe_wb(m[5])}){sc} {m[6]}'
+        )
+        continue
+      m = re.fullmatch(INSTR_REGLIST_REPLICATE_MEMOP, line)
       if m:
         if m[5]:
           instructions.append(
-              f'{fix_replicate_instruction(m[1].lower())}({{{remove_brackets(m[2])}}}, mem[{m[4]}]++){sc} {m[6]}'
+              f'{fix_replicate_instruction(fix_instr_name(m[1]))}({{{remove_brackets(m[2])}}}, mem[{m[4]}]++){sc} {m[6]}'
           )
         else:
           instructions.append(
-              f'{fix_replicate_instruction(m[1].lower())}({{{remove_brackets(m[2])}}}, mem[{m[4]}]){sc} {m[6]}'
+              f'{fix_replicate_instruction(fix_instr_name(m[1]))}({{{remove_brackets(m[2])}}}, mem[{m[4]}]){sc} {m[6]}'
           )
         continue
-      m = re.match(INSTR_REGLIST_INDEX_MEMOP, line)
+      m = re.fullmatch(INSTR_REGLIST_INDEX_MEMOP, line)
       if m:
         instructions.append(
-            f'{m[1].lower()}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
+            f'{fix_instr_name(m[1])}({{{m[2]}}}, mem[{m[3]}]{maybe_wb(m[4])}){sc} {m[5]}'
         )
         continue
-      m = re.match(P2ALIGN_RE, line)
+      m = re.fullmatch(P2ALIGN_RE, line)
       if m:
         instructions.append(f'align({1 << int(m[1])}){sc}')
         continue
-      m = re.match(INSTR_REG_FPSCR, line)
+      m = re.fullmatch(INSTR_REG_FPSCR, line)
       if m:
-        instructions.append(f'{m[1].lower()}({m[2]}, {m[3]}){sc} {m[4]}')
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_PLD_MEMOP, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}]){sc} {m[4]}')
+        continue
+      m = re.fullmatch(INSTR_PLD_MEMOP_OFFSET, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, mem[{m[3]}, {m[4]}]){sc} {m[5]}')
+        continue
+      m = re.fullmatch(INSTR_REG_REG_REG_COND_RE, line)
+      if m:
+        instructions.append(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, {m[4]}, k{m[5]}){sc} {m[6]}')
         continue
 
       # Keep empty lines for formatting
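Putting the pieces together, the new aarch64 rules map post-indexed loads directly onto the assembler API; the generated kernel below contains exactly such lines. A self-contained sketch of the `INSTR_REG_REG_MEMOP_IMM_RE` path, with the pattern fragments re-declared locally (`COMMA`/`SPACES` are assumptions mirroring the script):

```python
import re

SPACES = r'\s*'
COMMA = r',' + SPACES   # assumed to mirror the script's definition
REG = r'(r\d+|s\d+|d\d+|q\d+|sp|lr|pc|x\d+|(?:v\d+\.(?:\d+)?(?:d|s|h|b)))'
IMM = r'(\d+)'
MEMOP = r'\[' + SPACES + REG + r'\]' + SPACES
COMMENTS = SPACES + r'((//\s+.+)|)$'
INSTR = SPACES + r'([A-Z0-9.]+)' + SPACES
INSTR_REG_REG_MEMOP_IMM_RE = re.compile(INSTR + REG + COMMA + REG + COMMA +
                                        MEMOP + COMMA + IMM + COMMENTS)

def fix_instr_name(s):
  return s.lower().replace('.', '_', 2).replace('and', 'and_', 1)

m = INSTR_REG_REG_MEMOP_IMM_RE.fullmatch('LDP q20, q21, [x5], 32')
print(f'{fix_instr_name(m[1])}({m[2]}, {m[3]}, mem[{m[4]}], {m[5]});')
# -> ldp(q20, q21, mem[x5], 32);
```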
@@ -376,19 +475,25 @@
     elif i.strip() == '':
       print()
     else:
-      print(indent + fix_instr_name(i).rstrip())
+      print(indent + i.rstrip())
 
   print('}')
   print('}  // namespace')
-  print('}  // aarch32')
+  print(f'}}  // {arch}')
   print('}  // xnnpack')
   print('')
-  print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code) {{')
-  print('  using namespace xnnpack::aarch32;')
+  if kernel_type == GEMM:
+    print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {{')
+  else:
+    print(f'xnn_status {fix_fn_name(fn_name)}(xnn_code_buffer* code, size_t nc, size_t kc, size_t ks, void* params) {{')
+  print(f'  using namespace xnnpack::{arch};')
   print('  Generator g(code);')
-  print('  g.generate();')
+  if kernel_type == GEMM:
+    print('  g.generate(nc, kc, nullptr);')
+  else:
+    print('  g.generate(nc, kc, ks, nullptr);')
   print('  g.finalize();')
-  print('  if (g.error() != Error::kNoError) {')
+  print('  if (g.error() != xnnpack::Error::kNoError) {')
   print('    return xnn_status_invalid_state;')
   print('  }')
   print('  return xnn_status_success;')
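The script is now executable (mode change to 100755); given `main(input_file)`, the kernel below was presumably produced by running `scripts/convert-assembly-to-jit.py` on `src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S` and redirecting stdout (the argument handling sits outside these hunks). Note that the checked-in file goes beyond the one-to-one translation: it folds prefetching behind a `bool prefetch` parameter and exposes both a plain and a `prfm` generator entry point.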
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc b/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc
new file mode 100644
index 0000000..5c0048e
--- /dev/null
+++ b/src/f32-gemm/6x8-aarch64-neonfma-prfm-cortex-a75.cc
@@ -0,0 +1,720 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/aarch64-assembler.h>
+#include <xnnpack/allocator.h>
+#include <xnnpack/gemm.h>
+
+namespace xnnpack {
+namespace aarch64 {
+namespace {
+class Generator : public Assembler {
+  using Assembler::Assembler;
+ public:
+  void generate(bool prefetch, size_t nc, size_t kc, void* params);
+};
+
+// void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(
+//     size_t mr,                x0
+//     size_t nc,                x1
+//     size_t kc,                x2 / x0
+//     const uint8_t*restrict a, x3
+//     size_t a_stride,          x4
+//     const void*restrict w,    x5
+//     uint8_t*restrict c,       x6
+//     size_t cm_stride,         x7
+//     size_t cn_stride,         [sp] -> (x0)
+//     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
+
+// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+// A pointers
+//  x3 a0
+//  x9 a1
+// x10 a2
+// x11 a3
+// x12 a4
+//  x4 a5
+
+// C pointers
+//  x6 c0
+// x16 c1
+// x17 c2
+// x14 c3
+// x13 c4
+//  x7 c5
+
+// Vector register usage
+// A0   v0  v6
+// A1   v1  v7
+// A2   v2  v8
+// A3   v3  v9
+// A4   v4 v10
+// A5   v5 v11
+// B   v12 v13 v14 v15
+// B   v16 v17 v18 v19
+// C   v20 v21
+// C   v22 v23
+// C   v24 v25
+// C   v26 v27
+// C   v28 v29
+// C   v30 v31
+// Clamp v6 v7
+
+// Converted from: src/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
+void Generator::generate(bool prefetch, size_t nc, size_t kc, void* params) {
+  Label l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10;
+
+
+  // Load params pointer
+  ldr(x8, mem[sp, 8]);
+
+  // Clamp A and C pointers / Save d8-d15 on stack
+  stp(d8, d9, mem[sp, -64]++);
+  cmp(x0, 2); // if mr < 2
+  add(x9, x3, x4); // a1 = a0 + a_stride
+  add(x16, x6, x7); // c1 = c0 + cm_stride
+  csel(x9, x3, x9, kLO); //   a1 = a0
+  csel(x16, x6, x16, kLO); //   c1 = c0
+
+  stp(d10, d11, mem[sp, 16]);
+  add(x10, x9, x4); // a2 = a1 + a_stride
+  add(x17, x16, x7); // c2 = c1 + cm_stride
+  // if mr <= 2
+  csel(x10, x9, x10, kLS); //   a2 = a1
+  csel(x17, x16, x17, kLS); //   c2 = c1
+
+  stp(d12, d13, mem[sp, 32]);
+  cmp(x0, 4); // if mr < 4
+  add(x11, x10, x4); // a3 = a2 + a_stride
+  add(x14, x17, x7); // c3 = c2 + cm_stride
+  csel(x11, x10, x11, kLO); //   a3 = a2
+  csel(x14, x17, x14, kLO); //   c3 = c2
+
+  stp(d14, d15, mem[sp, 48]);
+  add(x12, x11, x4); // a4 = a3 + a_stride
+  add(x13, x14, x7); // c4 = c3 + cm_stride
+  // if mr <= 4
+  csel(x12, x11, x12, kLS); //   a4 = a3
+  csel(x13, x14, x13, kLS); //   c4 = c3
+
+  cmp(x0, 6); // if mr < 6
+  add(x4, x12, x4); // a5 = a4 + a_stride
+  add(x7, x13, x7); // c5 = c4 + cm_stride
+  csel(x4, x12, x4, kLO); //   a5 = a4
+  csel(x7, x13, x7, kLO); //   c5 = c4
+
+  bind(l0);
+  // Load initial bias from w into accumulators
+  ldp(q20, q21, mem[x5], 32);
+  mov(v22.v16b(), v20.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 0]); // Prefetch B
+  }
+  mov(v23.v16b(), v21.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 64]);
+  }
+  mov(v24.v16b(), v20.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 128]);
+  }
+  mov(v25.v16b(), v21.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 192]);
+  }
+  mov(v26.v16b(), v20.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x3]); // Prefetch A
+  }
+  mov(v27.v16b(), v21.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x9]);
+  }
+  mov(v28.v16b(), v20.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x10]);
+  }
+  mov(v29.v16b(), v21.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x11]);
+  }
+  mov(v30.v16b(), v20.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x12]);
+  }
+  mov(v31.v16b(), v21.v16b());
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x4]);
+  }
+
+  // Is there at least 8 floats (32 bytes) for prologue + epilogue?
+  subs(x0, x2, 32); // k = kc - 32
+  b_lo(l4);
+
+  // Prologue - loads for main loop of 96 FMA
+  ldr(q0, mem[x3], 16);
+  ldr(q1, mem[x9], 16);
+  ldr(q2, mem[x10], 16);
+  ldr(q3, mem[x11], 16);
+  ldr(q4, mem[x12], 16);
+  ldr(q5, mem[x4], 16);
+  ldp(q12, q13, mem[x5], 32); // Fetch 3 B (4th deferred)
+  ldp(q14, q15, mem[x5], 32);
+  ldp(q16, q17, mem[x5], 32);
+
+  // Is there at least 8 floats (32 bytes) for main loop?
+  subs(x0, x0, 32);
+  b_lo(l2);
+
+  // Main loop - 8 floats of A (32 bytes)
+  // 96 FMA + 6 LDP A + 8 LDP B
+  bind(l1);
+  // First group of 4 A.  48 FMA.
+  fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+  ldp(q18, q19, mem[x5], 32); // Load last B
+  fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+
+  fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+  fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 128]); // Prefetch B
+  }
+  fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+  fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 256]);
+  }
+  fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+  fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+  ldr(q6, mem[x3], 16); // Load next 6 A
+  fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+  ldr(q7, mem[x9], 16);
+
+  fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+  fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+  fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+  ldr(q8, mem[x10], 16);
+  fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+  fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+  fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+  ldr(q9, mem[x11], 16);
+  fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+  fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+  fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+  ldr(q10, mem[x12], 16);
+  fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+  fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+  fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+  ldr(q11, mem[x4], 16);
+
+  fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+  fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+  fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+  ldp(q12, q13, mem[x5], 32); // Load 4 B
+  fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+  fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+  fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+  ldp(q14, q15, mem[x5], 32);
+  fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+  fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+  fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+  ldp(q16, q17, mem[x5], 32);
+  fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+  fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+  fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+  ldp(q18, q19, mem[x5], 32);
+
+  // Second group of 4 A.  48 FMA.
+  fmla(v20.v4s(), v12.v4s(), v6.s()[0]);
+  fmla(v22.v4s(), v12.v4s(), v7.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v8.s()[0]);
+  ldr(q0, mem[x3], 16); // Load next 6 A
+  fmla(v26.v4s(), v12.v4s(), v9.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v10.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v11.s()[0]);
+  ldr(q1, mem[x9], 16);
+  fmla(v21.v4s(), v13.v4s(), v6.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v7.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v8.s()[0]);
+  ldr(q2, mem[x10], 16);
+  fmla(v27.v4s(), v13.v4s(), v9.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v10.s()[0]);
+  fmla(v31.v4s(), v13.v4s(), v11.s()[0]);
+  ldr(q3, mem[x11], 16);
+
+  fmla(v20.v4s(), v14.v4s(), v6.s()[1]);
+  fmla(v22.v4s(), v14.v4s(), v7.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v8.s()[1]);
+  ldr(q4, mem[x12], 16);
+  fmla(v26.v4s(), v14.v4s(), v9.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v10.s()[1]);
+  fmla(v30.v4s(), v14.v4s(), v11.s()[1]);
+  ldr(q5, mem[x4], 16);
+  fmla(v21.v4s(), v15.v4s(), v6.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v7.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v8.s()[1]);
+  ldp(q12, q13, mem[x5], 32); // Load next 3 B (not last)
+  fmla(v27.v4s(), v15.v4s(), v9.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v10.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v11.s()[1]);
+  ldp(q14, q15, mem[x5], 32);
+
+  fmla(v20.v4s(), v16.v4s(), v6.s()[2]);
+  fmla(v22.v4s(), v16.v4s(), v7.s()[2]);
+  fmla(v24.v4s(), v16.v4s(), v8.s()[2]);
+  fmla(v26.v4s(), v16.v4s(), v9.s()[2]);
+  fmla(v28.v4s(), v16.v4s(), v10.s()[2]);
+  fmla(v30.v4s(), v16.v4s(), v11.s()[2]);
+  fmla(v21.v4s(), v17.v4s(), v6.s()[2]);
+  fmla(v23.v4s(), v17.v4s(), v7.s()[2]);
+  fmla(v25.v4s(), v17.v4s(), v8.s()[2]);
+  fmla(v27.v4s(), v17.v4s(), v9.s()[2]);
+  fmla(v29.v4s(), v17.v4s(), v10.s()[2]);
+  fmla(v31.v4s(), v17.v4s(), v11.s()[2]);
+  ldp(q16, q17, mem[x5], 32);
+
+  fmla(v20.v4s(), v18.v4s(), v6.s()[3]);
+  fmla(v22.v4s(), v18.v4s(), v7.s()[3]);
+  subs(x0, x0, 32);
+  fmla(v24.v4s(), v18.v4s(), v8.s()[3]);
+  fmla(v26.v4s(), v18.v4s(), v9.s()[3]);
+  fmla(v28.v4s(), v18.v4s(), v10.s()[3]);
+  fmla(v30.v4s(), v18.v4s(), v11.s()[3]);
+  fmla(v21.v4s(), v19.v4s(), v6.s()[3]);
+  fmla(v23.v4s(), v19.v4s(), v7.s()[3]);
+  fmla(v25.v4s(), v19.v4s(), v8.s()[3]);
+  fmla(v27.v4s(), v19.v4s(), v9.s()[3]);
+  fmla(v29.v4s(), v19.v4s(), v10.s()[3]);
+  fmla(v31.v4s(), v19.v4s(), v11.s()[3]);
+  b_hs(l1);
+
+  // Epilogue - 8 floats of A (32 bytes)
+  // 96 FMA + 6 LDP A + 8 LDP B
+  // First block same as main loop.  Second block has no preloads.
+  bind(l2);
+  // First group of 4 A.  48 FMA.
+  fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+  ldp(q18, q19, mem[x5], 32); // Load last B
+  fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+
+  fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+  fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 128]); // Prefetch B
+  }
+  fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+  fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+  if (prefetch) {
+    prfm(kPLDL1KEEP, mem[x5, 256]);
+  }
+  fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+  fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+  ldr(q6, mem[x3], 16); // Load next 6 A
+  fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+  ldr(q7, mem[x9], 16);
+
+  fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+  fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+  fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+  ldr(q8, mem[x10], 16);
+  fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+  fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+  fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+  ldr(q9, mem[x11], 16);
+  fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+  fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+  fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+  ldr(q10, mem[x12], 16);
+  fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+  fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+  fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+  ldr(q11, mem[x4], 16);
+
+  fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+  fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+  fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+  ldp(q12, q13, mem[x5], 32); // Load 4 B
+  fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+  fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+  fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+  ldp(q14, q15, mem[x5], 32);
+  fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+  fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+  fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+  ldp(q16, q17, mem[x5], 32);
+  fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+  fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+  fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+  ldp(q18, q19, mem[x5], 32);
+
+  // Second group of 4 A.  48 FMA.
+  fmla(v20.v4s(), v12.v4s(), v6.s()[0]);
+  fmla(v22.v4s(), v12.v4s(), v7.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v8.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v9.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v10.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v11.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v6.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v7.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v8.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v9.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v10.s()[0]);
+  fmla(v31.v4s(), v13.v4s(), v11.s()[0]);
+
+  fmla(v20.v4s(), v14.v4s(), v6.s()[1]);
+  fmla(v22.v4s(), v14.v4s(), v7.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v8.s()[1]);
+  fmla(v26.v4s(), v14.v4s(), v9.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v10.s()[1]);
+  fmla(v30.v4s(), v14.v4s(), v11.s()[1]);
+  fmla(v21.v4s(), v15.v4s(), v6.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v7.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v8.s()[1]);
+  fmla(v27.v4s(), v15.v4s(), v9.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v10.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v11.s()[1]);
+
+  fmla(v20.v4s(), v16.v4s(), v6.s()[2]);
+  fmla(v22.v4s(), v16.v4s(), v7.s()[2]);
+  fmla(v24.v4s(), v16.v4s(), v8.s()[2]);
+  fmla(v26.v4s(), v16.v4s(), v9.s()[2]);
+  fmla(v28.v4s(), v16.v4s(), v10.s()[2]);
+  fmla(v30.v4s(), v16.v4s(), v11.s()[2]);
+  fmla(v21.v4s(), v17.v4s(), v6.s()[2]);
+  fmla(v23.v4s(), v17.v4s(), v7.s()[2]);
+  fmla(v25.v4s(), v17.v4s(), v8.s()[2]);
+  fmla(v27.v4s(), v17.v4s(), v9.s()[2]);
+  fmla(v29.v4s(), v17.v4s(), v10.s()[2]);
+  fmla(v31.v4s(), v17.v4s(), v11.s()[2]);
+
+  fmla(v20.v4s(), v18.v4s(), v6.s()[3]);
+  fmla(v22.v4s(), v18.v4s(), v7.s()[3]);
+  fmla(v24.v4s(), v18.v4s(), v8.s()[3]);
+  fmla(v26.v4s(), v18.v4s(), v9.s()[3]);
+  fmla(v28.v4s(), v18.v4s(), v10.s()[3]);
+  fmla(v30.v4s(), v18.v4s(), v11.s()[3]);
+  fmla(v21.v4s(), v19.v4s(), v6.s()[3]);
+  fmla(v23.v4s(), v19.v4s(), v7.s()[3]);
+
+  // Load min/max values
+  ld2r({v6.v4s(), v7.v4s()}, mem[x8]);
+
+  fmla(v25.v4s(), v19.v4s(), v8.s()[3]);
+  fmla(v27.v4s(), v19.v4s(), v9.s()[3]);
+  // Is there a remainder? - 4 floats of A (16 bytes) or less
+  tst(x0, 31);
+  fmla(v29.v4s(), v19.v4s(), v10.s()[3]);
+  fmla(v31.v4s(), v19.v4s(), v11.s()[3]);
+  b_ne(l4);
+
+  // Clamp
+  bind(l3);
+  fmax(v20.v4s(), v20.v4s(), v6.v4s());
+  // Load cn_stride
+  ldr(x0, mem[sp, 64]);
+  fmax(v21.v4s(), v21.v4s(), v6.v4s());
+  fmax(v22.v4s(), v22.v4s(), v6.v4s());
+  fmax(v23.v4s(), v23.v4s(), v6.v4s());
+  fmax(v24.v4s(), v24.v4s(), v6.v4s());
+  fmax(v25.v4s(), v25.v4s(), v6.v4s());
+  fmax(v26.v4s(), v26.v4s(), v6.v4s());
+  fmax(v27.v4s(), v27.v4s(), v6.v4s());
+  fmax(v28.v4s(), v28.v4s(), v6.v4s());
+  fmax(v29.v4s(), v29.v4s(), v6.v4s());
+  fmax(v30.v4s(), v30.v4s(), v6.v4s());
+  fmax(v31.v4s(), v31.v4s(), v6.v4s());
+  subs(x1, x1, 8);
+  fmin(v20.v4s(), v20.v4s(), v7.v4s());
+  fmin(v21.v4s(), v21.v4s(), v7.v4s());
+  fmin(v22.v4s(), v22.v4s(), v7.v4s());
+  fmin(v23.v4s(), v23.v4s(), v7.v4s());
+  fmin(v24.v4s(), v24.v4s(), v7.v4s());
+  fmin(v25.v4s(), v25.v4s(), v7.v4s());
+  fmin(v26.v4s(), v26.v4s(), v7.v4s());
+  fmin(v27.v4s(), v27.v4s(), v7.v4s());
+  fmin(v28.v4s(), v28.v4s(), v7.v4s());
+  fmin(v29.v4s(), v29.v4s(), v7.v4s());
+  fmin(v30.v4s(), v30.v4s(), v7.v4s());
+  fmin(v31.v4s(), v31.v4s(), v7.v4s());
+
+  // Store full 6 x 8
+  b_lo(l7);
+
+  stp(q20, q21, mem[x6]);
+  add(x6, x6, x0);
+  sub(x3, x3, x2); // a0 -= kc
+  stp(q22, q23, mem[x16]);
+  add(x16, x16, x0);
+  sub(x9, x9, x2); // a1 -= kc
+  stp(q24, q25, mem[x17]);
+  add(x17, x17, x0);
+  sub(x10, x10, x2); // a2 -= kc
+  stp(q26, q27, mem[x14]);
+  add(x14, x14, x0);
+  sub(x11, x11, x2); // a3 -= kc
+  stp(q28, q29, mem[x13]);
+  add(x13, x13, x0);
+  sub(x12, x12, x2); // a4 -= kc
+  stp(q30, q31, mem[x7]);
+  add(x7, x7, x0);
+  sub(x4, x4, x2); // a5 -= kc
+
+  b_hi(l0);
+
+  // Restore d8-d15 from stack
+  ldp(d14, d15, mem[sp, 48]);
+  ldp(d12, d13, mem[sp, 32]);
+  ldp(d10, d11, mem[sp, 16]);
+  ldp(d8, d9, mem[sp], 64);
+  ret();
+
+  bind(l4);
+  // Load min/max values
+  ld2r({v6.v4s(), v7.v4s()}, mem[x8]);
+
+  // Is there a remainder? - 4 floats of A (16 bytes)
+  tbz(x0, 4, l5);
+
+  // Remainder- 4 floats of A (16 bytes)
+  // Load A
+  ldr(q0, mem[x3], 16);
+  ldr(q1, mem[x9], 16);
+  ldr(q2, mem[x10], 16);
+  ldr(q3, mem[x11], 16);
+  ldr(q4, mem[x12], 16);
+  ldr(q5, mem[x4], 16);
+  // Load B
+  ldp(q12, q13, mem[x5], 32);
+  ldp(q14, q15, mem[x5], 32);
+  ldp(q16, q17, mem[x5], 32);
+  ldp(q18, q19, mem[x5], 32);
+
+  fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+  fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+  fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+
+  fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+  fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+  fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+  fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+  fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+  fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+
+  fmla(v20.v4s(), v16.v4s(), v0.s()[2]);
+  fmla(v22.v4s(), v16.v4s(), v1.s()[2]);
+  fmla(v24.v4s(), v16.v4s(), v2.s()[2]);
+  fmla(v26.v4s(), v16.v4s(), v3.s()[2]);
+  fmla(v28.v4s(), v16.v4s(), v4.s()[2]);
+  fmla(v30.v4s(), v16.v4s(), v5.s()[2]);
+  fmla(v21.v4s(), v17.v4s(), v0.s()[2]);
+  fmla(v23.v4s(), v17.v4s(), v1.s()[2]);
+  fmla(v25.v4s(), v17.v4s(), v2.s()[2]);
+  fmla(v27.v4s(), v17.v4s(), v3.s()[2]);
+  fmla(v29.v4s(), v17.v4s(), v4.s()[2]);
+  fmla(v31.v4s(), v17.v4s(), v5.s()[2]);
+
+  fmla(v20.v4s(), v18.v4s(), v0.s()[3]);
+  fmla(v22.v4s(), v18.v4s(), v1.s()[3]);
+  fmla(v24.v4s(), v18.v4s(), v2.s()[3]);
+  fmla(v26.v4s(), v18.v4s(), v3.s()[3]);
+  fmla(v28.v4s(), v18.v4s(), v4.s()[3]);
+  fmla(v30.v4s(), v18.v4s(), v5.s()[3]);
+  fmla(v21.v4s(), v19.v4s(), v0.s()[3]);
+  fmla(v23.v4s(), v19.v4s(), v1.s()[3]);
+  fmla(v25.v4s(), v19.v4s(), v2.s()[3]);
+  fmla(v27.v4s(), v19.v4s(), v3.s()[3]);
+  fmla(v29.v4s(), v19.v4s(), v4.s()[3]);
+  fmla(v31.v4s(), v19.v4s(), v5.s()[3]);
+
+  // Is there a remainder? - 2 floats of A (8 bytes)
+  bind(l5);
+  tbz(x0, 3, l6);
+
+  // Remainder- 2 floats of A (8 bytes)
+  // Load A
+  ldr(d0, mem[x3], 8);
+  ldr(d1, mem[x9], 8);
+  ldr(d2, mem[x10], 8);
+  ldr(d3, mem[x11], 8);
+  ldr(d4, mem[x12], 8);
+  ldr(d5, mem[x4], 8);
+  // Load B
+  ldp(q12, q13, mem[x5], 32);
+  ldp(q14, q15, mem[x5], 32);
+
+  fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+  fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+  fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+
+  fmla(v20.v4s(), v14.v4s(), v0.s()[1]);
+  fmla(v22.v4s(), v14.v4s(), v1.s()[1]);
+  fmla(v24.v4s(), v14.v4s(), v2.s()[1]);
+  fmla(v26.v4s(), v14.v4s(), v3.s()[1]);
+  fmla(v28.v4s(), v14.v4s(), v4.s()[1]);
+  fmla(v30.v4s(), v14.v4s(), v5.s()[1]);
+  fmla(v21.v4s(), v15.v4s(), v0.s()[1]);
+  fmla(v23.v4s(), v15.v4s(), v1.s()[1]);
+  fmla(v25.v4s(), v15.v4s(), v2.s()[1]);
+  fmla(v27.v4s(), v15.v4s(), v3.s()[1]);
+  fmla(v29.v4s(), v15.v4s(), v4.s()[1]);
+  fmla(v31.v4s(), v15.v4s(), v5.s()[1]);
+
+  // Is there a remainder? - 1 float of A (4 bytes)
+  bind(l6);
+  tbz(x0, 2, l3);
+
+  // Remainder- 1 float of A (4 bytes)
+  // Load A
+  ldr(s0, mem[x3], 4);
+  ldr(s1, mem[x9], 4);
+  ldr(s2, mem[x10], 4);
+  ldr(s3, mem[x11], 4);
+  ldr(s4, mem[x12], 4);
+  ldr(s5, mem[x4], 4);
+  // Load B
+  ldp(q12, q13, mem[x5], 32);
+
+  fmla(v20.v4s(), v12.v4s(), v0.s()[0]);
+  fmla(v22.v4s(), v12.v4s(), v1.s()[0]);
+  fmla(v24.v4s(), v12.v4s(), v2.s()[0]);
+  fmla(v26.v4s(), v12.v4s(), v3.s()[0]);
+  fmla(v28.v4s(), v12.v4s(), v4.s()[0]);
+  fmla(v30.v4s(), v12.v4s(), v5.s()[0]);
+  fmla(v21.v4s(), v13.v4s(), v0.s()[0]);
+  fmla(v23.v4s(), v13.v4s(), v1.s()[0]);
+  fmla(v25.v4s(), v13.v4s(), v2.s()[0]);
+  fmla(v27.v4s(), v13.v4s(), v3.s()[0]);
+  fmla(v29.v4s(), v13.v4s(), v4.s()[0]);
+  fmla(v31.v4s(), v13.v4s(), v5.s()[0]);
+  b(l3);
+
+  // Store odd width
+  bind(l7);
+  tbz(x1, 2, l8);
+  str(q20, mem[x6], 16);
+  mov(v20.v16b(), v21.v16b());
+  str(q22, mem[x16], 16);
+  mov(v22.v16b(), v23.v16b());
+  str(q24, mem[x17], 16);
+  mov(v24.v16b(), v25.v16b());
+  str(q26, mem[x14], 16);
+  mov(v26.v16b(), v27.v16b());
+  str(q28, mem[x13], 16);
+  mov(v28.v16b(), v29.v16b());
+  str(q30, mem[x7], 16);
+  mov(v30.v16b(), v31.v16b());
+  bind(l8);
+  tbz(x1, 1, l9);
+  str(d20, mem[x6], 8);
+  str(d22, mem[x16], 8);
+  dup(d20, v20.d()[1]);
+  dup(d22, v22.d()[1]);
+  str(d24, mem[x17], 8);
+  str(d26, mem[x14], 8);
+  dup(d24, v24.d()[1]);
+  dup(d26, v26.d()[1]);
+  str(d28, mem[x13], 8);
+  str(d30, mem[x7], 8);
+  dup(d28, v28.d()[1]);
+  dup(d30, v30.d()[1]);
+
+  bind(l9);
+  tbz(x1, 0, l10);
+  str(s20, mem[x6]);
+  str(s22, mem[x16]);
+  str(s24, mem[x17]);
+  str(s26, mem[x14]);
+  str(s28, mem[x13]);
+  str(s30, mem[x7]);
+  bind(l10);
+  // Restore d8-d15 from stack
+  ldp(d14, d15, mem[sp, 48]);
+  ldp(d12, d13, mem[sp, 32]);
+  ldp(d10, d11, mem[sp, 16]);
+  ldp(d8, d9, mem[sp], 64);
+  ret();
+
+
+}
+}  // namespace
+}  // aarch64
+}  // xnnpack
+
+xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {
+  using namespace xnnpack::aarch64;
+  Generator g(code);
+  g.generate(false, nc, kc, nullptr);
+  g.finalize();
+  if (g.error() != xnnpack::Error::kNoError) {
+    return xnn_status_invalid_state;
+  }
+  return xnn_status_success;
+}
+
+xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(xnn_code_buffer* code, size_t nc, size_t kc, void* params) {
+  using namespace xnnpack::aarch64;
+  Generator g(code);
+  g.generate(true, nc, kc, nullptr);
+  g.finalize();
+  if (g.error() != xnnpack::Error::kNoError) {
+    return xnn_status_invalid_state;
+  }
+  return xnn_status_success;
+}
diff --git a/src/xnnpack/aarch64-assembler.h b/src/xnnpack/aarch64-assembler.h
index 7cf7a49..d6bf34b 100644
--- a/src/xnnpack/aarch64-assembler.h
+++ b/src/xnnpack/aarch64-assembler.h
@@ -284,12 +284,10 @@
 // - ldp(x0, x1, mem[rn], offset); // post-indexed
 constexpr MemOperandHelper mem;
 
-enum class PrefetchOp {
+enum PrefetchOp {
   kPLDL1KEEP = 0
 };
 
-constexpr PrefetchOp PLDL1KEEP = PrefetchOp::kPLDL1KEEP;
-
 enum Condition : uint32_t {
   kEQ = 0x0,
   kNE = 0x1,
diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h
index 663099d..f3b7767 100644
--- a/src/xnnpack/common.h
+++ b/src/xnnpack/common.h
@@ -110,7 +110,7 @@
   #define XNN_PLATFORM_WINDOWS 0
 #endif
 
-#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS
+#if (XNN_ARCH_ARM || XNN_ARCH_ARM64) && !XNN_PLATFORM_IOS
   #define XNN_PLATFORM_JIT 1
 #else
   #define XNN_PLATFORM_JIT 0
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 28c9f82..bdf5005 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1625,6 +1625,9 @@
 enum xnn_status xnn_generate_qc8_gemm_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
 enum xnn_status xnn_generate_qc8_gemm_fp32_ukernel_4x8c4__aarch32_neondot_ld64(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
 
+enum xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
+enum xnn_status xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(struct xnn_code_buffer* code, size_t nc, size_t kc, void* params);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/test/aarch64-assembler.cc b/test/aarch64-assembler.cc
index fbf8332..d52113b 100644
--- a/test/aarch64-assembler.cc
+++ b/test/aarch64-assembler.cc
@@ -53,9 +53,9 @@
   EXPECT_ERROR(Error::kInvalidOperand, a.ldr(x8, mem[sp, 32768]));
   EXPECT_ERROR(Error::kInvalidOperand, a.ldr(x8, MemOperand(sp, 16, AddressingMode::kPostIndex)));
 
-  CHECK_ENCODING(0xF98000A0, a.prfm(PLDL1KEEP, mem[x5]));
-  EXPECT_ERROR(Error::kInvalidOperand, a.prfm(PLDL1KEEP, mem[x5, -8]));
-  EXPECT_ERROR(Error::kInvalidOperand, a.prfm(PLDL1KEEP, mem[x5, 32761]));
+  CHECK_ENCODING(0xF98000A0, a.prfm(kPLDL1KEEP, mem[x5]));
+  EXPECT_ERROR(Error::kInvalidOperand, a.prfm(kPLDL1KEEP, mem[x5, -8]));
+  EXPECT_ERROR(Error::kInvalidOperand, a.prfm(kPLDL1KEEP, mem[x5, 32761]));
 
   CHECK_ENCODING(0xD65F03C0, a.ret());
 
diff --git a/test/f32-gemm-minmax-2.cc b/test/f32-gemm-minmax-2.cc
index e96dddd..a6d8f04 100644
--- a/test/f32-gemm-minmax-2.cc
+++ b/test/f32-gemm-minmax-2.cc
@@ -33589,3 +33589,504 @@
       .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_ld64, xnn_init_f32_minmax_scalar_params);
   }
 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+
+
+#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 6; m++) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t m = 1; m <= 6; m++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(16)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 6; m++) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index 6f53017..26690e5 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -30588,3 +30588,504 @@
       .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
   }
 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
+
+
+#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 6; m++) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t m = 1; m <= 6; m++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(16)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(16)
+      .a_stride(19)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 1; n <= 8; n++) {
+      for (uint32_t m = 1; m <= 6; m++) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(16)
+          .iterations(1)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 16; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(37)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 24; k <= 80; k += 8) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        for (uint32_t m = 1; m <= 6; m++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+        }
+      }
+    }
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+
+  TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(6)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(6)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
+  }
+#endif  // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
diff --git a/test/f32-gemm-minmax.yaml b/test/f32-gemm-minmax.yaml
index 1e10bcf..fc0a9bc 100644
--- a/test/f32-gemm-minmax.yaml
+++ b/test/f32-gemm-minmax.yaml
@@ -524,3 +524,11 @@
   init: xnn_init_f32_minmax_scalar_params
   k-block: 2
   assembly: true
+- name: xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75
+  init: xnn_init_f32_minmax_scalar_params
+  k-block: 8
+  pipelined: true
+- name: xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75
+  init: xnn_init_f32_minmax_scalar_params
+  k-block: 8
+  pipelined: true
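
A note on the new spec fields, inferred from the tests generated above: `k-block: 8` matches the kernels' unroll factor along k, and `pipelined: true` appears to make the test generator (presumably tools/generate-gemm-test.py) place the k boundary cases at twice the k-block — hence k_eq_8 plus k_eq_16, k_lt_16, and k_gt_16 alongside k_div_8 — since a software-pipelined main loop overlaps the loads for one k-block with the compute of the previous one and only engages once two blocks are available.
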
diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc
index 1bd23c8..d91dd55 100644
--- a/test/gemm-microkernel-tester.cc
+++ b/test/gemm-microkernel-tester.cc
@@ -1650,7 +1650,7 @@
     jit_gemm_params p = (jit_gemm_params) {
       .f32_minmax = params
     };
-    ASSERT_EQ(xnn_status_success, gemm_generator(&code_buffer,n(), k() * sizeof(float), &p));
+    ASSERT_EQ(xnn_status_success, gemm_generator(&code_buffer, n(), k() * sizeof(float), &p));
     xnn_f32_gemm_minmax_ukernel_function gemm_minmax = reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);
 
     gemm_minmax(m(), n(), k() * sizeof(float),
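
The hunk above (whitespace fix aside) shows the essence of the JIT path these new tests exercise: the tester hands a generator a code buffer plus `nc` and `kc` (in bytes), then casts the emitted code to the ordinary minmax ukernel pointer type and runs the same numerical checks as for prebuilt assembly kernels. Below is a minimal standalone sketch of that flow. The generator call, `code_buffer.code`, and the init function are taken from the hunks; the allocation helpers (xnn_allocate_code_memory, xnn_release_code_memory, XNN_DEFAULT_CODE_BUFFER_SIZE), the header paths, and the helper name are assumptions about the surrounding XNNPACK API, not guaranteed signatures:

  #include <cstddef>
  #include <xnnpack/allocator.h>  // assumed home of xnn_code_buffer helpers
  #include <xnnpack/gemm.h>       // assumed home of jit_gemm_params and the generator

  // Hypothetical helper: JIT the 6x8 F32 GEMM for nc=8 columns and kc=8 floats,
  // then run it once. Buffers must be sized for a full 6x8 tile.
  static bool run_jit_f32_gemm_6x8(
      const float* a, size_t a_stride, const float* packed_w,
      float* c, size_t cm_stride, size_t cn_stride,
      float output_min, float output_max)
  {
    // Reserve writable/executable memory for the generated kernel.
    xnn_code_buffer code_buffer;
    if (xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE) != xnn_status_success) {
      return false;
    }

    // As in the tester: the clamping bounds travel in jit_gemm_params at
    // generation time and again in the params passed at call time.
    xnn_f32_minmax_params params;
    xnn_init_f32_minmax_scalar_params(&params, output_min, output_max);
    jit_gemm_params p = (jit_gemm_params) {
      .f32_minmax = params
    };

    // Emit the kernel; kc is expressed in bytes, mirroring the tester's call.
    if (xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75(
            &code_buffer, /*nc=*/8, /*kc=*/8 * sizeof(float), &p) != xnn_status_success) {
      return false;
    }

    // The buffer now holds aarch64 code with the same ABI as the hand-written
    // assembly kernel it was converted from, so call it through the usual type.
    xnn_f32_gemm_minmax_ukernel_function gemm_minmax =
        reinterpret_cast<xnn_f32_gemm_minmax_ukernel_function>(code_buffer.code);
    gemm_minmax(/*mr=*/6, /*nc=*/8, /*kc=*/8 * sizeof(float),
                a, a_stride, packed_w, c, cm_stride, cn_stride, &params);

    return xnn_release_code_memory(&code_buffer) == xnn_status_success;
  }

Because the generated code is reached through the same xnn_f32_gemm_minmax_ukernel_function type, every existing GemmMicrokernelTester case applies unchanged; only the entry point differs (a generator plus a cast instead of a linked symbol), which is why the 500-odd test lines above are byte-for-byte the usual generated pattern.
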