Blame - src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in - platform/external/XNNPACK

blob: 14cca1b06bf0134607787728f85b104bfee6586a [file] [log] [blame]

Frank Barchard	baa9ead	2019-10-18 18:06:41 -0700	[diff] [blame^]	1	// Copyright 2019 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame]	5
				6	#include <xnnpack/assembly.h>
				7
				8	# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
				9	# size_t mr, x0
				10	# size_t nc, x1
				11	# size_t kc, x2 / x0 (x0 is reused as the remaining-k byte counter once mr has been consumed)
				12	# const uint8_t*restrict a, x3
				13	# size_t a_stride, x4
				14	# const void*restrict w, x5
				15	# uint8_t*restrict c, x6
				16	# size_t cm_stride, x7
				17	# size_t cn_stride, [sp] -> x14
				18	$if INC:
				19	  # const float*restrict acc, [sp + 8] -> x15
				20	  # const union xnn_f32_output_params params[restrict static 1]) [sp + 16] -> x8
				21	$else:
				22	  # const union xnn_f32_output_params params[restrict static 1]) [sp + 8] -> x8
				23
				24	# d8-d15 need to be preserved if used.
				25	# x19-x30 need to be preserved if used. x18 is the platform register and is never touched.
				26
				27	# A pointers
				28	# x3 a0
				29	# x11 a1
				30	# x12 a2
				31	# x4 a3 / a_stride
				32
				33	# C pointers
				34	# x6 c0
				35	# x9 c1
				36	# x10 c2
				37	# x7 c3 / cm_stride
				38
				39	# Vector register usage and GPR shadows
				40	# a0 v0 first set of A
				41	# a1 v0[1] x13
				42	# a2 v1
				43	# a3 v1[1] x8
				44	# a0 v2 second set of A
				45	# a1 v2[1] x13
				46	# a2 v3
				47	# a3 v3[1] x8
				48	# B v6 v7 v8 x20 x21 x16 first set of B
				49	# B v9 v10 v11 x17 x22 x19
				50	# B v14 v15 v16 x20 x21 x16 second set of B (same x as first set)
				51	# B v17 v18 v19 x17 x22 x19
				52	# C v20 v21 v22
				53	# C v23 v24 v25
				54	# C v26 v27 v28
				55	# C v29 v30 v31
				56	# Clamp v4 v5
				57	# v12 and v13 are unused.
				58
				59	BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53
				60
				61	$if INC:
				62	  # Load cn_stride, acc
				63	  LDP x14, x15, [sp]
				64	  # Load params pointer
				65	  LDR x8, [sp, 16]
				66	$else:
				67	  # Load cn_stride, params pointer
				68	  LDP x14, x8, [sp]
				69
				70	# Load clamping_params values: v4 = first float (FMIN bound), v5 = second float (FMAX bound)
				71	LD2R {v4.4s, v5.4s}, [x8]
				72
				73	# Save x19-x22 on stack (x21/x22 at [sp], x19/x20 at [sp, 16])
				74	STP x21, x22, [sp, -80]!
				75	STP x19, x20, [sp, 16]
				76
				77	# Save d8-d11,d14,d15 on stack
				78	STP d8, d9, [sp, 32]
				79	STP d10, d11, [sp, 48]
				80	STP d14, d15, [sp, 64]
				81
				82	# Clamp A and C pointers
				83	ADD x11, x3, x4 // a1 = a0 + a_stride
				84	ADD x9, x6, x7 // c1 = c0 + cm_stride
				85	CMP x0, 2 // if mr < 2
				86	CSEL x11, x3, x11, LO // a1 = a0
				87	CSEL x9, x6, x9, LO // c1 = c0
				88	ADD x12, x11, x4 // a2 = a1 + a_stride
				89	ADD x10, x9, x7 // c2 = c1 + cm_stride
				90	// if mr <= 2 (flags still come from CMP x0, 2; ADD does not modify them)
				91	CSEL x12, x11, x12, LS // a2 = a1
				92	CSEL x10, x9, x10, LS // c2 = c1
				93	ADD x4, x12, x4 // a3 = a2 + a_stride
				94	ADD x7, x10, x7 // c3 = c2 + cm_stride
				95	CMP x0, 4 // if mr < 4
				96	CSEL x4, x12, x4, LO // a3 = a2
				97	CSEL x7, x10, x7, LO // c3 = c2
				98
				99	0: // outer loop: one pass per tile of 12 output columns
				100	$if INC:
				101	  # Load initial accumulators
				102	  LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
				103	  LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
				104	  LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
				105	  LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
				106	$else:
				107	  # Load initial bias from w into accumulators
				108	  LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
				109	  MOV v23.16b, v20.16b
				110	  MOV v24.16b, v21.16b
				111	  MOV v25.16b, v22.16b
				112	  MOV v26.16b, v20.16b
				113	  MOV v27.16b, v21.16b
				114	  MOV v28.16b, v22.16b
				115	  MOV v29.16b, v20.16b
				116	  MOV v30.16b, v21.16b
				117	  MOV v31.16b, v22.16b
				118
				119	PRFM PLDL1KEEP, [x5]
				120	PRFM PLDL1KEEP, [x5, 64]
				121	PRFM PLDL1KEEP, [x5, 128]
				122	PRFM PLDL1KEEP, [x5, 192]
				123	PRFM PLDL1KEEP, [x5, 256]
				124	PRFM PLDL1KEEP, [x5, 320]
				125
				126	# Are there at least 4 floats (16 bytes)?
				127	SUBS x0, x2, 16 // k = kc - 16
				128	B.LO 3f
				129
				130	SUBS x0, x0, 16 // k -= 16; these flags are consumed by B.LO 2f below
				131
				132	# Prologue - loads for first group of 24 FMA
				133
				134	# Read first block of 4 A.
				135	LDR d0, [x3], 8 // a0
				136	LDR x13, [x11], 8 // a1
				137	LDR d1, [x12], 8 // a2
				138	LDR x8, [x4], 8 // a3
				139
				140	LDR d6, [x5] // vb0x0123
				141	LDR x20, [x5, 8]
				142
				143	LDR d7, [x5, 16] // vb0x4567
				144	LDR x21, [x5, 24]
				145
				146	LDR d8, [x5, 32] // vb0x89AB
				147	LDR x16, [x5, 40]
				148
				149	LDR d9, [x5, 48] // vb1x0123
				150	INS v0.d[1], x13
				151	LDR x17, [x5, 56]
				152
				153	LDR d10, [x5, 64] // vb1x4567
				154	INS v1.d[1], x8
				155	LDR x22, [x5, 72]
				156
				157	LDR d11, [x5, 80] // vb1x89AB
				158	LDR x19, [x5, 88]
				159	INS v6.d[1], x20
				160	ADD x5, x5, 96 // ADD (not ADDS) keeps the flags of the SUBS above intact
				161
				162	# Are there at least 4 floats (16 bytes) for main loop?
				163	B.LO 2f
				164
				165	# Main loop - 4 floats of A (16 bytes)
				166	1:
				167	# First group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
				168	# A is loaded for 2nd group into v2/v3
				169	# INS is 4 blocks (16 cycles) after load
				170
				171	# BLOCK 0
				172	LDR d2, [x3], 8 // a0
				173	INS v7.d[1], x21
				174	FMLA v20.4s, v6.4s, v0.s[0]
				175	LDR x13, [x11], 8 // a1
				176	FMLA v23.4s, v6.4s, v0.s[2]
				177	PRFM PLDL1KEEP, [x5, 192]
				178	FMLA v26.4s, v6.4s, v1.s[0]
				179
				180	# BLOCK 1
				181	LDR d3, [x12], 8 // a2
				182	INS v8.d[1], x16
				183	FMLA v29.4s, v6.4s, v1.s[2]
				184	LDR x8, [x4], 8 // a3
				185	FMLA v21.4s, v7.4s, v0.s[0]
				186	PRFM PLDL1KEEP, [x5, 256]
				187	FMLA v24.4s, v7.4s, v0.s[2]
				188
				189	# BLOCK 2
				190	LDR d14, [x5] // vb0x0123
				191	INS v9.d[1], x17
				192	FMLA v27.4s, v7.4s, v1.s[0]
				193	LDR x20, [x5, 8]
				194	FMLA v30.4s, v7.4s, v1.s[2]
				195	PRFM PLDL1KEEP, [x5, 320]
				196	FMLA v22.4s, v8.4s, v0.s[0]
				197
				198	# BLOCK 3
				199	LDR d15, [x5, 16] // vb0x4567
				200	INS v10.d[1], x22
				201	FMLA v25.4s, v8.4s, v0.s[2]
				202	LDR x21, [x5, 24]
				203	FMLA v28.4s, v8.4s, v1.s[0]
				204	FMLA v31.4s, v8.4s, v1.s[2]
				205
				206	# BLOCK 4
				207	LDR d16, [x5, 32] // vb0x89AB
				208	INS v11.d[1], x19
				209	FMLA v20.4s, v9.4s, v0.s[1]
				210	LDR x16, [x5, 40]
				211	FMLA v23.4s, v9.4s, v0.s[3]
				212	FMLA v26.4s, v9.4s, v1.s[1]
				213
				214	# BLOCK 5
				215	LDR d17, [x5, 48] // vb1x0123
				216	INS v2.d[1], x13 // a1 was loaded in block 0
				217	FMLA v29.4s, v9.4s, v1.s[3]
				218	LDR x17, [x5, 56]
				219	FMLA v21.4s, v10.4s, v0.s[1]
				220	FMLA v24.4s, v10.4s, v0.s[3]
				221
				222	# BLOCK 6
				223	LDR d18, [x5, 64] // vb1x4567
				224	INS v3.d[1], x8 // a3 was loaded in block 1
				225	FMLA v27.4s, v10.4s, v1.s[1]
				226	LDR x22, [x5, 72]
				227	FMLA v30.4s, v10.4s, v1.s[3]
				228	FMLA v22.4s, v11.4s, v0.s[1]
				229
				230	# BLOCK 7
				231	LDR d19, [x5, 80] // vb1x89AB
				232	INS v14.d[1], x20 // v14 was loaded in block 2
				233	FMLA v25.4s, v11.4s, v0.s[3]
				234	LDR x19, [x5, 88]
				235	FMLA v28.4s, v11.4s, v1.s[1]
				236	FMLA v31.4s, v11.4s, v1.s[3]
				237
				238	# Second group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
				239	# A is loaded for 1st group into v0/v1
				240
				241	# BLOCK 0
				242	LDR d0, [x3], 8 // a0
				243	INS v15.d[1], x21
				244	FMLA v20.4s, v14.4s, v2.s[0]
				245	LDR x13, [x11], 8 // a1
				246	FMLA v23.4s, v14.4s, v2.s[2]
				247	FMLA v26.4s, v14.4s, v3.s[0]
				248
				249	# BLOCK 1
				250	LDR d1, [x12], 8 // a2
				251	INS v16.d[1], x16
				252	FMLA v29.4s, v14.4s, v3.s[2]
				253	LDR x8, [x4], 8 // a3
				254	FMLA v21.4s, v15.4s, v2.s[0]
				255	FMLA v24.4s, v15.4s, v2.s[2]
				256
				257	# BLOCK 2
				258	LDR d6, [x5, 96] // vb0x0123
				259	INS v17.d[1], x17
				260	FMLA v27.4s, v15.4s, v3.s[0]
				261	LDR x20, [x5, 104]
				262	FMLA v30.4s, v15.4s, v3.s[2]
				263	FMLA v22.4s, v16.4s, v2.s[0]
				264
				265	# BLOCK 3
				266	LDR d7, [x5, 112] // vb0x4567
				267	INS v18.d[1], x22
				268	FMLA v25.4s, v16.4s, v2.s[2]
				269	LDR x21, [x5, 120]
				270	FMLA v28.4s, v16.4s, v3.s[0]
				271	FMLA v31.4s, v16.4s, v3.s[2]
				272
				273	# BLOCK 4
				274	LDR d8, [x5, 128] // vb0x89AB
				275	INS v19.d[1], x19
				276	FMLA v20.4s, v17.4s, v2.s[1]
				277	LDR x16, [x5, 136]
				278	FMLA v23.4s, v17.4s, v2.s[3]
				279	FMLA v26.4s, v17.4s, v3.s[1]
				280
				281	# BLOCK 5
				282	LDR d9, [x5, 144] // vb1x0123
				283	INS v0.d[1], x13 // a1
				284	FMLA v29.4s, v17.4s, v3.s[3]
				285	LDR x17, [x5, 152]
				286	FMLA v21.4s, v18.4s, v2.s[1]
				287	FMLA v24.4s, v18.4s, v2.s[3]
				288
				289	# BLOCK 6
				290	LDR d10, [x5, 160] // vb1x4567
				291	INS v1.d[1], x8 // a3
				292	FMLA v27.4s, v18.4s, v3.s[1]
				293	LDR x22, [x5, 168]
				294	FMLA v30.4s, v18.4s, v3.s[3]
				295	SUBS x0, x0, 16 // k -= 16; flags are consumed by B.HS 1b at the end of block 7
				296	FMLA v22.4s, v19.4s, v2.s[1]
				297
				298	# BLOCK 7
				299	LDR d11, [x5, 176] // vb1x89AB
				300	INS v6.d[1], x20
				301	FMLA v25.4s, v19.4s, v2.s[3]
				302	LDR x19, [x5, 184]
				303	FMLA v28.4s, v19.4s, v3.s[1]
				304	ADD x5, x5, 192
				305	FMLA v31.4s, v19.4s, v3.s[3]
				306	B.HS 1b
				307
				308	# Epilogue
				309	# First group is the same as in the main loop. Second group has no loads.
				310	2:
				311	# BLOCK 0
				312	LDR d2, [x3], 8 // a0
				313	INS v7.d[1], x21
				314	FMLA v20.4s, v6.4s, v0.s[0]
				315	LDR x13, [x11], 8 // a1
				316	FMLA v23.4s, v6.4s, v0.s[2]
				317	PRFM PLDL1KEEP, [x5, 192]
				318	FMLA v26.4s, v6.4s, v1.s[0]
				319
				320	# BLOCK 1
				321	LDR d3, [x12], 8 // a2
				322	INS v8.d[1], x16
				323	FMLA v29.4s, v6.4s, v1.s[2]
				324	LDR x8, [x4], 8 // a3
				325	FMLA v21.4s, v7.4s, v0.s[0]
				326	PRFM PLDL1KEEP, [x5, 256]
				327	FMLA v24.4s, v7.4s, v0.s[2]
				328
				329	# BLOCK 2
				330	LDR d14, [x5] // vb0x0123
				331	INS v9.d[1], x17
				332	FMLA v27.4s, v7.4s, v1.s[0]
				333	LDR x20, [x5, 8]
				334	FMLA v30.4s, v7.4s, v1.s[2]
				335	PRFM PLDL1KEEP, [x5, 320]
				336	FMLA v22.4s, v8.4s, v0.s[0]
				337
				338	# BLOCK 3
				339	LDR d15, [x5, 16] // vb0x4567
				340	INS v10.d[1], x22
				341	FMLA v25.4s, v8.4s, v0.s[2]
				342	LDR x21, [x5, 24]
				343	FMLA v28.4s, v8.4s, v1.s[0]
				344	FMLA v31.4s, v8.4s, v1.s[2]
				345
				346	# BLOCK 4
				347	LDR d16, [x5, 32] // vb0x89AB
				348	INS v11.d[1], x19
				349	FMLA v20.4s, v9.4s, v0.s[1]
				350	LDR x16, [x5, 40]
				351	FMLA v23.4s, v9.4s, v0.s[3]
				352	FMLA v26.4s, v9.4s, v1.s[1]
				353
				354	# BLOCK 5
				355	LDR d17, [x5, 48] // vb1x0123
				356	INS v2.d[1], x13 // a1 was loaded in block 0
				357	FMLA v29.4s, v9.4s, v1.s[3]
				358	LDR x17, [x5, 56]
				359	FMLA v21.4s, v10.4s, v0.s[1]
				360	FMLA v24.4s, v10.4s, v0.s[3]
				361
				362	# BLOCK 6
				363	LDR d18, [x5, 64] // vb1x4567
				364	INS v3.d[1], x8 // a3 was loaded in block 1
				365	FMLA v27.4s, v10.4s, v1.s[1]
				366	LDR x22, [x5, 72]
				367	FMLA v30.4s, v10.4s, v1.s[3]
				368	FMLA v22.4s, v11.4s, v0.s[1]
				369
				370	# BLOCK 7
				371	LDR d19, [x5, 80] // vb1x89AB
				372	INS v14.d[1], x20 // v14 was loaded in block 2
				373	FMLA v25.4s, v11.4s, v0.s[3]
				374	LDR x19, [x5, 88]
				375	FMLA v28.4s, v11.4s, v1.s[1]
				376	ADD x5, x5, 96
				377	FMLA v31.4s, v11.4s, v1.s[3]
				378
				379	# Second group of 24 fma. 8 blocks of 4 cycles.
				380	# Epilogue version does no loads
				381
				382	# BLOCK 0
				383	INS v15.d[1], x21
				384	FMLA v20.4s, v14.4s, v2.s[0]
				385	FMLA v23.4s, v14.4s, v2.s[2]
				386	FMLA v26.4s, v14.4s, v3.s[0]
				387
				388	# BLOCK 1
				389	INS v16.d[1], x16
				390	FMLA v29.4s, v14.4s, v3.s[2]
				391	FMLA v21.4s, v15.4s, v2.s[0]
				392	FMLA v24.4s, v15.4s, v2.s[2]
				393
				394	# BLOCK 2
				395	INS v17.d[1], x17
				396	FMLA v27.4s, v15.4s, v3.s[0]
				397	FMLA v30.4s, v15.4s, v3.s[2]
				398	FMLA v22.4s, v16.4s, v2.s[0]
				399
				400	# BLOCK 3
				401	INS v18.d[1], x22
				402	FMLA v25.4s, v16.4s, v2.s[2]
				403	FMLA v28.4s, v16.4s, v3.s[0]
				404	FMLA v31.4s, v16.4s, v3.s[2]
				405
				406	# BLOCK 4
				407	INS v19.d[1], x19
				408	FMLA v20.4s, v17.4s, v2.s[1]
				409	FMLA v23.4s, v17.4s, v2.s[3]
				410	FMLA v26.4s, v17.4s, v3.s[1]
				411
				412	# BLOCK 5
				413	FMLA v29.4s, v17.4s, v3.s[3]
				414	FMLA v21.4s, v18.4s, v2.s[1]
				415	FMLA v24.4s, v18.4s, v2.s[3]
				416
				417	# BLOCK 6
				418	FMLA v27.4s, v18.4s, v3.s[1]
				419	FMLA v30.4s, v18.4s, v3.s[3]
				420	FMLA v22.4s, v19.4s, v2.s[1]
				421
				422	# BLOCK 7
				423	FMLA v25.4s, v19.4s, v2.s[3]
				424	FMLA v28.4s, v19.4s, v3.s[1]
				425	FMLA v31.4s, v19.4s, v3.s[3]
				426
				427	3:
				428	# Is there a remainder of 2 floats of A (8 bytes)?
				429	TBNZ x0, 3, 5f
				430	# Is there a remainder of 1 float of A (4 bytes)?
				431	TBNZ x0, 2, 6f
				432
				433	4:
				434	# Clamp
				435	FMIN v20.4s, v20.4s, v4.4s
				436	FMIN v21.4s, v21.4s, v4.4s
				437	FMIN v22.4s, v22.4s, v4.4s
				438	FMIN v23.4s, v23.4s, v4.4s
				439	FMIN v24.4s, v24.4s, v4.4s
				440	FMIN v25.4s, v25.4s, v4.4s
				441	FMIN v26.4s, v26.4s, v4.4s
				442	FMIN v27.4s, v27.4s, v4.4s
				443	FMIN v28.4s, v28.4s, v4.4s
				444	FMIN v29.4s, v29.4s, v4.4s
				445	FMIN v30.4s, v30.4s, v4.4s
				446	FMIN v31.4s, v31.4s, v4.4s
				447	FMAX v20.4s, v20.4s, v5.4s
				448	FMAX v21.4s, v21.4s, v5.4s
				449	FMAX v22.4s, v22.4s, v5.4s
				450	FMAX v23.4s, v23.4s, v5.4s
				451	FMAX v24.4s, v24.4s, v5.4s
				452	FMAX v25.4s, v25.4s, v5.4s
				453	FMAX v26.4s, v26.4s, v5.4s
				454	FMAX v27.4s, v27.4s, v5.4s
				455	FMAX v28.4s, v28.4s, v5.4s
				456	FMAX v29.4s, v29.4s, v5.4s
				457	FMAX v30.4s, v30.4s, v5.4s
				458	FMAX v31.4s, v31.4s, v5.4s
				459
				460	# Store full 4 x 12
				461	CMP x1, 12
				462	B.LO 7f
				463
				464	ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
				465	ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
				466	ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
				467	ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
				468
				469	SUB x3, x3, x2 // a0 -= kc
				470	SUB x11, x11, x2 // a1 -= kc
				471	SUB x12, x12, x2 // a2 -= kc
				472	SUB x4, x4, x2 // a3 -= kc
				473
				474	SUBS x1, x1, 12 // nc -= 12
				475	B.HI 0b
				476
				477	# Restore d8-d11,d14,d15 from stack
				478	LDP d14, d15, [sp, 64]
				479	LDP d10, d11, [sp, 48]
				480	LDP d8, d9, [sp, 32]
				481
				482	# Restore x19-x22 from stack
				483	LDP x19, x20, [sp, 16]
				484	LDP x21, x22, [sp], 80
				485	RET
				486
				487	5:
				488	# Remainder - 2 floats of A (8 bytes)
				489	# Read first block of 4 A.
				490	LDR d0, [x3], 8 // a0
				491	LDR d1, [x11], 8 // a1
				492	LDR d2, [x12], 8 // a2
				493	LDR d3, [x4], 8 // a3
				494	LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
				495	LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48
				496
				497	# First block of 3 B
				498	FMLA v20.4s, v6.4s, v0.s[0]
				499	FMLA v23.4s, v6.4s, v1.s[0]
				500	FMLA v26.4s, v6.4s, v2.s[0]
				501	FMLA v29.4s, v6.4s, v3.s[0]
				502	FMLA v21.4s, v7.4s, v0.s[0]
				503	FMLA v24.4s, v7.4s, v1.s[0]
				504	FMLA v27.4s, v7.4s, v2.s[0]
				505	FMLA v30.4s, v7.4s, v3.s[0]
				506	FMLA v22.4s, v8.4s, v0.s[0]
				507	FMLA v25.4s, v8.4s, v1.s[0]
				508	FMLA v28.4s, v8.4s, v2.s[0]
				509	FMLA v31.4s, v8.4s, v3.s[0]
				510
				511	# Second block of 3 B
				512	FMLA v20.4s, v9.4s, v0.s[1]
				513	FMLA v23.4s, v9.4s, v1.s[1]
				514	FMLA v26.4s, v9.4s, v2.s[1]
				515	FMLA v29.4s, v9.4s, v3.s[1]
				516	FMLA v21.4s, v10.4s, v0.s[1]
				517	FMLA v24.4s, v10.4s, v1.s[1]
				518	FMLA v27.4s, v10.4s, v2.s[1]
				519	FMLA v30.4s, v10.4s, v3.s[1]
				520	FMLA v22.4s, v11.4s, v0.s[1]
				521	FMLA v25.4s, v11.4s, v1.s[1]
				522	FMLA v28.4s, v11.4s, v2.s[1]
				523	FMLA v31.4s, v11.4s, v3.s[1]
				524
				525	TBZ x0, 2, 4b // no single-float remainder: go clamp and store
				526	6:
				527	# Remainder - 1 float of A (4 bytes)
				528	LDR s0, [x3], 4 // a0
				529	LDR s1, [x11], 4 // a1
				530	LDR s2, [x12], 4 // a2
				531	LDR s3, [x4], 4 // a3
				532	LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
				533
				534	FMLA v20.4s, v6.4s, v0.s[0]
				535	FMLA v23.4s, v6.4s, v1.s[0]
				536	FMLA v26.4s, v6.4s, v2.s[0]
				537	FMLA v29.4s, v6.4s, v3.s[0]
				538	FMLA v21.4s, v7.4s, v0.s[0]
				539	FMLA v24.4s, v7.4s, v1.s[0]
				540	FMLA v27.4s, v7.4s, v2.s[0]
				541	FMLA v30.4s, v7.4s, v3.s[0]
				542	FMLA v22.4s, v8.4s, v0.s[0]
				543	FMLA v25.4s, v8.4s, v1.s[0]
				544	FMLA v28.4s, v8.4s, v2.s[0]
				545	FMLA v31.4s, v8.4s, v3.s[0]
				546	B 4b
				547
				548	7:
				549	# Store odd channels (nc < 12): 8, 4, 2, then 1 column, selected by bits 3..0 of nc
				550	TBZ x1, 3, 8f
				551	STP q29, q30, [x7]
				552	ADD x7, x7, 32
				553	MOV v29.16b, v31.16b
				554	STP q26, q27, [x10]
				555	ADD x10, x10, 32
				556	MOV v26.16b, v28.16b
				557	STP q23, q24, [x9]
				558	ADD x9, x9, 32
				559	MOV v23.16b, v25.16b
				560	STP q20, q21, [x6]
				561	ADD x6, x6, 32
				562	MOV v20.16b, v22.16b
				563
				564	8:
				565	TBZ x1, 2, 9f
				566	STR q29, [x7], 16
				567	MOV v29.16b, v30.16b
				568	STR q26, [x10], 16
				569	MOV v26.16b, v27.16b
				570	STR q23, [x9], 16
				571	MOV v23.16b, v24.16b
				572	STR q20, [x6], 16
				573	MOV v20.16b, v21.16b
				574
				575	9:
				576	TBZ x1, 1, 10f
				577	STR d29, [x7], 8
				578	DUP d29, v29.d[1]
				579	STR d26, [x10], 8
				580	DUP d26, v26.d[1]
				581	STR d23, [x9], 8
				582	DUP d23, v23.d[1]
				583	STR d20, [x6], 8
				584	DUP d20, v20.d[1]
				585
				586	10:
				587	TBZ x1, 0, 11f
				588	STR s29, [x7]
				589	STR s26, [x10]
				590	STR s23, [x9]
				591	STR s20, [x6]
				592	11:
				593	# Restore d8-d11,d14,d15 from stack
				594	LDP d14, d15, [sp, 64]
				595	LDP d10, d11, [sp, 48]
				596	LDP d8, d9, [sp, 32]
				597
				598	# Restore x19-x22 from stack
				599	LDP x19, x20, [sp, 16]
				600	LDP x21, x22, [sp], 80
				601	RET
				602
				603	END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53
				604
				605	#ifdef __ELF__
				606	.section ".note.GNU-stack","",%progbits
				607	#endif