Blame - src/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S - platform/external/XNNPACK

blob: 2606f7d8cfdebb6d8e1b7661d834d44db140c4e1 [file] [log] [blame]

Frank Barchard	8fb9055	2020-03-16 11:36:09 -0700	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2019 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <xnnpack/assembly.h>
				11
				12	# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55(
				13	# size_t mr, x0
				14	# size_t nc, x1
				15	# size_t kc, x2 / x0 (x0 is reused as the k counter once mr has been consumed)
				16	# const uint8_t*restrict a, x3
				17	# size_t a_stride, x4
				18	# const void*restrict w, x5
				19	# uint8_t*restrict c, x6
				20	# size_t cm_stride, x7 (dead after c3 is computed; x7 then holds c3)
				21	# size_t cn_stride, [sp] -> x14
Marat Dukhan	eb09a6b	2020-04-08 17:34:32 -0700	[diff] [blame]	22	# const union xnn_f32_minmax_params params[restrict static 1]) [sp + 8] -> x8
Frank Barchard	8fb9055	2020-03-16 11:36:09 -0700	[diff] [blame]	23
				24	# d8-d15 need to be preserved if used.
				25	# x19-30 need to be preserved if used.
				26
				27	# A pointers
				28	# x3 a0
				29	# x9 a1
				30	# x10 a2
				31	# x11 a3
				32
				33	# C pointers
				34	# x6 c0
				35	# x16 c1
				36	# x17 c2
				37	# x7 c3 / cm_stride (x18 is the AAPCS64 platform register and must not be used)
				38
				39	# x4 temporary vector shadow register
				40
				41	# Vector register usage
				42	# A0 v0 v3
				43	# A1 v0[1] v3[1]
				44	# A2 v1 v4
				45	# A3 v1[1] v4[1]
				46
				47	# B v12 v13 v14 v15 second set of B
				48	# B v16 v17 v18 v19 first set
				49	# C v20 v21
				50	# C v22 v23
				51	# C v24 v25
				52	# C v26 v27
				53	# Clamp v6 v7
				54
				55	# unused A v8 v9 v10 v11
				56	# unused x12 a4
				57	# unused x13 c4
				58	# unused c5 (no register assigned; x7 holds c3 in this kernel)
				59	# unused A4 v2 v5
				60	# unused A5 v2[1] v5[1]
				61	# unused C v28 v29
				62	# unused C v30 v31
				63
				64	BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55
				65
				66	# Clamp A and C pointers
				67	CMP x0, 2 // if mr < 2
				68	ADD x9, x3, x4 // a1 = a0 + a_stride
				69	ADD x16, x6, x7 // c1 = c0 + cm_stride
				70	CSEL x9, x3, x9, LO // a1 = a0
				71	CSEL x16, x6, x16, LO // c1 = c0
				72
				73	ADD x10, x9, x4 // a2 = a1 + a_stride
				74	ADD x17, x16, x7 // c2 = c1 + cm_stride
				75	// if mr <= 2
				76	CSEL x10, x9, x10, LS // a2 = a1
				77	CSEL x17, x16, x17, LS // c2 = c1
				78
				79	CMP x0, 4 // if mr < 4
				80	ADD x11, x10, x4 // a3 = a2 + a_stride
				81	ADD x7, x17, x7 // c3 = c2 + cm_stride (last use of cm_stride; x7 is c3 from here on)
				82	CSEL x11, x10, x11, LO // a3 = a2
				83	CSEL x7, x17, x7, LO // c3 = c2
				84
				85	# Load params pointer
				86	LDR x8, [sp, 8]
				87
Marat Dukhan	eb09a6b	2020-04-08 17:34:32 -0700	[diff] [blame]	88	# Load min/max values
Frank Barchard	8fb9055	2020-03-16 11:36:09 -0700	[diff] [blame]	89	LD2R {v6.4s, v7.4s}, [x8] // v6 = lower bound (FMAX operand), v7 = upper bound (FMIN operand)
				90
				91	# Load cn_stride
				92	LDR x14, [sp] // must be read before sp is moved by the save below
				93
				94	// Save d12-d15 on stack (callee-saved; v12-v15 hold the second set of B)
				95	STP d12, d13, [sp, -32]!
				96	STP d14, d15, [sp, 16]
				97
				98	0:
				99	# Load initial bias from w into accumulators
				100	LDP q20, q21, [x5], 32
				101	MOV v22.16b, v20.16b
				102	PRFM PLDL1KEEP, [x3, 0] // Prefetch A
				103	PRFM PLDL1KEEP, [x3, 64]
				104	MOV v23.16b, v21.16b
				105	PRFM PLDL1KEEP, [x9, 0]
				106	PRFM PLDL1KEEP, [x9, 64]
				107	MOV v24.16b, v20.16b
				108	PRFM PLDL1KEEP, [x10, 0]
				109	PRFM PLDL1KEEP, [x10, 64]
				110	MOV v25.16b, v21.16b
				111	PRFM PLDL1KEEP, [x11, 0]
				112	PRFM PLDL1KEEP, [x11, 64]
				113	MOV v26.16b, v20.16b
				114	PRFM PLDL1KEEP, [x5, 0] // Prefetch B
				115	MOV v27.16b, v21.16b
				116	PRFM PLDL1KEEP, [x5, 64]
				117	PRFM PLDL1KEEP, [x5, 128]
				118	PRFM PLDL1KEEP, [x5, 192]
				119
				120	# Is there at least 4 floats (16 bytes) for prologue + epilogue?
				121	SUBS x0, x2, 16 // k = kc - 16
				122	B.LO 5f
				123
				124	# Prologue - First group loads, no FMA
				125	LDR d0, [x3], 8 // a0
				126	LDP q16, q17, [x5], 32 // b
				127	LDR d1, [x10], 8 // a2
				128	LD1 {v0.d}[1], [x9], 8 // a1
				129	LD1 {v1.d}[1], [x11], 8 // a3
				130	SUBS x0, x0, 16
				131	LDR q18, [x5], 16
				132	LDR d19, [x5], 8
				133	LDR x4, [x5], 8 // ins is in BLOCK 0
				134
				135	# Is there at least 4 floats (16 bytes) for main loop?
				136	B.LO 2f
				137
				138	# Main loop - 4 floats of A (16 bytes)
				139	# 32 FMA + 8 LD64 A + 8 LDR B
				140	1:
				141	# First group of 16 FMA, Second group loads
				142	// BLOCK 0
				143	FMLA v20.4s, v16.4s, v0.s[0]
				144	LDR d3, [x3], 8 // a0
				145	FMLA v22.4s, v16.4s, v0.s[2]
				146	INS v19.d[1], x4 // b from second group
				147	FMLA v24.4s, v16.4s, v1.s[0]
				148	LDR x4, [x9], 8 // a1
				149
				150	// BLOCK 1
				151	FMLA v26.4s, v16.4s, v1.s[2]
				152	LDR d12, [x5]
				153	FMLA v21.4s, v17.4s, v0.s[0]
				154	INS v3.d[1], x4 // a1 ins
				155	FMLA v23.4s, v17.4s, v0.s[2]
				156	LDR x4, [x5, 8] // b
				157
				158	// BLOCK 2
				159	FMLA v25.4s, v17.4s, v1.s[0]
				160	LDR d4, [x10], 8 // a2
				161	FMLA v27.4s, v17.4s, v1.s[2]
				162	INS v12.d[1], x4 // b ins
				163	FMLA v20.4s, v18.4s, v0.s[1]
				164	LDR x4, [x11], 8 // a3
				165
				166	// BLOCK 3
				167	FMLA v22.4s, v18.4s, v0.s[3]
				168	LDR d13, [x5, 16]
				169	FMLA v24.4s, v18.4s, v1.s[1]
				170	INS v4.d[1], x4 // a3 ins
				171	FMLA v26.4s, v18.4s, v1.s[3]
				172	LDR x4, [x5, 24]
				173
				174	// BLOCK 4
				175	FMLA v21.4s, v19.4s, v0.s[1]
				176	LDR d14, [x5, 32]
				177	FMLA v23.4s, v19.4s, v0.s[3]
				178	INS v13.d[1], x4 // b
				179	FMLA v25.4s, v19.4s, v1.s[1]
				180	LDR x4, [x5, 40]
				181
				182	// BLOCK 5
				183	// NOPs to ensure 4 cycle LDR lands on next LDR
				184	FMLA v27.4s, v19.4s, v1.s[3]
				185	LDR d15, [x5, 48]
				186	NOP
				187	INS v14.d[1], x4 // b from previous
				188	SUBS x0, x0, 16 // k -= 16; flags are consumed by B.HS at the bottom of the loop
				189	LDR x4, [x5, 56]
				190
				191	# Second group of 16 FMA, First group of loads
				192	// BLOCK 0
				193	FMLA v20.4s, v12.4s, v3.s[0]
				194	LDR d0, [x3], 8 // a0
				195	FMLA v22.4s, v12.4s, v3.s[2]
				196	INS v15.d[1], x4 // b from previous
				197	FMLA v24.4s, v12.4s, v4.s[0]
				198	LDR x4, [x9], 8 // a1
				199
				200	// BLOCK 1
				201	FMLA v26.4s, v12.4s, v4.s[2]
				202	LDR d16, [x5, 64]
				203	FMLA v21.4s, v13.4s, v3.s[0]
				204	INS v0.d[1], x4 // a1 ins
				205	FMLA v23.4s, v13.4s, v3.s[2]
				206	LDR x4, [x5, 72] // b
				207
				208	// BLOCK 2
				209	FMLA v25.4s, v13.4s, v4.s[0]
				210	LDR d1, [x10], 8 // a2
				211	FMLA v27.4s, v13.4s, v4.s[2]
				212	INS v16.d[1], x4 // b
				213	FMLA v20.4s, v14.4s, v3.s[1]
				214	LDR x4, [x11], 8 // a3
				215
				216	// BLOCK 3
				217	FMLA v22.4s, v14.4s, v3.s[3]
				218	LDR d17, [x5, 80]
				219	FMLA v24.4s, v14.4s, v4.s[1]
				220	INS v1.d[1], x4 // a3 ins
				221	FMLA v26.4s, v14.4s, v4.s[3]
				222	LDR x4, [x5, 88]
				223
				224	// BLOCK 4
				225	FMLA v21.4s, v15.4s, v3.s[1]
				226	LDR d18, [x5, 96]
				227	FMLA v23.4s, v15.4s, v3.s[3]
				228	INS v17.d[1], x4 // b
				229	FMLA v25.4s, v15.4s, v4.s[1]
				230	LDR x4, [x5, 104]
				231
				232	// BLOCK 5
				233	// NOTE that block needs to be 4 cycles for LDR not to stall
				234	FMLA v27.4s, v15.4s, v4.s[3]
				235	LDR d19, [x5, 112]
				236	INS v18.d[1], x4
				237	LDR x4, [x5, 120]
				238	ADD x5, x5, 128
				239	B.HS 1b
				240
				241	# Epilogue - 4 floats of A (16 bytes)
				242	# 32 FMA + 8 LD64 A + 8 LDR B
				243	2:
				244	# First group of 16 FMA, Second group loads
				245	// BLOCK 0
				246	FMLA v20.4s, v16.4s, v0.s[0]
				247	LDR d3, [x3], 8 // a0
				248	FMLA v22.4s, v16.4s, v0.s[2]
				249	INS v19.d[1], x4 // b from second group
				250	FMLA v24.4s, v16.4s, v1.s[0]
				251	LDR x4, [x9], 8 // a1
				252
				253	// BLOCK 1
				254	FMLA v26.4s, v16.4s, v1.s[2]
				255	LDR d12, [x5]
				256	FMLA v21.4s, v17.4s, v0.s[0]
				257	INS v3.d[1], x4 // a1 ins
				258	FMLA v23.4s, v17.4s, v0.s[2]
				259	LDR x4, [x5, 8] // b
				260
				261	// BLOCK 2
				262	FMLA v25.4s, v17.4s, v1.s[0]
				263	LDR d4, [x10], 8 // a2
				264	FMLA v27.4s, v17.4s, v1.s[2]
				265	INS v12.d[1], x4 // b ins
				266	FMLA v20.4s, v18.4s, v0.s[1]
				267	LDR x4, [x11], 8 // a3
				268
				269	// BLOCK 3
				270	FMLA v22.4s, v18.4s, v0.s[3]
				271	LDR d13, [x5, 16]
				272	FMLA v24.4s, v18.4s, v1.s[1]
				273	INS v4.d[1], x4 // a3 ins
				274	FMLA v26.4s, v18.4s, v1.s[3]
				275	LDR x4, [x5, 24]
				276
				277	// BLOCK 4
				278	FMLA v21.4s, v19.4s, v0.s[1]
				279	LDR d14, [x5, 32]
				280	FMLA v23.4s, v19.4s, v0.s[3]
				281	INS v13.d[1], x4 // b
				282	FMLA v25.4s, v19.4s, v1.s[1]
				283	LDR x4, [x5, 40]
				284
				285	// BLOCK 5
				286	// NOPs to ensure 4 cycle LDR lands on next LDR
				287	FMLA v27.4s, v19.4s, v1.s[3]
				288	LDR d15, [x5, 48]
				289	NOP // fma
				290	INS v14.d[1], x4
				291	NOP
				292	LDR x4, [x5, 56]
				293
				294	# Second group of 16 FMA, no loads
				295	// BLOCK 0
				296	FMLA v20.4s, v12.4s, v3.s[0]
				297	FMLA v22.4s, v12.4s, v3.s[2]
				298	INS v15.d[1], x4 // b from previous
				299	FMLA v24.4s, v12.4s, v4.s[0]
				300
				301	// BLOCK 1
				302	FMLA v26.4s, v12.4s, v4.s[2]
				303	FMLA v21.4s, v13.4s, v3.s[0]
				304	FMLA v23.4s, v13.4s, v3.s[2]
				305
				306	// BLOCK 2
				307	FMLA v25.4s, v13.4s, v4.s[0]
				308	FMLA v27.4s, v13.4s, v4.s[2]
				309	FMLA v20.4s, v14.4s, v3.s[1]
				310
				311	// BLOCK 3
				312	FMLA v22.4s, v14.4s, v3.s[3]
				313	FMLA v24.4s, v14.4s, v4.s[1]
				314	FMLA v26.4s, v14.4s, v4.s[3]
				315	TST x0, 15 // any k bytes left below a full group of 16? (flags used by B.NE below)
				316
				317	// BLOCK 4
				318	FMLA v21.4s, v15.4s, v3.s[1]
				319	FMLA v23.4s, v15.4s, v3.s[3]
				320	FMLA v25.4s, v15.4s, v4.s[1]
				321	ADD x5, x5, 64
				322
				323	// BLOCK 5
				324	FMLA v27.4s, v15.4s, v4.s[3]
				325
				326	# Is there a remainder?- 2 floats of A (8 bytes) or less
				327	B.NE 5f
				328
				329	4:
				330	# Clamp
Marat Dukhan	a51cf48	2020-04-08 16:16:19 -0700	[diff] [blame]	331	FMAX v20.4s, v20.4s, v6.4s
Frank Barchard	8fb9055	2020-03-16 11:36:09 -0700	[diff] [blame]	332	SUBS x1, x1, 8 // nc -= 8; flags are consumed by B.LO and B.HI below
Marat Dukhan	a51cf48	2020-04-08 16:16:19 -0700	[diff] [blame]	333	FMAX v21.4s, v21.4s, v6.4s
				334	FMAX v22.4s, v22.4s, v6.4s
				335	FMAX v23.4s, v23.4s, v6.4s
				336	FMAX v24.4s, v24.4s, v6.4s
				337	FMAX v25.4s, v25.4s, v6.4s
				338	FMAX v26.4s, v26.4s, v6.4s
				339	FMAX v27.4s, v27.4s, v6.4s
				340	FMIN v20.4s, v20.4s, v7.4s
				341	FMIN v21.4s, v21.4s, v7.4s
				342	FMIN v22.4s, v22.4s, v7.4s
				343	FMIN v23.4s, v23.4s, v7.4s
				344	FMIN v24.4s, v24.4s, v7.4s
				345	FMIN v25.4s, v25.4s, v7.4s
				346	FMIN v26.4s, v26.4s, v7.4s
				347	FMIN v27.4s, v27.4s, v7.4s
Frank Barchard	8fb9055	2020-03-16 11:36:09 -0700	[diff] [blame]	348
				349	# Store full 4 x 8
				350	B.LO 8f // fewer than 8 columns were left: take the partial-store path
				351
				352	ST1 {v20.16b, v21.16b}, [x6], x14
				353	SUB x3, x3, x2 // a0 -= kc
				354	ST1 {v22.16b, v23.16b}, [x16], x14
				355	SUB x9, x9, x2 // a1 -= kc
				356	ST1 {v24.16b, v25.16b}, [x17], x14
				357	SUB x10, x10, x2 // a2 -= kc
				358	ST1 {v26.16b, v27.16b}, [x7], x14
				359	SUB x11, x11, x2 // a3 -= kc
				360
				361	B.HI 0b // columns remain: process the next 8
				362
				363	// Restore d12-d15 from stack
				364	LDP d14, d15, [sp, 16]
				365	LDP d12, d13, [sp], 32
				366	RET
				367
				368	5:
				369	# Is there a remainder?- 2 floats of A (8 bytes)
				370	TBZ x0, 3, 6f
				371
				372	# Remainder- 2 floats of A (8 bytes)
				373	LDR d0, [x3], 8
				374	LDR q16, [x5], 16
				375	LD1 {v0.d}[1], [x9], 8
				376	LDR d1, [x10], 8
				377	LD1 {v1.d}[1], [x11], 8
				378	LDR q17, [x5], 16
				379	LDR q18, [x5], 16
				380	LDR q19, [x5], 16
				381	FMLA v20.4s, v16.4s, v0.s[0]
				382	FMLA v22.4s, v16.4s, v0.s[2]
				383	FMLA v24.4s, v16.4s, v1.s[0]
				384	FMLA v26.4s, v16.4s, v1.s[2]
				385	FMLA v21.4s, v17.4s, v0.s[0]
				386	FMLA v23.4s, v17.4s, v0.s[2]
				387	FMLA v25.4s, v17.4s, v1.s[0]
				388	FMLA v27.4s, v17.4s, v1.s[2]
				389
				390	FMLA v20.4s, v18.4s, v0.s[1]
				391	FMLA v22.4s, v18.4s, v0.s[3]
				392	FMLA v24.4s, v18.4s, v1.s[1]
				393	FMLA v26.4s, v18.4s, v1.s[3]
				394	FMLA v21.4s, v19.4s, v0.s[1]
				395	FMLA v23.4s, v19.4s, v0.s[3]
				396	FMLA v25.4s, v19.4s, v1.s[1]
				397	FMLA v27.4s, v19.4s, v1.s[3]
				398
				399	# Is there a remainder?- 1 floats of A (4 bytes)
				400	TBZ x0, 2, 4b
				401
				402	6:
				403	# Remainder- 1 floats of A (4 bytes)
				404	LDR s0, [x3], 4
				405	LDR q16, [x5], 16
				406	LD1 {v0.s}[2], [x9], 4
				407	LDR s1, [x10], 4
				408	LD1 {v1.s}[2], [x11], 4
				409	LDR q17, [x5], 16
				410
				411	FMLA v20.4s, v16.4s, v0.s[0]
				412	FMLA v22.4s, v16.4s, v0.s[2]
				413	FMLA v24.4s, v16.4s, v1.s[0]
				414	FMLA v26.4s, v16.4s, v1.s[2]
				415	FMLA v21.4s, v17.4s, v0.s[0]
				416	FMLA v23.4s, v17.4s, v0.s[2]
				417	FMLA v25.4s, v17.4s, v1.s[0]
				418	FMLA v27.4s, v17.4s, v1.s[2]
				419	B 4b
				420
				421	# Store odd width
				422	8:
				423	TBZ x1, 2, 9f
				424	STR q20, [x6], 16
				425	MOV v20.16b, v21.16b
				426	STR q22, [x16], 16
				427	MOV v22.16b, v23.16b
				428	STR q24, [x17], 16
				429	MOV v24.16b, v25.16b
				430	STR q26, [x7], 16
				431	MOV v26.16b, v27.16b
				432
				433	9:
				434	TBZ x1, 1, 10f
				435	STR d20, [x6], 8
				436	DUP d20, v20.d[1]
				437	STR d22, [x16], 8
				438	DUP d22, v22.d[1]
				439	STR d24, [x17], 8
				440	DUP d24, v24.d[1]
				441	STR d26, [x7], 8
				442	DUP d26, v26.d[1]
				443
				444	10:
				445	TBZ x1, 0, 11f
				446	STR s20, [x6]
				447	STR s22, [x16]
				448	STR s24, [x17]
				449	STR s26, [x7]
				450	11:
				451	// Restore d12-d15 from stack
				452	LDP d14, d15, [sp, 16]
				453	LDP d12, d13, [sp], 32
				454	RET
				455
				456	END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55
				457
				458	#ifdef __ELF__
				459	.section ".note.GNU-stack","",%progbits // empty marker section, emitted for ELF targets only
				460	#endif