Blame - arch/sparc64/lib/NGmemcpy.S - kernel/msm-4.9

blob: 8e522b3dc095e2100760c87425488de66ecbfad7 [file] [log] [blame]

David S. Miller	398d108	2006-03-05 16:41:56 -0800	[diff] [blame]	1	/* NGmemcpy.S: Niagara optimized memcpy.
				2	*
				3	* Copyright (C) 2006 David S. Miller (davem@davemloft.net)
				4	*/
				5
				6	#ifdef __KERNEL__
				7	#include <asm/asi.h>
David S. Miller	0d4bc95	2006-02-11 10:30:41 -0800	[diff] [blame]	8	#include <asm/thread_info.h>
David S. Miller	398d108	2006-03-05 16:41:56 -0800	[diff] [blame]	9	#define GLOBAL_SPARE %g7
David S. Miller	0d4bc95	2006-02-11 10:30:41 -0800	[diff] [blame]	10	#define RESTORE_ASI(TMP) \
				11	ldub [%g6 + TI_CURRENT_DS], TMP; \
				12	wr TMP, 0x0, %asi;
David S. Miller	398d108	2006-03-05 16:41:56 -0800	[diff] [blame]	13	#else /* !__KERNEL__ */
				14	#define GLOBAL_SPARE %g5
David S. Miller	0d4bc95	2006-02-11 10:30:41 -0800	[diff] [blame]	15	#define RESTORE_ASI(TMP) \
				16	wr %g0, ASI_PNF, %asi
David S. Miller	398d108	2006-03-05 16:41:56 -0800	[diff] [blame]	17	#endif /* __KERNEL__ */
				18	/* GLOBAL_SPARE holds the original dst until it is returned in %o0; RESTORE_ASI rewrites %asi after the block-copy path has pointed it at STORE_ASI. */
				19	#ifndef STORE_ASI
				20	#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
				21	#endif
				22	/* EX_LD, EX_ST and EX_RETVAL default to the identity; each is only defined here when not already defined. */
				23	#ifndef EX_LD
				24	#define EX_LD(x) x
				25	#endif
				26
				27	#ifndef EX_ST
				28	#define EX_ST(x) x
				29	#endif
				30
				31	#ifndef EX_RETVAL
				32	#define EX_RETVAL(x) x
				33	#endif
				34	/* LOAD: plain "type [addr], dest"; with MEMCPY_DEBUG it becomes the "type##a" form with ASI 0x80. */
				35	#ifndef LOAD
				36	#ifndef MEMCPY_DEBUG
				37	#define LOAD(type,addr,dest) type [addr], dest
				38	#else
				39	#define LOAD(type,addr,dest) type##a [addr] 0x80, dest
				40	#endif
				41	#endif
				42	/* LOAD_TWIN: one ldda through ASI_BLK_INIT_QUAD_LDD_P that names only dest0; dest1 is not used in the expansion (call sites pass the pairs %o4/%o5 and %g2/%g3). */
				43	#ifndef LOAD_TWIN
				44	#define LOAD_TWIN(addr_reg,dest0,dest1) \
				45	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
				46	#endif
				47	/* STORE: plain "type src, [addr]". */
				48	#ifndef STORE
				49	#define STORE(type,src,addr) type src, [addr]
				50	#endif
				51	/* STORE_INIT: stxa through whatever ASI the %asi register currently selects. */
				52	#ifndef STORE_INIT
				53	#define STORE_INIT(src,addr) stxa src, [addr] %asi
				54	#endif
				55	/* FUNC_NAME is the exported symbol; PREAMBLE is expanded near the top of the function and is empty by default. */
				56	#ifndef FUNC_NAME
				57	#define FUNC_NAME NGmemcpy
				58	#endif
				59
				60	#ifndef PREAMBLE
				61	#define PREAMBLE
				62	#endif
				63	/* XCC names the condition-code set tested by the %XCC branches; the default is xcc. */
				64	#ifndef XCC
				65	#define XCC xcc
				66	#endif
				67
				68	.register %g2,#scratch
				69	.register %g3,#scratch
				70
				71	.text
				72	.align 64
				73	/* Entry: copies len bytes from src to dst and returns the original dst in %o0. Dispatch on len: 0 -> 85f, < 16 -> 80f, < 128 -> 70f, otherwise the block-copy path below. */
				74	.globl FUNC_NAME
				75	.type FUNC_NAME,#function
				76	FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
				77	srlx %o2, 31, %g2
				78	cmp %g2, 0
				79	tne %xcc, 5 /* software trap 5 when len >= 2^31 */
				80	PREAMBLE
				81	mov %o0, GLOBAL_SPARE /* keep dst for the return value */
				82	cmp %o2, 0
				83	be,pn %XCC, 85f
				84	or %o0, %o1, %o3 /* delay slot: %o3 = dst | src */
				85	cmp %o2, 16
				86	blu,a,pn %XCC, 80f
				87	or %o3, %o2, %o3 /* annulled delay slot, runs only when branching to 80f: %o3 = dst | src | len */
				88
				89	/* 2 blocks (128 bytes) is the minimum we can do the block
				90	* copy with. We need to ensure that we'll iterate at least
				91	* once in the block copy loop. At worst we'll need to align
				92	* the destination to a 64-byte boundary which can chew up
				93	* to (64 - 1) bytes from the length before we perform the
				94	* block copy loop.
				95	*/
				96	cmp %o2, (2 * 64)
				97	blu,pt %XCC, 70f
				98	andcc %o3, 0x7, %g0 /* delay slot: test (dst | src) & 7; the first branch at 70 consumes this result */
				99
				100	/* %o0: dst
				101	* %o1: src
				102	* %o2: len (known to be >= 128)
				103	*
				104	* The block copy loops will use %o4/%o5,%g2/%g3 as
				105	* temporaries while copying the data.
				106	*/
				107
				108	LOAD(prefetch, %o1, #one_read)
				109	wr %g0, STORE_ASI, %asi /* %asi = STORE_ASI; undone by RESTORE_ASI at label 60 */
				110
				111	/* Align destination on 64-byte boundary. */
				112	andcc %o0, (64 - 1), %o4
				113	be,pt %XCC, 2f /* dst already 64-byte aligned */
				114	sub %o4, 64, %o4 /* delay slot: %o4 = (dst & 63) - 64 */
				115	sub %g0, %o4, %o4 ! bytes to align dst
				116	sub %o2, %o4, %o2 /* take the alignment bytes out of len */
				117	1: subcc %o4, 1, %o4 /* byte loop; sets the condition tested by the bne below */
				118	EX_LD(LOAD(ldub, %o1, %g1))
				119	EX_ST(STORE(stb, %g1, %o0))
				120	add %o1, 1, %o1
				121	bne,pt %XCC, 1b
				122	add %o0, 1, %o0 /* delay slot */
				123
				124	/* If the source is on a 16-byte boundary we can do
				125	* the direct block copy loop. If it is 8-byte aligned
				126	* we can do the 16-byte loads offset by -8 bytes and the
				127	* init stores offset by one register.
				128	*
				129	* If the source is not even 8-byte aligned, we need to do
				130	* shifting and masking (basically integer faligndata).
				131	*
				132	* The careful bit with init stores is that if we store
				133	* to any part of the cache line we have to store the whole
				134	* cacheline else we can end up with corrupt L2 cache line
				135	* contents. Since the loop works on 64-bytes of 64-byte
				136	* aligned store data at a time, this is easy to ensure.
				137	*/
				138	2:
				139	andcc %o1, (16 - 1), %o4
				140	andn %o2, (64 - 1), %g1 ! block copy loop iterator
				141	sub %o2, %g1, %o2 ! final sub-block copy bytes
				142	be,pt %XCC, 50f /* src & 15 == 0 */
				143	cmp %o4, 8 /* delay slot: is src & 15 exactly 8? */
				144	be,a,pt %XCC, 10f
				145	sub %o1, 0x8, %o1 /* annulled delay slot, runs only when branching to 10f: back src up by 8 */
				146
				147	/* Neither 8-byte nor 16-byte aligned, shift and mask. */
				148	mov %g1, %o4 /* %o4 = block byte count, the loop counter */
				149	and %o1, 0x7, %g1
				150	sll %g1, 3, %g1 /* %g1 = (src & 7) * 8, left-shift count in bits */
				151	mov 64, %o3
				152	andn %o1, 0x7, %o1 /* round src down to an 8-byte boundary */
				153	EX_LD(LOAD(ldx, %o1, %g2))
				154	sub %o3, %g1, %o3 /* %o3 = 64 - %g1, right-shift count in bits */
				155	sllx %g2, %g1, %g2 /* %g2 = carried bits from the first dword */
				156	/* SWIVEL_ONE_DWORD: load a dword from SRC, store (dword >> PRE_SHIFT) | PRE_VAL to DST, then leave (dword << POST_SHIFT) in PRE_VAL for the next use. */
				157	#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
				158	EX_LD(LOAD(ldx, SRC, TMP1)); \
				159	srlx TMP1, PRE_SHIFT, TMP2; \
				160	or TMP2, PRE_VAL, TMP2; \
				161	EX_ST(STORE_INIT(TMP2, DST)); \
				162	sllx TMP1, POST_SHIFT, PRE_VAL;
				163	/* Each pass performs eight aligned loads and eight init stores (one 64-byte dst block), with one prefetch after the fourth store. */
				164	1: add %o1, 0x8, %o1
				165	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
				166	add %o1, 0x8, %o1
				167	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
				168	add %o1, 0x8, %o1
				169	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
				170	add %o1, 0x8, %o1
				171	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
				172	add %o1, 32, %o1
				173	LOAD(prefetch, %o1, #one_read)
				174	sub %o1, 32 - 8, %o1 /* undo the +32 and step to the next dword in one go */
				175	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
				176	add %o1, 8, %o1
				177	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
				178	add %o1, 8, %o1
				179	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
				180	add %o1, 8, %o1
				181	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
				182	subcc %o4, 64, %o4
				183	bne,pt %XCC, 1b
				184	add %o0, 64, %o0 /* delay slot: advance dst by one block */
				185
				186	#undef SWIVEL_ONE_DWORD
				187
				188	srl %g1, 3, %g1 /* shift count back to a byte offset, src & 7 */
				189	ba,pt %XCC, 60f
				190	add %o1, %g1, %o1 /* delay slot: re-apply the byte offset removed by the andn above */
				191
				192	10: /* Destination is 64-byte aligned, source was only 8-byte
				193	* aligned but it has been subtracted by 8 and we perform
				194	* one twin load ahead, then add 8 back into source when
				195	* we finish the loop.
				196	*/
				197	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
				198	1: add %o1, 16, %o1
				199	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
				200	add %o1, 16 + 32, %o1
				201	LOAD(prefetch, %o1, #one_read)
				202	sub %o1, 32, %o1 /* undo the extra 32 added for the prefetch address */
				203	EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line
				204	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
				205	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
				206	add %o1, 16, %o1
				207	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
				208	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
				209	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
				210	add %o1, 16, %o1
				211	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
				212	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
				213	EX_LD(LOAD_TWIN(%o1, %o4, %o5)) /* load-ahead for the next pass; no add follows, the loop head adds 16 */
				214	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
				215	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
				216	subcc %g1, 64, %g1
				217	bne,pt %XCC, 1b
				218	add %o0, 64, %o0 /* delay slot: advance dst by one block */
				219
				220	ba,pt %XCC, 60f
				221	add %o1, 0x8, %o1 /* delay slot: add back the 8 subtracted before entering at 10 */
				222
				223	50: /* Destination is 64-byte aligned, and source is 16-byte
				224	* aligned.
				225	*/
				226	1: EX_LD(LOAD_TWIN(%o1, %o4, %o5))
				227	add %o1, 16, %o1
				228	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
				229	add %o1, 16 + 32, %o1
				230	LOAD(prefetch, %o1, #one_read)
				231	sub %o1, 32, %o1 /* undo the extra 32 added for the prefetch address */
				232	EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line
				233	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
				234	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
				235	add %o1, 16, %o1
				236	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
				237	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
				238	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
				239	add %o1, 16, %o1
				240	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
				241	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
				242	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
				243	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
				244	subcc %g1, 64, %g1
				245	bne,pt %XCC, 1b
				246	add %o0, 64, %o0 /* delay slot: advance dst by one block */
				247	/* fall through */
				248
				249	60:
				250	/* %o2 contains any final bytes still needed to be copied
				251	* over. If anything is left, we copy it one byte at a time.
				252	*/
David S. Miller	0d4bc95	2006-02-11 10:30:41 -0800	[diff] [blame]	253	RESTORE_ASI(%o3)
David S. Miller	398d108	2006-03-05 16:41:56 -0800	[diff] [blame]	254	brz,pt %o2, 85f
				255	sub %o0, %o1, %o3 /* delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst in the byte loop at 90 */
				256	ba,a,pt %XCC, 90f /* annulled unconditional branch: the following instruction is not executed */
				257
				258	.align 64
				259	70: /* 16 <= len < 128 */
				260	bne,pn %XCC, 75f /* tests (dst | src) & 7 from the andcc in the delay slot of the branch that came here */
				261	sub %o0, %o1, %o3 /* delay slot: %o3 = dst - src; stores below address dst as [%o1 + %o3] */
				262
				263	72:
				264	andn %o2, 0xf, %o4 /* %o4 = bytes to copy 16 at a time */
				265	and %o2, 0xf, %o2 /* %o2 = remaining 0..15 bytes */
				266	1: subcc %o4, 0x10, %o4
				267	EX_LD(LOAD(ldx, %o1, %o5))
				268	add %o1, 0x08, %o1
				269	EX_LD(LOAD(ldx, %o1, %g1))
				270	sub %o1, 0x08, %o1
				271	EX_ST(STORE(stx, %o5, %o1 + %o3))
				272	add %o1, 0x8, %o1
				273	EX_ST(STORE(stx, %g1, %o1 + %o3))
				274	bgu,pt %XCC, 1b /* condition still comes from the subcc at the loop head */
				275	add %o1, 0x8, %o1 /* delay slot */
				276	73: andcc %o2, 0x8, %g0 /* one 8-byte piece left? */
				277	be,pt %XCC, 1f
				278	nop
				279	sub %o2, 0x8, %o2
				280	EX_LD(LOAD(ldx, %o1, %o5))
				281	EX_ST(STORE(stx, %o5, %o1 + %o3))
				282	add %o1, 0x8, %o1
				283	1: andcc %o2, 0x4, %g0 /* one 4-byte piece left? */
				284	be,pt %XCC, 1f
				285	nop
				286	sub %o2, 0x4, %o2
				287	EX_LD(LOAD(lduw, %o1, %o5))
				288	EX_ST(STORE(stw, %o5, %o1 + %o3))
				289	add %o1, 0x4, %o1
				290	1: cmp %o2, 0
				291	be,pt %XCC, 85f /* nothing left: return */
				292	nop
				293	ba,pt %xcc, 90f /* otherwise finish in the byte loop */
				294	nop
				295
				296	75: /* reached from 70 when (dst | src) & 7 != 0; %o3 = dst - src on entry */
				297	andcc %o0, 0x7, %g1
				298	sub %g1, 0x8, %g1
				299	be,pn %icc, 2f /* dst already 8-byte aligned (condition from the andcc above) */
				300	sub %g0, %g1, %g1 /* delay slot: %g1 = 8 - (dst & 7) */
				301	sub %o2, %g1, %o2 /* take the alignment bytes out of len */
				302
				303	1: subcc %g1, 1, %g1
				304	EX_LD(LOAD(ldub, %o1, %o5))
				305	EX_ST(STORE(stb, %o5, %o1 + %o3))
				306	bgu,pt %icc, 1b
				307	add %o1, 1, %o1 /* delay slot */
				308
				309	2: add %o1, %o3, %o0 /* %o0 = src + (dst - src), the current dst */
				310	andcc %o1, 0x7, %g1
				311	bne,pt %icc, 8f /* src not 8-byte aligned: shift-and-merge loop */
				312	sll %g1, 3, %g1 /* delay slot: %g1 = (src & 7) * 8, left-shift count in bits */
				313
				314	cmp %o2, 16
				315	bgeu,pt %icc, 72b /* both aligned and at least 16 bytes left */
				316	nop
				317	ba,a,pt %xcc, 73b /* annulled unconditional branch: fewer than 16 bytes left */
				318
				319	8: mov 64, %o3
				320	andn %o1, 0x7, %o1 /* round src down to an 8-byte boundary */
				321	EX_LD(LOAD(ldx, %o1, %g2))
				322	sub %o3, %g1, %o3 /* %o3 = 64 - %g1, right-shift count in bits */
				323	andn %o2, 0x7, %o4 /* %o4 = bytes to copy 8 at a time */
				324	sllx %g2, %g1, %g2 /* %g2 = carried bits from the first dword */
				325	1: add %o1, 0x8, %o1
				326	EX_LD(LOAD(ldx, %o1, %g3))
				327	subcc %o4, 0x8, %o4
				328	srlx %g3, %o3, %o5
				329	or %o5, %g2, %o5
				330	EX_ST(STORE(stx, %o5, %o0))
				331	add %o0, 0x8, %o0
				332	bgu,pt %icc, 1b
				333	sllx %g3, %g1, %g2 /* delay slot: carry the rest of this dword into the next store */
				334
				335	srl %g1, 3, %g1 /* shift count back to a byte offset, src & 7 */
				336	andcc %o2, 0x7, %o2
				337	be,pn %icc, 85f
				338	add %o1, %g1, %o1 /* delay slot: re-apply the byte offset removed by the andn above */
				339	ba,pt %xcc, 90f
				340	sub %o0, %o1, %o3 /* delay slot: recompute dst - src for the byte loop at 90 */
				341
				342	.align 64
				343	80: /* 0 < len < 16 */
				344	andcc %o3, 0x3, %g0 /* %o3 = dst | src | len, set in the annulled delay slot of the branch that came here */
				345	bne,pn %XCC, 90f /* something is not a multiple of 4: copy byte by byte */
				346	sub %o0, %o1, %o3 /* delay slot: %o3 = dst - src */
				347
				348	1:
				349	subcc %o2, 4, %o2
				350	EX_LD(LOAD(lduw, %o1, %g1))
				351	EX_ST(STORE(stw, %g1, %o1 + %o3))
				352	bgu,pt %XCC, 1b
				353	add %o1, 4, %o1 /* delay slot */
				354
				355	85: retl
				356	mov EX_RETVAL(GLOBAL_SPARE), %o0 /* delay slot: return value built from the dst saved at entry */
				357
				358	.align 32
				359	90: /* byte loop; every path that jumps here has set %o3 = dst - src */
				360	subcc %o2, 1, %o2
				361	EX_LD(LOAD(ldub, %o1, %g1))
				362	EX_ST(STORE(stb, %g1, %o1 + %o3))
				363	bgu,pt %XCC, 90b
				364	add %o1, 1, %o1 /* delay slot */
				365	retl
				366	mov EX_RETVAL(GLOBAL_SPARE), %o0 /* delay slot: return value built from the dst saved at entry */
				367
				368	.size FUNC_NAME, .-FUNC_NAME