Blame - arch/xtensa/lib/memcopy.S - kernel/msm-4.9

blob: b1c219acabe7a56b2bd06b5ff3f42e63505e9031 [file] [log] [blame]

Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	1	/*
				2	* arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
				3	* xthal_memcpy and xthal_bcopy
				4	*
				5	* This file is subject to the terms and conditions of the GNU General Public
				6	* License. See the file "COPYING" in the main directory of this archive
				7	* for more details.
				8	*
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	9	* Copyright (C) 2002 - 2012 Tensilica Inc.
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	10	*/
				11
Chris Zankel	367b811	2008-11-06 06:40:46 -0800	[diff] [blame]	12	#include <variant/core.h>
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	13
				14	.macro src_b r, w0, w1
				15	#ifdef __XTENSA_EB__
				16	src \r, \w0, \w1
				17	#else
				18	src \r, \w1, \w0
				19	#endif
				20	.endm
				21
				22	.macro ssa8 r
				23	#ifdef __XTENSA_EB__
				24	ssa8b \r
				25	#else
				26	ssa8l \r
				27	#endif
				28	.endm
				29
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	30	/*
				31	* void *memcpy(void *dst, const void *src, size_t len);
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	32	*
				33	* This function is intended to do the same thing as the standard
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	34	* library function memcpy() for most cases.
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	35	* However, where the source and/or destination references
				36	* an instruction RAM or ROM or a data RAM or ROM, that
				37	* source and/or destination will always be accessed with
				38	* 32-bit load and store instructions (as required for these
				39	* types of devices).
				40	*
				41	* !!!!!!! XTFIXME:
				42	* !!!!!!! Handling of IRAM/IROM has not yet
				43	* !!!!!!! been implemented.
				44	*
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	45	* The (general case) algorithm is as follows:
				46	* If destination is unaligned, align it by conditionally
				47	* copying 1 and 2 bytes.
				48	* If source is aligned,
				49	* do 16 bytes with a loop, and then finish up with
				50	* 8, 4, 2, and 1 byte copies conditional on the length;
				51	* else (if source is unaligned),
				52	* do the same, but use SRC to align the source data.
				53	* This code tries to use fall-through branches for the common
				54	* case of aligned source and destination and multiple
				55	* of 4 (or 8) length.
				56	*
				57	* Register use:
				58	* a0/ return address
				59	* a1/ stack pointer
				60	* a2/ return value
				61	* a3/ src
				62	* a4/ length
				63	* a5/ dst
				64	* a6/ tmp
				65	* a7/ tmp
				66	* a8/ tmp
				67	* a9/ tmp
				68	* a10/ tmp
				69	* a11/ tmp
				70	*/
				71
				72	.text
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	73
				74	/*
				75	* Byte by byte copy
				76	*/
				77	.align 4
				78	.byte 0 # 1 mod 4 alignment for LOOPNEZ
				79	# (0 mod 4 alignment for LBEG)
				80	.Lbytecopy:
				81	#if XCHAL_HAVE_LOOPS
				82	loopnez a4, .Lbytecopydone
				83	#else /* !XCHAL_HAVE_LOOPS */
				84	beqz a4, .Lbytecopydone
				85	add a7, a3, a4 # a7 = end address for source
				86	#endif /* !XCHAL_HAVE_LOOPS */
				87	.Lnextbyte:
				88	l8ui a6, a3, 0
				89	addi a3, a3, 1
				90	s8i a6, a5, 0
				91	addi a5, a5, 1
				92	#if !XCHAL_HAVE_LOOPS
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	93	bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	94	#endif /* !XCHAL_HAVE_LOOPS */
				95	.Lbytecopydone:
				96	retw
				97
				98	/*
				99	* Destination is unaligned
				100	*/
				101
				102	.align 4
				103	.Ldst1mod2: # dst is only byte aligned
				104	_bltui a4, 7, .Lbytecopy # do short copies byte by byte
				105
				106	# copy 1 byte
				107	l8ui a6, a3, 0
				108	addi a3, a3, 1
				109	addi a4, a4, -1
				110	s8i a6, a5, 0
				111	addi a5, a5, 1
				112	_bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
				113	# return to main algorithm
				114	.Ldst2mod4: # dst 16-bit aligned
				115	# copy 2 bytes
				116	_bltui a4, 6, .Lbytecopy # do short copies byte by byte
				117	l8ui a6, a3, 0
				118	l8ui a7, a3, 1
				119	addi a3, a3, 2
				120	addi a4, a4, -2
				121	s8i a6, a5, 0
				122	s8i a7, a5, 1
				123	addi a5, a5, 2
				124	j .Ldstaligned # dst is now aligned, return to main algorithm
				125
				126	.align 4
				127	.global memcpy
				128	.type memcpy,@function
				129	memcpy:
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	130
				131	entry sp, 16 # minimal stack frame
				132	# a2/ dst, a3/ src, a4/ len
				133	mov a5, a2 # copy dst so that a2 is return value
				134	.Lcommon:
				135	_bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2
				136	_bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4
				137	.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
				138	srli a7, a4, 4 # number of loop iterations with 16B
				139	# per iteration
				140	movi a8, 3 # if source is not aligned,
				141	_bany a3, a8, .Lsrcunaligned # then use shifting copy
				142	/*
				143	* Destination and source are word-aligned, use word copy.
				144	*/
				145	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
				146	#if XCHAL_HAVE_LOOPS
				147	loopnez a7, .Loop1done
				148	#else /* !XCHAL_HAVE_LOOPS */
				149	beqz a7, .Loop1done
				150	slli a8, a7, 4
				151	add a8, a8, a3 # a8 = end of last 16B source chunk
				152	#endif /* !XCHAL_HAVE_LOOPS */
				153	.Loop1:
				154	l32i a6, a3, 0
				155	l32i a7, a3, 4
				156	s32i a6, a5, 0
				157	l32i a6, a3, 8
				158	s32i a7, a5, 4
				159	l32i a7, a3, 12
				160	s32i a6, a5, 8
				161	addi a3, a3, 16
				162	s32i a7, a5, 12
				163	addi a5, a5, 16
				164	#if !XCHAL_HAVE_LOOPS
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	165	bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	166	#endif /* !XCHAL_HAVE_LOOPS */
				167	.Loop1done:
				168	bbci.l a4, 3, .L2
				169	# copy 8 bytes
				170	l32i a6, a3, 0
				171	l32i a7, a3, 4
				172	addi a3, a3, 8
				173	s32i a6, a5, 0
				174	s32i a7, a5, 4
				175	addi a5, a5, 8
				176	.L2:
				177	bbsi.l a4, 2, .L3
				178	bbsi.l a4, 1, .L4
				179	bbsi.l a4, 0, .L5
				180	retw
				181	.L3:
				182	# copy 4 bytes
				183	l32i a6, a3, 0
				184	addi a3, a3, 4
				185	s32i a6, a5, 0
				186	addi a5, a5, 4
				187	bbsi.l a4, 1, .L4
				188	bbsi.l a4, 0, .L5
				189	retw
				190	.L4:
				191	# copy 2 bytes
				192	l16ui a6, a3, 0
				193	addi a3, a3, 2
				194	s16i a6, a5, 0
				195	addi a5, a5, 2
				196	bbsi.l a4, 0, .L5
				197	retw
				198	.L5:
				199	# copy 1 byte
				200	l8ui a6, a3, 0
				201	s8i a6, a5, 0
				202	retw
				203
				204	/*
				205	* Destination is aligned, Source is unaligned
				206	*/
				207
				208	.align 4
				209	.Lsrcunaligned:
				210	_beqz a4, .Ldone # avoid loading anything for zero-length copies
				211	# copy 16 bytes per iteration for word-aligned dst and unaligned src
				212	ssa8 a3 # set shift amount from byte offset
Chris Zankel	c4c4594	2012-11-28 16:53:51 -0800	[diff] [blame]	213
				214	/* set to 1 when running on ISS (simulator) with the
				215	lint or ferret client, or 0 to save a few cycles */
				216	#define SIM_CHECKS_ALIGNMENT 1
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	217	#if XCHAL_UNALIGNED_LOAD_EXCEPTION \|\| SIM_CHECKS_ALIGNMENT
				218	and a11, a3, a8 # save unalignment offset for below
				219	sub a3, a3, a11 # align a3
				220	#endif
				221	l32i a6, a3, 0 # load first word
				222	#if XCHAL_HAVE_LOOPS
				223	loopnez a7, .Loop2done
				224	#else /* !XCHAL_HAVE_LOOPS */
				225	beqz a7, .Loop2done
				226	slli a10, a7, 4
				227	add a10, a10, a3 # a10 = end of last 16B source chunk
				228	#endif /* !XCHAL_HAVE_LOOPS */
				229	.Loop2:
				230	l32i a7, a3, 4
				231	l32i a8, a3, 8
				232	src_b a6, a6, a7
				233	s32i a6, a5, 0
				234	l32i a9, a3, 12
				235	src_b a7, a7, a8
				236	s32i a7, a5, 4
				237	l32i a6, a3, 16
				238	src_b a8, a8, a9
				239	s32i a8, a5, 8
				240	addi a3, a3, 16
				241	src_b a9, a9, a6
				242	s32i a9, a5, 12
				243	addi a5, a5, 16
				244	#if !XCHAL_HAVE_LOOPS
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	245	bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	246	#endif /* !XCHAL_HAVE_LOOPS */
				247	.Loop2done:
				248	bbci.l a4, 3, .L12
				249	# copy 8 bytes
				250	l32i a7, a3, 4
				251	l32i a8, a3, 8
				252	src_b a6, a6, a7
				253	s32i a6, a5, 0
				254	addi a3, a3, 8
				255	src_b a7, a7, a8
				256	s32i a7, a5, 4
				257	addi a5, a5, 8
				258	mov a6, a8
				259	.L12:
				260	bbci.l a4, 2, .L13
				261	# copy 4 bytes
				262	l32i a7, a3, 4
				263	addi a3, a3, 4
				264	src_b a6, a6, a7
				265	s32i a6, a5, 0
				266	addi a5, a5, 4
				267	mov a6, a7
				268	.L13:
				269	#if XCHAL_UNALIGNED_LOAD_EXCEPTION \|\| SIM_CHECKS_ALIGNMENT
				270	add a3, a3, a11 # readjust a3 with correct misalignment
				271	#endif
				272	bbsi.l a4, 1, .L14
				273	bbsi.l a4, 0, .L15
				274	.Ldone: retw
				275	.L14:
				276	# copy 2 bytes
				277	l8ui a6, a3, 0
				278	l8ui a7, a3, 1
				279	addi a3, a3, 2
				280	s8i a6, a5, 0
				281	s8i a7, a5, 1
				282	addi a5, a5, 2
				283	bbsi.l a4, 0, .L15
				284	retw
				285	.L15:
				286	# copy 1 byte
				287	l8ui a6, a3, 0
				288	s8i a6, a5, 0
				289	retw
Chris Zankel	eae8a41	2012-10-15 21:41:19 -0700	[diff] [blame]	290
				291
				292	/*
				293	* void bcopy(const void *src, void *dest, size_t n);
				294	*/
				295	.align 4
				296	.global bcopy
				297	.type bcopy,@function
				298	bcopy:
				299	entry sp, 16 # minimal stack frame
				300	# a2=src, a3=dst, a4=len
				301	mov a5, a3
				302	mov a3, a2
				303	mov a2, a5
				304	j .Lmovecommon # go to common code for memmove+bcopy
				305
				306	/*
				307	* void *memmove(void *dst, const void *src, size_t len);
				308	*
				309	* This function is intended to do the same thing as the standard
				310	* library function memmove() for most cases.
				311	* However, where the source and/or destination references
				312	* an instruction RAM or ROM or a data RAM or ROM, that
				313	* source and/or destination will always be accessed with
				314	* 32-bit load and store instructions (as required for these
				315	* types of devices).
				316	*
				317	* !!!!!!! XTFIXME:
				318	* !!!!!!! Handling of IRAM/IROM has not yet
				319	* !!!!!!! been implemented.
				320	*
				321	* The (general case) algorithm is as follows:
				322	* If end of source doesn't overlap destination then use memcpy.
				323	* Otherwise do memcpy backwards.
				324	*
				325	* Register use:
				326	* a0/ return address
				327	* a1/ stack pointer
				328	* a2/ return value
				329	* a3/ src
				330	* a4/ length
				331	* a5/ dst
				332	* a6/ tmp
				333	* a7/ tmp
				334	* a8/ tmp
				335	* a9/ tmp
				336	* a10/ tmp
				337	* a11/ tmp
				338	*/
				339
				340	/*
				341	* Byte by byte copy
				342	*/
				343	.align 4
				344	.byte 0 # 1 mod 4 alignment for LOOPNEZ
				345	# (0 mod 4 alignment for LBEG)
				346	.Lbackbytecopy:
				347	#if XCHAL_HAVE_LOOPS
				348	loopnez a4, .Lbackbytecopydone
				349	#else /* !XCHAL_HAVE_LOOPS */
				350	beqz a4, .Lbackbytecopydone
				351	sub a7, a3, a4 # a7 = start address for source
				352	#endif /* !XCHAL_HAVE_LOOPS */
				353	.Lbacknextbyte:
				354	addi a3, a3, -1
				355	l8ui a6, a3, 0
				356	addi a5, a5, -1
				357	s8i a6, a5, 0
				358	#if !XCHAL_HAVE_LOOPS
				359	bne a3, a7, .Lbacknextbyte # continue loop if
				360	# $a3:src != $a7:src_start
				361	#endif /* !XCHAL_HAVE_LOOPS */
				362	.Lbackbytecopydone:
				363	retw
				364
				365	/*
				366	* Destination is unaligned
				367	*/
				368
				369	.align 4
				370	.Lbackdst1mod2: # dst is only byte aligned
				371	_bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
				372
				373	# copy 1 byte
				374	addi a3, a3, -1
				375	l8ui a6, a3, 0
				376	addi a5, a5, -1
				377	s8i a6, a5, 0
				378	addi a4, a4, -1
				379	_bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
				380	# return to main algorithm
				381	.Lbackdst2mod4: # dst 16-bit aligned
				382	# copy 2 bytes
				383	_bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
				384	addi a3, a3, -2
				385	l8ui a6, a3, 0
				386	l8ui a7, a3, 1
				387	addi a5, a5, -2
				388	s8i a6, a5, 0
				389	s8i a7, a5, 1
				390	addi a4, a4, -2
				391	j .Lbackdstaligned # dst is now aligned,
				392	# return to main algorithm
				393
				394	.align 4
				395	.global memmove
				396	.type memmove,@function
				397	memmove:
				398
				399	entry sp, 16 # minimal stack frame
				400	# a2/ dst, a3/ src, a4/ len
				401	mov a5, a2 # copy dst so that a2 is return value
				402	.Lmovecommon:
				403	sub a6, a5, a3
				404	bgeu a6, a4, .Lcommon
				405
				406	add a5, a5, a4
				407	add a3, a3, a4
				408
				409	_bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
				410	_bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
				411	.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
				412	srli a7, a4, 4 # number of loop iterations with 16B
				413	# per iteration
				414	movi a8, 3 # if source is not aligned,
				415	_bany a3, a8, .Lbacksrcunaligned # then use shifting copy
				416	/*
				417	* Destination and source are word-aligned, use word copy.
				418	*/
				419	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
				420	#if XCHAL_HAVE_LOOPS
				421	loopnez a7, .backLoop1done
				422	#else /* !XCHAL_HAVE_LOOPS */
				423	beqz a7, .backLoop1done
				424	slli a8, a7, 4
				425	sub a8, a3, a8 # a8 = start of first 16B source chunk
				426	#endif /* !XCHAL_HAVE_LOOPS */
				427	.backLoop1:
				428	addi a3, a3, -16
				429	l32i a7, a3, 12
				430	l32i a6, a3, 8
				431	addi a5, a5, -16
				432	s32i a7, a5, 12
				433	l32i a7, a3, 4
				434	s32i a6, a5, 8
				435	l32i a6, a3, 0
				436	s32i a7, a5, 4
				437	s32i a6, a5, 0
				438	#if !XCHAL_HAVE_LOOPS
				439	bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
				440	#endif /* !XCHAL_HAVE_LOOPS */
				441	.backLoop1done:
				442	bbci.l a4, 3, .Lback2
				443	# copy 8 bytes
				444	addi a3, a3, -8
				445	l32i a6, a3, 0
				446	l32i a7, a3, 4
				447	addi a5, a5, -8
				448	s32i a6, a5, 0
				449	s32i a7, a5, 4
				450	.Lback2:
				451	bbsi.l a4, 2, .Lback3
				452	bbsi.l a4, 1, .Lback4
				453	bbsi.l a4, 0, .Lback5
				454	retw
				455	.Lback3:
				456	# copy 4 bytes
				457	addi a3, a3, -4
				458	l32i a6, a3, 0
				459	addi a5, a5, -4
				460	s32i a6, a5, 0
				461	bbsi.l a4, 1, .Lback4
				462	bbsi.l a4, 0, .Lback5
				463	retw
				464	.Lback4:
				465	# copy 2 bytes
				466	addi a3, a3, -2
				467	l16ui a6, a3, 0
				468	addi a5, a5, -2
				469	s16i a6, a5, 0
				470	bbsi.l a4, 0, .Lback5
				471	retw
				472	.Lback5:
				473	# copy 1 byte
				474	addi a3, a3, -1
				475	l8ui a6, a3, 0
				476	addi a5, a5, -1
				477	s8i a6, a5, 0
				478	retw
				479
				480	/*
				481	* Destination is aligned, Source is unaligned
				482	*/
				483
				484	.align 4
				485	.Lbacksrcunaligned:
				486	_beqz a4, .Lbackdone # avoid loading anything for zero-length copies
				487	# copy 16 bytes per iteration for word-aligned dst and unaligned src
				488	ssa8 a3 # set shift amount from byte offset
				489	#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
				490	* the lint or ferret client, or 0
				491	* to save a few cycles */
				492	#if XCHAL_UNALIGNED_LOAD_EXCEPTION \|\| SIM_CHECKS_ALIGNMENT
				493	and a11, a3, a8 # save unalignment offset for below
				494	sub a3, a3, a11 # align a3
				495	#endif
				496	l32i a6, a3, 0 # load first word
				497	#if XCHAL_HAVE_LOOPS
				498	loopnez a7, .backLoop2done
				499	#else /* !XCHAL_HAVE_LOOPS */
				500	beqz a7, .backLoop2done
				501	slli a10, a7, 4
				502	sub a10, a3, a10 # a10 = start of first 16B source chunk
				503	#endif /* !XCHAL_HAVE_LOOPS */
				504	.backLoop2:
				505	addi a3, a3, -16
				506	l32i a7, a3, 12
				507	l32i a8, a3, 8
				508	addi a5, a5, -16
				509	src_b a6, a7, a6
				510	s32i a6, a5, 12
				511	l32i a9, a3, 4
				512	src_b a7, a8, a7
				513	s32i a7, a5, 8
				514	l32i a6, a3, 0
				515	src_b a8, a9, a8
				516	s32i a8, a5, 4
				517	src_b a9, a6, a9
				518	s32i a9, a5, 0
				519	#if !XCHAL_HAVE_LOOPS
				520	bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
				521	#endif /* !XCHAL_HAVE_LOOPS */
				522	.backLoop2done:
				523	bbci.l a4, 3, .Lback12
				524	# copy 8 bytes
				525	addi a3, a3, -8
				526	l32i a7, a3, 4
				527	l32i a8, a3, 0
				528	addi a5, a5, -8
				529	src_b a6, a7, a6
				530	s32i a6, a5, 4
				531	src_b a7, a8, a7
				532	s32i a7, a5, 0
				533	mov a6, a8
				534	.Lback12:
				535	bbci.l a4, 2, .Lback13
				536	# copy 4 bytes
				537	addi a3, a3, -4
				538	l32i a7, a3, 0
				539	addi a5, a5, -4
				540	src_b a6, a7, a6
				541	s32i a6, a5, 0
				542	mov a6, a7
				543	.Lback13:
				544	#if XCHAL_UNALIGNED_LOAD_EXCEPTION \|\| SIM_CHECKS_ALIGNMENT
				545	add a3, a3, a11 # readjust a3 with correct misalignment
				546	#endif
				547	bbsi.l a4, 1, .Lback14
				548	bbsi.l a4, 0, .Lback15
				549	.Lbackdone:
				550	retw
				551	.Lback14:
				552	# copy 2 bytes
				553	addi a3, a3, -2
				554	l8ui a6, a3, 0
				555	l8ui a7, a3, 1
				556	addi a5, a5, -2
				557	s8i a6, a5, 0
				558	s8i a7, a5, 1
				559	bbsi.l a4, 0, .Lback15
				560	retw
				561	.Lback15:
				562	# copy 1 byte
				563	addi a3, a3, -1
				564	addi a5, a5, -1
				565	l8ui a6, a3, 0
				566	s8i a6, a5, 0
				567	retw
				568
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	569
				570	/*
				571	* Local Variables:
				572	* mode:fundamental
				573	* comment-start: "# "
				574	* comment-start-skip: "# *"
				575	* End:
				576	*/