Blame - libc/arch-arm/generic/bionic/memcpy.S - fp2-dev/platform/bionic

blob: cd4a13d128a0ec5691ed11b77d399d75aff209d1 [file] [log] [blame]

Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 The Android Open Source Project
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in
				12	* the documentation and/or other materials provided with the
				13	* distribution.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				16	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				17	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				18	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				19	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				20	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				21	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				22	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				23	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				24	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				25	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				26	* SUCH DAMAGE.
				27	*/
				28
				29	#include <machine/cpu-features.h>
Elliott Hughes	851e68a	2014-02-19 16:53:20 -0800	[diff] [blame]	30	#include <private/bionic_asm.h>
				31	#include <private/libc_events.h>
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	32
				33	/*
				34	* Optimized memcpy() for ARM.
				35	*
				36	* note that memcpy() always returns the destination pointer,
				37	* so we have to preserve R0.
				38	*/
				39
Christopher Ferris	59a13c1	2013-08-01 13:13:33 -0700	[diff] [blame]	40	ENTRY(__memcpy_chk) /* fortified entry: r0 = dst, r1 = src, r2 = length, r3 = size of the dst buffer */
				41	cmp r2, r3
				42	bhi fortify_check_failed /* unsigned compare: both are size_t, a length >= 2^31 must not look negative */
				43
				44	// Fall through to memcpy...
				45	END(__memcpy_chk)
				46
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	47	ENTRY(memcpy)
				48	/* The stack must always be 64-bits aligned to be compliant with the
				49	* ARM ABI. Since we have to save R0, we might as well save R4
				50	* which we can use for better pipelining of the reads below.
				51	* Register use below: r0 = dst, r1 = src, r2 = length in bytes. */
				52	.save {r0, r4, lr}
				53	stmfd sp!, {r0, r4, lr}
				54	/* Making room for r5-r11 which will be spilled later */
				55	.pad #28
				56	sub sp, sp, #28 /* 7 registers * 4 bytes; 12 + 28 = 40 bytes pushed in total */
				57
				58	// preload the destination because we'll align it to a cache line
				59	// with small writes. Also start the source "pump".
Elliott Hughes	c54ca40	2013-12-13 12:17:13 -0800	[diff] [blame]	60	pld [r0, #0]
				61	pld [r1, #0]
				62	pld [r1, #32]
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	63
				64	/* it simplifies things to take care of len<4 early */
				65	cmp r2, #4
				66	blo copy_last_3_and_return /* r5-r11 are untouched on this path */
				67
				68	/* compute the offset to align the source
				69	* offset = (4-(src&3))&3 = -src & 3
				70	*/
				71	rsb r3, r1, #0
				72	ands r3, r3, #3 /* r3 = bytes needed to word-align src (0..3) */
				73	beq src_aligned
				74
				75	/* align source to 32 bits. We need to insert 2 instructions between
				76	* a ldr[b|h] and str[b|h] because byte and half-word instructions
				77	* stall 2 cycles.
				78	*/
				79	movs r12, r3, lsl #31 /* N = bit 0 of r3, C = bit 1 of r3 */
				80	sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
				81	ldrmib r3, [r1], #1 /* one byte when bit 0 of the offset is set */
				82	ldrcsb r4, [r1], #1 /* two bytes when bit 1 of the offset is set */
				83	ldrcsb r12,[r1], #1
				84	strmib r3, [r0], #1
				85	strcsb r4, [r0], #1
				86	strcsb r12,[r0], #1
				87
				88	src_aligned:
				89
				90	/* see if src and dst are aligned together (congruent) */
				91	eor r12, r0, r1
				92	tst r12, #3 /* non-zero when the low two address bits of dst and src differ */
				93	bne non_congruent
				94
				95	/* Use post-increment mode for stm to spill r5-r11 to reserved stack
				96	* frame. Don't update sp.
				97	*/
				98	stmea sp, {r5-r11}
				99
				100	/* align the destination to a cache-line */
				101	rsb r3, r0, #0
				102	ands r3, r3, #0x1C /* r3 = bytes (whole words) up to the next 32-byte boundary of dst */
				103	beq congruent_aligned32
				104	cmp r3, r2
				105	andhi r3, r2, #0x1C /* fewer bytes left than that: copy only the whole words available */
				106
				107	/* conditionally copies 0 to 7 words (length in r3) */
				108	movs r12, r3, lsl #28 /* C = bit 4 of r3 (16 bytes), N = bit 3 of r3 (8 bytes) */
				109	ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
				110	ldmmiia r1!, {r8, r9} /* 8 bytes */
				111	stmcsia r0!, {r4, r5, r6, r7}
				112	stmmiia r0!, {r8, r9}
				113	tst r3, #0x4
				114	ldrne r10,[r1], #4 /* 4 bytes */
				115	strne r10,[r0], #4
				116	sub r2, r2, r3 /* r2 = bytes still to copy */
				117
				118	congruent_aligned32:
				119	/*
				120	* here the destination is aligned to 32 bytes (or fewer than 32 bytes remain).
				121	*/
				122
				123	cached_aligned32:
				124	subs r2, r2, #32 /* r2 is kept biased by -32 while the loop below runs */
				125	blo less_than_32_left
				126
				127	/*
				128	* We preload a cache-line up to 64 bytes ahead. On the 926, this will
				129	* stall only until the requested word is fetched, but the linefill
				130	* continues in the background.
				131	* While the linefill is going, we write our previous cache-line
				132	* into the write-buffer (which should have some free space).
				133	* When the linefill is done, the writebuffer will
				134	* start dumping its content into memory
				135	*
				136	* While all this is going, we then load a full cache line into
				137	* 8 registers, this cache line should be in the cache by now
				138	* (or partly in the cache).
				139	*
				140	* This code should work well regardless of the source/dest alignment.
				141	*
				142	*/
				143
				144	// Align the preload register to a cache-line because the cpu does
				145	// "critical word first" (the first word requested is loaded first).
				146	bic r12, r1, #0x1F
				147	add r12, r12, #64
				148
				149	1: ldmia r1!, { r4-r11 }
Elliott Hughes	c54ca40	2013-12-13 12:17:13 -0800	[diff] [blame]	150	pld [r12, #64]
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	151	subs r2, r2, #32
				152
				153	// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
				154	// for ARM9 preload will not be safely guarded by the preceding subs.
				155	// When it is safely guarded the only possibility to have SIGSEGV here
				156	// is because the caller overstates the length.
				157	ldrhi r3, [r12], #32 /* cheap ARM9 preload */
				158	stmia r0!, { r4-r11 }
				159	bhs 1b /* flags are still those of the subs above: loop while a full 32 bytes remain */
				160
				161	add r2, r2, #32 /* remove the -32 bias: r2 = bytes left (0..31) */
				162
				163
				164
				165
				166	less_than_32_left:
				167	/*
				168	* less than 32 bytes left at this point (length = r2 & 0x1F; r2 is still
				169	* biased by -32 when we branched here directly from cached_aligned32) */
				170
				171	/* skip all this if there is nothing to do, which should
				172	* be a common case (if not executed the code below takes
				173	* about 16 cycles)
				174	*/
				175	tst r2, #0x1F
				176	beq 1f
				177
				178	/* conditionally copies 0 to 31 bytes */
				179	movs r12, r2, lsl #28 /* C = bit 4 of r2 (16 bytes), N = bit 3 of r2 (8 bytes) */
				180	ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
				181	ldmmiia r1!, {r8, r9} /* 8 bytes */
				182	stmcsia r0!, {r4, r5, r6, r7}
				183	stmmiia r0!, {r8, r9}
				184	movs r12, r2, lsl #30 /* C = bit 2 of r2 (4 bytes), N = bit 1 of r2 (2 bytes) */
				185	ldrcs r3, [r1], #4 /* 4 bytes */
				186	ldrmih r4, [r1], #2 /* 2 bytes */
				187	strcs r3, [r0], #4
				188	strmih r4, [r0], #2
				189	tst r2, #0x1
				190	ldrneb r3, [r1] /* last byte */
				191	strneb r3, [r0]
				192
				193	/* we're done! restore everything and return */
				194	1: ldmfd sp!, {r5-r11} /* reload the registers spilled by stmea and release the 28-byte pad */
				195	ldmfd sp!, {r0, r4, lr} /* r0 = original dst, which is the return value */
				196	bx lr
				197
				198	/********************************************************************/
				199
				200	non_congruent:
				201	/*
				202	* here source is aligned to 4 bytes
				203	* but destination is not.
				204	*
				205	* in the code below r2 is the number of bytes left to read
				206	* (the number of bytes written is always smaller, because we have
				207	* partial words in the shift queue)
				208	*/
				209	cmp r2, #4
				210	blo copy_last_3_and_return /* r5-r11 have not been spilled or modified yet */
				211
				212	/* Use post-increment mode for stm to spill r5-r11 to reserved stack
				213	* frame. Don't update sp.
				214	*/
				215	stmea sp, {r5-r11}
				216
				217	/* compute shifts needed to align src to dest */
				218	rsb r5, r0, #0
				219	and r5, r5, #3 /* r5 = # bytes in partial words */
				220	mov r12, r5, lsl #3 /* r12 = right */
				221	rsb lr, r12, #32 /* lr = left */
				222
				223	/* read the first word */
				224	ldr r3, [r1], #4
				225	sub r2, r2, #4
				226
				227	/* write a partial word (0 to 3 bytes), such that destination
				228	* becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment)
				229	*/
				230	movs r5, r5, lsl #31 /* N = bit 0 of r5 (1 byte), C = bit 1 of r5 (2 bytes) */
				231	strmib r3, [r0], #1
				232	movmi r3, r3, lsr #8 /* drop the byte just written; r3 keeps the unwritten bytes */
				233	strcsb r3, [r0], #1
				234	movcs r3, r3, lsr #8
				235	strcsb r3, [r0], #1
				236	movcs r3, r3, lsr #8
				237
				238	cmp r2, #4
				239	blo partial_word_tail
				240
				241	/* Align destination to 32 bytes (cache line boundary) */
				242	1: tst r0, #0x1c
				243	beq 2f
				244	ldr r5, [r1], #4
				245	sub r2, r2, #4
				246	orr r4, r3, r5, lsl lr /* output word = leftover bytes in r3 | low bytes of the new word */
				247	mov r3, r5, lsr r12 /* r3 = bytes of the new word not written yet */
				248	str r4, [r0], #4
				249	cmp r2, #4
				250	bhs 1b
				251	blo partial_word_tail /* always taken when reached: bhs above was not */
				252
				253	/* copy 32 bytes at a time */
				254	2: subs r2, r2, #32 /* r2 is kept biased by -32 while the loops below run */
				255	blo less_than_thirtytwo
				256
				257	/* Use immediate mode for the shifts, because there is an extra cycle
				258	* for register shifts, which could account for up to 50% of
				259	* performance hit.
				260	*/
				261
				262	cmp r12, #24 /* r12 = right shift in bits; dispatch on it, otherwise fall into loop16 */
				263	beq loop24
				264	cmp r12, #8
				265	beq loop8
				266
				267	loop16: /* 32 bytes per iteration, right shift = 16 bits, left shift = 16 bits */
				268	ldr r12, [r1], #4 /* first word of the 32-byte group; r12 no longer holds the shift */
				269	1: mov r4, r12
				270	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
Elliott Hughes	c54ca40	2013-12-13 12:17:13 -0800	[diff] [blame]	271	pld [r1, #64]
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	272	subs r2, r2, #32
				273	ldrhs r12, [r1], #4 /* another full group follows: fetch its first word early */
				274	orr r3, r3, r4, lsl #16 /* r3 enters holding the bytes left over from the previous word */
				275	mov r4, r4, lsr #16
				276	orr r4, r4, r5, lsl #16
				277	mov r5, r5, lsr #16
				278	orr r5, r5, r6, lsl #16
				279	mov r6, r6, lsr #16
				280	orr r6, r6, r7, lsl #16
				281	mov r7, r7, lsr #16
				282	orr r7, r7, r8, lsl #16
				283	mov r8, r8, lsr #16
				284	orr r8, r8, r9, lsl #16
				285	mov r9, r9, lsr #16
				286	orr r9, r9, r10, lsl #16
				287	mov r10, r10, lsr #16
				288	orr r10, r10, r11, lsl #16
				289	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
				290	mov r3, r11, lsr #16 /* keep the unwritten upper half of r11 for the next output word */
				291	bhs 1b /* flags are still those of the subs above */
				292	b less_than_thirtytwo
				293
				294	loop8: /* 32 bytes per iteration, right shift = 8 bits, left shift = 24 bits */
				295	ldr r12, [r1], #4 /* first word of the 32-byte group; r12 no longer holds the shift */
				296	1: mov r4, r12
				297	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
Elliott Hughes	c54ca40	2013-12-13 12:17:13 -0800	[diff] [blame]	298	pld [r1, #64]
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	299	subs r2, r2, #32
				300	ldrhs r12, [r1], #4 /* another full group follows: fetch its first word early */
				301	orr r3, r3, r4, lsl #24 /* r3 enters holding the bytes left over from the previous word */
				302	mov r4, r4, lsr #8
				303	orr r4, r4, r5, lsl #24
				304	mov r5, r5, lsr #8
				305	orr r5, r5, r6, lsl #24
				306	mov r6, r6, lsr #8
				307	orr r6, r6, r7, lsl #24
				308	mov r7, r7, lsr #8
				309	orr r7, r7, r8, lsl #24
				310	mov r8, r8, lsr #8
				311	orr r8, r8, r9, lsl #24
				312	mov r9, r9, lsr #8
				313	orr r9, r9, r10, lsl #24
				314	mov r10, r10, lsr #8
				315	orr r10, r10, r11, lsl #24
				316	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
				317	mov r3, r11, lsr #8 /* keep the unwritten upper 24 bits of r11 for the next output word */
				318	bhs 1b /* flags are still those of the subs above */
				319	b less_than_thirtytwo
				320
				321	loop24: /* 32 bytes per iteration, right shift = 24 bits, left shift = 8 bits */
				322	ldr r12, [r1], #4 /* first word of the 32-byte group; r12 no longer holds the shift */
				323	1: mov r4, r12
				324	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
Elliott Hughes	c54ca40	2013-12-13 12:17:13 -0800	[diff] [blame]	325	pld [r1, #64]
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	326	subs r2, r2, #32
				327	ldrhs r12, [r1], #4 /* another full group follows: fetch its first word early */
				328	orr r3, r3, r4, lsl #8 /* r3 enters holding the byte left over from the previous word */
				329	mov r4, r4, lsr #24
				330	orr r4, r4, r5, lsl #8
				331	mov r5, r5, lsr #24
				332	orr r5, r5, r6, lsl #8
				333	mov r6, r6, lsr #24
				334	orr r6, r6, r7, lsl #8
				335	mov r7, r7, lsr #24
				336	orr r7, r7, r8, lsl #8
				337	mov r8, r8, lsr #24
				338	orr r8, r8, r9, lsl #8
				339	mov r9, r9, lsr #24
				340	orr r9, r9, r10, lsl #8
				341	mov r10, r10, lsr #24
				342	orr r10, r10, r11, lsl #8
				343	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
				344	mov r3, r11, lsr #24 /* keep the unwritten top byte of r11 for the next output word */
				345	bhs 1b /* flags are still those of the subs above; falls through when done */
				346
				347
				348	less_than_thirtytwo:
				349	/* copy the last 0 to 31 bytes of the source */
				350	rsb r12, lr, #32 /* we corrupted r12, recompute it */
				351	add r2, r2, #32 /* remove the -32 bias applied before the 32-byte loops */
				352	cmp r2, #4
				353	blo partial_word_tail
				354
				355	1: ldr r5, [r1], #4 /* word-at-a-time copy through the shift queue in r3 */
				356	sub r2, r2, #4
				357	orr r4, r3, r5, lsl lr /* output word = leftover bytes in r3 | low bytes of the new word */
				358	mov r3, r5, lsr r12 /* r3 = bytes of the new word not written yet */
				359	str r4, [r0], #4
				360	cmp r2, #4
				361	bhs 1b
				362
				363	partial_word_tail:
				364	/* we have a partial word in the input buffer (lr / 8 bytes held in r3) */
				365	movs r5, lr, lsl #(31-3) /* N = bit 3 of lr (1 byte), C = bit 4 of lr (2 bytes) */
				366	strmib r3, [r0], #1
				367	movmi r3, r3, lsr #8
				368	strcsb r3, [r0], #1
				369	movcs r3, r3, lsr #8
				370	strcsb r3, [r0], #1
				371
				372	/* Refill spilled registers from the stack. Don't update sp. */
				373	ldmfd sp, {r5-r11} /* sp still points at the 28-byte pad; it is released below */
				374
				375	copy_last_3_and_return:
				376	movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes: N = bit 0, C = bit 1 of the count */
				377	ldrmib r2, [r1], #1 /* r2 is reused as a data register from here on */
				378	ldrcsb r3, [r1], #1
				379	ldrcsb r12,[r1]
				380	strmib r2, [r0], #1
				381	strcsb r3, [r0], #1
				382	strcsb r12,[r0]
				383
				384	/* we're done! restore sp and spilled registers and return */
				385	add sp, sp, #28 /* drop the r5-r11 spill area; those registers are already valid here */
				386	ldmfd sp!, {r0, r4, lr} /* r0 = original dst, which is the return value */
				387	bx lr
Christopher Ferris	59a13c1	2013-08-01 13:13:33 -0700	[diff] [blame]	388
				389	// Only reached when the __memcpy_chk check fails.
				390	fortify_check_failed:
				391	ldr r0, error_message /* r0 = offset of error_string relative to (1b + 8) */
				392	ldr r1, error_code /* r1 = BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW */
				393	1:
				394	add r0, pc /* pc reads as this instruction + 8 in ARM state, so r0 = address of error_string */
				395	bl __fortify_chk_fail /* NOTE(review): presumably never returns; the words below are data, not code */
				396	error_code:
				397	.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
				398	error_message:
				399	.word error_string-(1b+8) /* PC-relative offset, so no absolute address is embedded here */
Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	400	END(memcpy)
Christopher Ferris	59a13c1	2013-08-01 13:13:33 -0700	[diff] [blame]	401
				402	.data /* NOTE(review): the string is only read in this file; a read-only section may be preferable - confirm */
				403	error_string: /* message whose address fortify_check_failed passes in r0 */
Elliott Hughes	68b6711	2013-10-15 17:17:05 -0700	[diff] [blame]	404	.string "memcpy: prevented write past end of buffer"