blob: 6ba4931f9d5088d728638eb0e52859581dea33ad [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
55
// MEMCPY_BASE: core of memcpy for unknown src/dst alignment.
// In:   r0 = dst, r1 = src, r2 = n (bytes, >= 0); dst, src valid pointers.
// Pre:  the caller (memcpy entry) has already pushed {r0, lr}, so this code
//       returns the original dst and returns to the caller via "pop {r0, pc}".
// Uses: r3, ip, lr as scratch; NEON d0-d7 for the bulk copy.
ENTRY_PRIVATE(MEMCPY_BASE)
        .save {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        // TODO: The aligned copy code is extremely slow copying some large
        //       buffers so always go through the unaligned path for now.
        //cmp     r2, #832
        //bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0              // r3 = -dst ...
        ands    r3, r3, #0xF            // ... & 0xF = bytes needed to 16-align dst
        beq     2f                      // already 128-bit aligned

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // N flag = bit 0 of r3, C flag = bit 1

        itt     mi                      // bit 0 set: copy 1 byte
        ldrbmi  lr, [r1], #1
        strbmi  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29         // N flag = bit 2 of r3, C flag = bit 3
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(64*4)]           // prefetch 4 iterations ahead
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32             // undo the bias; r2 = bytes remaining (0..31)
        tst     r2, #0x10
        beq     .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // C flag = bit 3 of r2, N flag = bit 2
        bcc     1f
        vld1.8  {d0}, [r1]!             // copy 8 bytes
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!     // copy 4 bytes
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // N flag = bit 0 of r2, C flag = bit 1
        itt     ne                      // bit 0 set: copy 1 byte
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        pop     {r0, pc}                // return original dst (pushed by caller)

.L_check_alignment:
        // NOTE(review): currently unreachable — the only branch here is the
        // commented-out "bge .L_check_alignment" above. If re-enabled, this
        // falls through to MEMCPY_BASE_ALIGNED when both pointers can be
        // word aligned.
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment
END(MEMCPY_BASE)
173
// MEMCPY_BASE_ALIGNED: copy for word-aligned (or word-alignable) src/dst
// using LDRD/STRD pairs instead of NEON.
// In:   r0 = dst, r1 = src, r2 = n (bytes); reached (when enabled) only for
//       copies where src and dst can both be word aligned.
// Pre:  the caller has already pushed {r0, lr}; returns via "pop {r0, pc}".
// Uses: r3, ip, lr as scratch; saves/restores r4-r9 for the 64-byte loop.
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .save {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        strd    r4, r5, [sp, #-8]!
        .save {r4, r5}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        strd    r6, r7, [sp, #-8]!
        .save {r6, r7}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4           // FIX: was "r7, 0" — r7 is stored at sp+4
        strd    r8, r9, [sp, #-8]!
        .save {r8, r9}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r8, 0
        .cfi_rel_offset r9, 4

        // Optimized for already aligned dst code.
        ands    ip, r0, #3              // ip = dst & 3 (bytes past word boundary)
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4            // copy one word to reach 8-byte alignment
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32             // undo the bias; r2 = bytes remaining (0..31)
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // C flag = bit 3 of r2, N flag = bit 2
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // N flag = bit 0 of r2, C flag = bit 1
        itt     ne                      // bit 0 set: copy 1 byte
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}                // return original dst (pushed by caller)

.L_dst_not_word_aligned:
        // Align dst to word.
        rsb     ip, ip, #4              // ip = 1..3 bytes needed to word-align dst
        cmp     ip, #2

        itt     gt                      // ip == 3: copy a 3rd byte
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge                      // ip >= 2: copy a 2nd byte
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1            // always copy at least 1 byte
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        // NOTE(review): src word alignment here relies on the (currently
        // disabled) .L_check_alignment gate, which only enters this path
        // when (dst ^ src) & 3 == 0.
        b       .L_word_aligned
END(MEMCPY_BASE_ALIGNED)