/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
//-----------------------------------------------------------------------
// MEMCPY_BASE: NEON memcpy body, included by the per-CPU memcpy wrapper.
// In:    r0 = dst, r1 = src, r2 = n (bytes, assumed >= 0)
// Stack: caller has pushed {r0, lr} (CFA offset 8; original dst at sp+0,
//        return address at sp+4) — returned via the final pop {r0, pc},
//        which also restores the original dst as memcpy's return value.
// Clobbers: r3, ip, lr, d0-d7, flags.
//-----------------------------------------------------------------------
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        // TODO: The aligned copy code is extremely slow copying some large
        //       buffers so always go through the unaligned path for now.
        //cmp     r2, #832
        //bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0              // r3 = -dst ...
        ands    r3, r3, #0xF            // ... & 0xF = bytes to reach 16-byte alignment
        beq     2f

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // N = bit 0 of r3, C = bit 1 of r3

        itt     mi
        ldrbmi  lr, [r1], #1            // 1 byte if bit 0 set
        strbmi  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1            // 2 bytes if bit 1 set
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29         // N = bit 2 of r3, C = bit 3 of r3
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(64*4)]           // prefetch 4 iterations ahead
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        vld1.8  {d0}, [r1]!             // 8 bytes if bit 3 set
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!     // 4 bytes if bit 2 set
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // Z/N = bit 0 of r2, C = bit 1 of r2
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        pop     {r0, pc}                // restore original dst, return to caller

.L_check_alignment:
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment
END(MEMCPY_BASE)
172
//-----------------------------------------------------------------------
// MEMCPY_BASE_ALIGNED: LDRD/STRD copy path for large, mutually word-
// alignable src/dst (currently unreachable: MEMCPY_BASE always takes the
// NEON path — see the TODO there).
// In:    r0 = dst (word-alignable with src), r1 = src, r2 = n
// Stack: caller has pushed {r0, lr} (CFA offset 8); this function pushes
//        r4-r9 in three strd pairs and restores them before returning.
// Clobbers: r3, ip, lr, flags.
//-----------------------------------------------------------------------
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        strd    r4, r5, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        strd    r6, r7, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4           // FIX: was 0; strd stores r7 at sp+4 (r6 is at sp+0)
        strd    r8, r9, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r8, 0
        .cfi_rel_offset r9, 4

        // Optimized for already aligned dst code.
        ands    ip, r0, #3
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // Z/N = bit 0 of r2, C = bit 1 of r2
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}                // restore original dst, return to caller

.L_dst_not_word_aligned:
        // Align dst to word (ip = dst & 3 from the entry check, so
        // 4 - ip = 1..3 bytes to copy; the tail ldrb/strb always runs).
        rsb     ip, ip, #4
        cmp     ip, #2

        itt     gt
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       .L_word_aligned
END(MEMCPY_BASE_ALIGNED)