Blame - libc/arch-arm/cortex-a15/bionic/memcpy_base.S - fp2-dev/platform/bionic

blob: 647e0653fb5a93f040ee1dddf75b259f15b2d386 [file] [log] [blame]

Christopher Ferris	5f45d58	2013-08-07 13:09:51 -0700	[diff] [blame^]	1	/*
				2	* Copyright (C) 2008 The Android Open Source Project
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in
				12	* the documentation and/or other materials provided with the
				13	* distribution.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				16	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				17	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				18	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				19	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				20	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				21	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				22	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				23	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				24	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				25	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				26	* SUCH DAMAGE.
				27	*/
				28	/*
				29	* Copyright (c) 2013 ARM Ltd
				30	* All rights reserved.
				31	*
				32	* Redistribution and use in source and binary forms, with or without
				33	* modification, are permitted provided that the following conditions
				34	* are met:
				35	* 1. Redistributions of source code must retain the above copyright
				36	* notice, this list of conditions and the following disclaimer.
				37	* 2. Redistributions in binary form must reproduce the above copyright
				38	* notice, this list of conditions and the following disclaimer in the
				39	* documentation and/or other materials provided with the distribution.
				40	* 3. The name of the company may not be used to endorse or promote
				41	* products derived from this software without specific prior written
				42	* permission.
				43	*
				44	* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
				45	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
				46	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
				47	* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				48	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
				49	* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
				50	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
				51	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
				52	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				53	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				54	*/
				55
				56	// Assumes that n >= 0, and dst (r0), src (r1) are valid pointers; n is in r2.
				57	// For any sizes less than 832 use the neon code that doesn't
				58	// care about the src alignment. This avoids any checks
				59	// for src alignment, and offers the best improvement since
				60	// smaller sized copies are dominated by the overhead of
				61	// the pre and post main loop.
				62	// For larger copies, if src and dst cannot both be aligned to
				63	// word boundaries, use the neon code.
				64	// For all other copies, align dst to a double word boundary
				65	// and copy using LDRD/STRD instructions.
				66
				67	cmp r2, #16  // fewer than 16 bytes: only the tail copy is needed
				68	blo .L_copy_less_than_16_unknown_align  // unsigned compare
				69
				70	cmp r2, #832  // threshold between the neon path and the LDRD/STRD path
				71	bge .L_check_alignment  // signed compare, consistent with the n >= 0 assumption
				72
				73	.L_copy_unknown_alignment:
				74	// Unknown alignment of src and dst.
				75	// Assumes that the first few bytes have already been prefetched.
				76
				77	// Align destination to 128 bits. The mainloop store instructions
				78	// require this alignment or they will throw an exception.
				79	rsb r3, r0, #0  // r3 = -dst
				80	ands r3, r3, #0xF  // r3 = bytes needed to reach the next 16 byte boundary (0..15)
				81	beq 2f  // dst is already 16 byte aligned
				82
				83	// Copy up to 15 bytes (count in r3).
				84	sub r2, r2, r3  // take the alignment bytes out of the remaining count
				85	movs ip, r3, lsl #31  // N = bit 0 of r3, C = bit 1 of r3
				86
				87	itt mi  // bit 0 set: copy 1 byte
				88	ldrbmi lr, [r1], #1
				89	strbmi lr, [r0], #1
				90	itttt cs  // bit 1 set: copy 2 bytes
				91	ldrbcs ip, [r1], #1
				92	ldrbcs lr, [r1], #1
				93	strbcs ip, [r0], #1
				94	strbcs lr, [r0], #1
				95
				96	movs ip, r3, lsl #29  // N = bit 2 of r3, C = bit 3 of r3
				97	bge 1f  // bit 2 clear: skip the 4 byte copy (bge tests N == V, so this presumably relies on V being clear here)
				98	// Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
				99	vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
				100	vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
				101	1: bcc 2f  // bit 3 clear: skip the 8 byte copy (C is still the one set by the movs above)
				102	// Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
				103	vld1.8 {d0}, [r1]!
				104	vst1.8 {d0}, [r0, :64]!
				105
				106	2: // Make sure we have at least 64 bytes to copy.
				107	subs r2, r2, #64  // from here r2 = remaining count - 64
				108	blo 2f  // fewer than 64 bytes left: skip the main loop
				109
				110	1: // The main loop copies 64 bytes at a time.
				111	vld1.8 {d0 - d3}, [r1]!
				112	vld1.8 {d4 - d7}, [r1]!
				113	pld [r1, #(64*4)]  // prefetch 256 bytes beyond the already advanced src pointer
				114	subs r2, r2, #64  // sets the flags tested by the bhs below
				115	vst1.8 {d0 - d3}, [r0, :128]!
				116	vst1.8 {d4 - d7}, [r0, :128]!
				117	bhs 1b  // loop while at least 64 more bytes remain
				118
				119	2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
				120	adds r2, r2, #32  // r2 = remaining count - 32; carry is set when remaining >= 32
				121	blo 3f
				122
				123	// 32 bytes. These cache lines were already preloaded.
				124	vld1.8 {d0 - d3}, [r1]!
				125	sub r2, r2, #32  // compensates for the add at label 3 below
				126	vst1.8 {d0 - d3}, [r0, :128]!
				127	3: // Less than 32 left.
				128	add r2, r2, #32  // r2 = true remaining count (0..31)
				129	tst r2, #0x10
				130	beq .L_copy_less_than_16_unknown_align  // bit 4 clear: fewer than 16 bytes left
				131	// Copies 16 bytes, destination 128 bits aligned.
				132	vld1.8 {d0, d1}, [r1]!
				133	vst1.8 {d0, d1}, [r0, :128]!  // execution then falls through to the tail copy that follows
				134
				135	.L_copy_less_than_16_unknown_align:
				136	// Copy up to 15 bytes (count in r2). Only the low four bits of r2 are examined.
				137	movs ip, r2, lsl #29  // C = bit 3 of r2, N = bit 2 of r2
				138	bcc 1f  // bit 3 clear: no 8 byte chunk
				139	vld1.8 {d0}, [r1]!
				140	vst1.8 {d0}, [r0]!
				141	1: bge 2f  // bit 2 clear: no 4 byte chunk (bge tests N == V, so this presumably relies on V being clear here)
				142	vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
				143	vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
				144
				145	2: // Copy 0 to 3 bytes (bit 0 of r2 selects 1 byte, bit 1 selects 2 bytes).
				146	lsls r2, r2, #31  // Z is clear when bit 0 is set, C = bit 1
				147	itt ne
				148	ldrbne lr, [r1], #1
				149	strbne lr, [r0], #1
				150	itttt cs
				151	ldrbcs ip, [r1], #1
				152	ldrbcs lr, [r1]  // last byte: no pointer update
				153	strbcs ip, [r0], #1
				154	strbcs lr, [r0]  // last byte: no pointer update
				155
				156	pop {r0, pc}  // return; presumably the including code pushed r0 and lr on entry - TODO confirm
				157
				158	.L_check_alignment:
				159	// If src and dst cannot both be aligned to a word boundary,
				160	// use the unaligned copy version.
				161	eor r3, r0, r1
				162	ands r3, r3, #0x3  // nonzero when dst and src differ in their low two bits
				163	bne .L_copy_unknown_alignment
				164
				165	// To try and improve performance, stack layout changed,
				166	// i.e., not keeping the stack looking like users expect
				167	// (highest numbered register at highest address).
				168	// TODO: Add debug frame directives.
				169	// We don't need exception unwind directives, because the code below
				170	// does not throw any exceptions and does not call any other functions.
				171	// Generally, newlib functions like this lack debug information for
				172	// assembler source.
				173	.save {r4, r5}
				174	strd r4, r5, [sp, #-8]!  // push r4 and r5 with a single store and writeback
				175	.save {r6, r7}
				176	strd r6, r7, [sp, #-8]!
				177	.save {r8, r9}
				178	strd r8, r9, [sp, #-8]!
				179
				180	// Optimized for already aligned dst code.
				181	ands ip, r0, #3  // ip = dst & 3; .L_dst_not_word_aligned reads this value
				182	bne .L_dst_not_word_aligned
				183
				184	.L_word_aligned:
				185	// Align the destination buffer to 8 bytes, to make sure double
				186	// loads and stores don't cross a cache line boundary,
				187	// as they are then more expensive even if the data is in the cache
				188	// (require two load/store issue cycles instead of one).
				189	// If only one of the buffers is not 8 bytes aligned,
				190	// then it's more important to align dst than src,
				191	// because there is more penalty for stores
				192	// than loads that cross a cacheline boundary.
				193	// This check and realignment are only done if there is >= 832
				194	// bytes to copy.
				195
				196	// Dst is word aligned, but check if it is already double word aligned.
				197	ands r3, r0, #4
				198	beq 1f  // bit 2 of dst clear: already 8 byte aligned
				199	ldr r3, [r1], #4  // copy one word to bring dst to an 8 byte boundary
				200	str r3, [r0], #4
				201	sub r2, #4
				202
				203	1: // Can only get here if > 64 bytes to copy, so don't check r2.
				204	sub r2, #64  // from here r2 = remaining count - 64
				205
				206	2: // Every loop iteration copies 64 bytes.
				207	.irp offset, #0, #8, #16, #24, #32
				208	ldrd r4, r5, [r1, \offset]
				209	strd r4, r5, [r0, \offset]
				210	.endr
				211
				212	ldrd r4, r5, [r1, #40]
				213	ldrd r6, r7, [r1, #48]
				214	ldrd r8, r9, [r1, #56]
				215
				216	// Keep the pld as far from the next load as possible.
				217	// The amount to prefetch was determined experimentally using
				218	// large sizes, and verifying the prefetch size does not affect
				219	// the smaller copies too much.
				220	// WARNING: If the ldrd and strd instructions get too far away
				221	// from each other, performance suffers. Three loads
				222	// in a row is the best tradeoff.
				223	pld [r1, #(64*16)]
				224	strd r4, r5, [r0, #40]
				225	strd r6, r7, [r0, #48]
				226	strd r8, r9, [r0, #56]
				227
				228	add r0, r0, #64  // pointers are advanced once per iteration, not per access
				229	add r1, r1, #64
				230	subs r2, r2, #64
				231	bge 2b  // signed compare: loop while at least 64 more bytes remain
				232
				233	// Fix-up the remaining count and make sure we have >= 32 bytes left.
				234	adds r2, r2, #32  // r2 = remaining count - 32; carry is set when remaining >= 32
				235	blo 4f
				236
				237	// Copy 32 bytes. These cache lines were already preloaded.
				238	.irp offset, #0, #8, #16, #24
				239	ldrd r4, r5, [r1, \offset]
				240	strd r4, r5, [r0, \offset]
				241	.endr
				242	add r1, r1, #32
				243	add r0, r0, #32
				244	sub r2, r2, #32  // compensates for the add at label 4 below
				245	4: // Less than 32 left.
				246	add r2, r2, #32  // r2 = true remaining count (0..31)
				247	tst r2, #0x10
				248	beq 5f  // bit 4 clear: fewer than 16 bytes left
				249	// Copy 16 bytes.
				250	.irp offset, #0, #8
				251	ldrd r4, r5, [r1, \offset]
				252	strd r4, r5, [r0, \offset]
				253	.endr
				254	add r1, r1, #16
				255	add r0, r0, #16
				256
				257	5: // Copy up to 15 bytes (count in r2).
				258	movs ip, r2, lsl #29  // C = bit 3 of r2, N = bit 2 of r2
				259	bcc 1f
				260	// Copy 8 bytes.
				261	ldrd r4, r5, [r1], #8
				262	strd r4, r5, [r0], #8
				263	1: bge 2f  // bit 2 clear: no 4 byte chunk (bge tests N == V, so this presumably relies on V being clear here)
				264	// Copy 4 bytes.
				265	ldr r4, [r1], #4
				266	str r4, [r0], #4
				267	2: // Copy 0 to 3 bytes (bit 0 of r2 selects 1 byte, bit 1 selects 2 bytes).
				268	lsls r2, r2, #31  // Z is clear when bit 0 is set, C = bit 1
				269	itt ne
				270	ldrbne lr, [r1], #1
				271	strbne lr, [r0], #1
				272	itttt cs
				273	ldrbcs ip, [r1], #1
				274	ldrbcs lr, [r1]  // last byte: no pointer update
				275	strbcs ip, [r0], #1
				276	strbcs lr, [r0]  // last byte: no pointer update
				277
				278	// Restore registers: optimized pop {r0, pc}
				279	ldrd r8, r9, [sp], #8  // restores run in the reverse order of the saves at .L_check_alignment
				280	ldrd r6, r7, [sp], #8
				281	ldrd r4, r5, [sp], #8
				282	pop {r0, pc}  // return; presumably the including code pushed r0 and lr on entry - TODO confirm
				283
				284	.L_dst_not_word_aligned:
				285	// Align dst to word. On entry ip = dst & 3, which is nonzero on this path.
				286	rsb ip, ip, #4  // ip = 4 - (dst & 3) = bytes needed to reach a word boundary (1..3)
				287	cmp ip, #2
				288
				289	itt gt  // ip is 3: copy the first of three bytes
				290	ldrbgt lr, [r1], #1
				291	strbgt lr, [r0], #1
				292
				293	itt ge  // ip is 2 or 3: copy one more byte
				294	ldrbge lr, [r1], #1
				295	strbge lr, [r0], #1
				296
				297	ldrb lr, [r1], #1  // at least one byte is always copied on this path
				298	strb lr, [r0], #1
				299
				300	sub r2, r2, ip  // take the alignment bytes out of the remaining count
				301
				302	// Src is guaranteed to be at least word aligned by this point
				303	// (src and dst share their low two bits, as tested at .L_check_alignment).
				303	b .L_word_aligned