/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin x0
#define src x1
#define count x2
#define tmp1 x3
#define tmp1w w3
#define tmp2 x4
#define tmp2w w4
#define tmp3 x5
#define tmp3w w5
#define dst x6

#define A_l x7
#define A_h x8
#define B_l x9
#define B_h x10
#define C_l x11
#define C_h x12
#define D_l x13
#define D_h x14

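/* 128-bit SIMD registers used for the 64-bytes-per-iteration bulk copy
 * loops below. */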
#define QA_l q0
#define QA_h q1
#define QB_l q2
#define QB_h q3

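        /* Dispatch on size: copies of 64 bytes or more take the
         * .Lcpy_not_short path, 16-63 bytes fall through to .Ltail63,
         * and 0-15 bytes are handled in .Ltail15tiny. */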
        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny

        /* Deal with small copies quickly by dropping straight into the
         * exit block. */
.Ltail63:
        /* Copy up to 48 bytes of data. At this point we only need the
         * bottom 6 bits of count to be accurate. */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
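        /* src and dst now point past the 16, 32 or 48 bytes selected by
         * tmp1, so the copies below use negative offsets and the larger
         * cases simply fall through to the smaller ones. */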
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]

.Ltail15:
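        /* Copy the trailing 1-15 bytes with a single 16-byte load/store
         * pair that overlaps data copied above; every path into this tail
         * has already copied at least 16 bytes, so the overlap is safe. */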
        ands    count, count, #15
        b.eq    1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret

.Ltail15tiny:
        /* Copy up to 15 bytes of data. Does not assume additional data
         * being copied. */
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores. */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment. */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities. We know that
         * it can't overrun. */
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now. */
        cmp     count, #63
        b.le    .Ltail63
2:
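        /* Here src is 16-byte aligned and at least 64 bytes remain. */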
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 here and then jump
         * to the tail. */
        ldp     QA_l, QA_h, [src]
        ldp     QB_l, QB_h, [src, #32]
        stp     QA_l, QA_h, [dst]
        stp     QB_l, QB_h, [dst, #32]
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret

        /* Critical loop. Start at a new cache line boundary. Assuming
         * 64 bytes per line this ensures the entire loop is in one line. */
        .p2align 6
.Lcpy_body_large:
        cmp     count, 65536
        bhi     .Lcpy_body_huge
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]! /* src += 64 - Pre-bias. */
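        /* Software-pipelined: each iteration stores the 64 bytes loaded
         * on the previous iteration while loading the next 64 bytes, so
         * the load and store streams overlap. */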
1:
        stp     QA_l, QA_h, [dst, #32]
        ldp     QA_l, QA_h, [src, #32]
        stp     QB_l, QB_h, [dst, #64]!
        ldp     QB_l, QB_h, [src, #64]!

        subs    count, count, #64
        b.ge    1b

        stp     QA_l, QA_h, [dst, #32]
        stp     QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
.Lcpy_body_huge:
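        /* Copies of more than roughly 64KiB: same pipelined structure as
         * above, but use non-temporal stores (stnp) to hint that the
         * destination data need not be kept in the caches. */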
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]!
1:
        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        ldp     QA_l, QA_h, [src, #32]
        ldp     QB_l, QB_h, [src, #64]!
        add     dst, dst, #64

        subs    count, count, #64
        b.ge    1b

        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret