/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
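/*
 * Note: the ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 mnemonics
 * used below are not architectural instructions. This file is written as
 * an include template, and those names are assumed to be assembler macros
 * supplied by the including file, each wrapping the corresponding
 * post-indexed load/store. A minimal sketch of one such definition (an
 * assumption about the including file, not part of this template):
 *
 *	.macro ldrb1 reg, ptr, val
 *	ldrb	\reg, [\ptr], \val	// load byte, then advance ptr by val
 *	.endm
 */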
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14
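/*
 * A_l/A_h through D_l/D_h form four 16-byte register pairs, enough to
 * keep 64 bytes of data in flight in the main copy loop below.
 */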

	mov	dst, dstin	/* Preserve x0 as the return value; advance dst instead. */
	cmp	count, #16
	/* If the length is less than 16, copy with unaligned accesses. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading bytes from src to dst in increasing address
	 * order. Copying upwards eliminates the risk of overwriting the
	 * source data when the distance between src and dst is less than
	 * 16. Each access here is naturally aligned (1, 2, 4, then 8 bytes).
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1	/* 1 byte */
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2	/* 2 bytes */
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4	/* 4 bytes */
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8	/* 8 bytes */
	str1	tmp1, dst, #8

.LSrcAligned:
	/* src is now 16-byte aligned. */
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
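	/*
	 * tmp1 is 0x30, 0x20 or 0x10 here: fall through all three ldp/stp
	 * pairs for 48 bytes, enter at 1: for 32, or at 2: for 16.
	 */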
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores so that
	 * memory is accessed in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) after winding
	 * src back to an aligned address, as the original cortex-strings
	 * memcpy does. Keeping that scheme here would force memmove to
	 * guarantee that src is at least 16 bytes above dst before calling
	 * memcpy directly, or some source data would be overwritten. To
	 * keep memmove simple and decouple it from memcpy, the original
	 * scheme was dropped.
	 */
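	/* Bits 3..0 of count select the remaining 8/4/2/1-byte copies. */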
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail. Subtracting 128 leaves the low six bits of
	 * count unchanged, which is all .Ltail63 needs.
	 */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
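	/*
	 * L1_CACHE_SHIFT is assumed to be provided by the kernel's cache
	 * definitions (e.g. asm/cache.h) via the including file.
	 */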
	.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
	/*
	 * Preload the first 64 bytes. count is already biased down by 128,
	 * so the loop below exits while these four register pairs still
	 * hold the final 64 bytes to be stored.
	 */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave the loads of the next 64-byte block with the stores
	 * of the previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
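	/* Store the final 64 bytes loaded by the last loop iteration. */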
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc: