Blame - arch/arm64/lib/memcpy.S - kernel/msm-4.9

blob: 8a9a96d3ddae04331828c9744b4d94368ef70620 [file] [log] [blame]

Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 ARM Ltd.
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	3	* Copyright (C) 2013 Linaro.
				4	*
				5	* This code is based on glibc cortex strings work originally authored by Linaro
				6	* and re-licensed under GPLv2 for the Linux kernel. The original code can
				7	* be found @
				8	*
				9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
				10	* files/head:/src/aarch64/
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	11	*
				12	* This program is free software; you can redistribute it and/or modify
				13	* it under the terms of the GNU General Public License version 2 as
				14	* published by the Free Software Foundation.
				15	*
				16	* This program is distributed in the hope that it will be useful,
				17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				19	* GNU General Public License for more details.
				20	*
				21	* You should have received a copy of the GNU General Public License
				22	* along with this program. If not, see <http://www.gnu.org/licenses/>.
				23	*/
				24
				25	#include <linux/linkage.h>
				26	#include <asm/assembler.h>
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	27	#include <asm/cache.h>
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	28
				29	/*
				30	* Copy a buffer from src to dest (for n >= 16 src is first advanced to a 16-byte boundary; any remaining unaligned accesses are handled by the hardware)
				31	*
				32	* Parameters:
				33	* x0 - dest
				34	* x1 - src
				35	* x2 - n
				36	* Returns:
				37	* x0 - dest
				38	*/
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	39	dstin .req x0 /* dest as passed in; never written, so x0 is still dest on return */
				40	src .req x1 /* source pointer, post-incremented as data is loaded */
				41	count .req x2 /* bytes still to copy (biased by -128 on the >= 64 byte path) */
				42	tmp1 .req x3 /* scratch: data for the sub-16-byte copies and the tail-size mask */
				43	tmp1w .req w3 /* 32-bit view of tmp1 */
				44	tmp2 .req x4 /* scratch: number of bytes needed to bring src to 16-byte alignment */
				45	tmp2w .req w4 /* 32-bit view of tmp2; not referenced by memcpy in this file */
				46	tmp3 .req x5 /* scratch; not referenced by memcpy in this file */
				47	tmp3w .req w5 /* 32-bit view of tmp3; not referenced by memcpy in this file */
				48	dst .req x6 /* working destination pointer, post-incremented as data is stored */
				49
				50	A_l .req x7 /* A..D: four register pairs, 16 bytes each, together one 64-byte block */
				51	A_h .req x8
				52	B_l .req x9
				53	B_h .req x10
				54	C_l .req x11
				55	C_h .req x12
				56	D_l .req x13
				57	D_h .req x14
				58
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	59	ENTRY(memcpy)
zhichang.yuan	808dbac	2014-04-28 06:11:29 +0100	[diff] [blame]	60	mov dst, dstin /* copy through dst so that x0 still holds dest on return */
				61	cmp count, #16
				62	/* When the length is less than 16 bytes, src is not aligned first. */
				63	b.lo .Ltiny15
				64
				65	neg tmp2, src
				66	ands tmp2, tmp2, #15 /* Bytes to reach 16-byte alignment of src. */
				67	b.eq .LSrcAligned
				68	sub count, count, tmp2
				69	/*
				70	* Copy the leading bytes from src to dst in increasing address
				71	* order. This removes the risk of overwriting source data that has
				72	* not been read yet when the distance between src and dst is less
				73	* than 16. Each load from src here is naturally aligned.
				74	*/
				75	tbz tmp2, #0, 1f /* each set bit of tmp2 selects one copy of that size */
				76	ldrb tmp1w, [src], #1
				77	strb tmp1w, [dst], #1
				78	1:
				79	tbz tmp2, #1, 2f
				80	ldrh tmp1w, [src], #2
				81	strh tmp1w, [dst], #2
				82	2:
				83	tbz tmp2, #2, 3f
				84	ldr tmp1w, [src], #4
				85	str tmp1w, [dst], #4
				86	3:
				87	tbz tmp2, #3, .LSrcAligned
				88	ldr tmp1, [src],#8
				89	str tmp1, [dst],#8
				90
				91	.LSrcAligned:
				92	cmp count, #64
				93	b.ge .Lcpy_over64
				94	/*
				95	* Deal with small copies quickly by dropping straight into the
				96	* exit block.
				97	*/
				98	.Ltail63:
				99	/*
				100	* Copy up to 48 bytes of data. At this point we only need the
				101	* bottom 6 bits of count to be accurate.
				102	*/
				103	ands tmp1, count, #0x30
				104	b.eq .Ltiny15
				105	cmp tmp1w, #0x20
				106	b.eq 1f /* 0x20: two 16-byte copies */
				107	b.lt 2f /* 0x10: one 16-byte copy */
				108	ldp A_l, A_h, [src], #16 /* 0x30: three 16-byte copies, falling through */
				109	stp A_l, A_h, [dst], #16
				110	1:
				111	ldp A_l, A_h, [src], #16
				112	stp A_l, A_h, [dst], #16
				113	2:
				114	ldp A_l, A_h, [src], #16
				115	stp A_l, A_h, [dst], #16
				116	.Ltiny15:
				117	/*
				118	* Break one ldp/stp into several smaller loads/stores so that memory
				119	* is accessed in increasing address order, rather than loading/storing
				120	* 16 bytes from (src-16) to (dst-16) after stepping src back to an
				121	* aligned address, as the original cortex-strings memcpy does. Had that
				122	* approach been kept, memmove would have to guarantee that src is at
				123	* least 16 bytes above dst before calling memcpy directly, otherwise
				124	* some source data would be overwritten before being read. To keep
				125	* memmove simple and to decouple memcpy from memmove, the original
				126	* approach was dropped.
				127	*/
				128	tbz count, #3, 1f /* only bits 3..0 of count are examined from here on */
				129	ldr tmp1, [src], #8
				130	str tmp1, [dst], #8
				131	1:
				132	tbz count, #2, 2f
				133	ldr tmp1w, [src], #4
				134	str tmp1w, [dst], #4
				135	2:
				136	tbz count, #1, 3f
				137	ldrh tmp1w, [src], #2
				138	strh tmp1w, [dst], #2
				139	3:
				140	tbz count, #0, .Lexitfunc
				141	ldrb tmp1w, [src]
				142	strb tmp1w, [dst]
				143
				144	.Lexitfunc:
				145	ret
				146
				147	.Lcpy_over64:
				148	subs count, count, #128 /* count is now biased by -128; its low 6 bits are unchanged */
				149	b.ge .Lcpy_body_large
				150	/*
				151	* Less than 128 bytes to copy, so handle 64 here and then jump
				152	* to the tail.
				153	*/
				154	ldp A_l, A_h, [src],#16
				155	stp A_l, A_h, [dst],#16
				156	ldp B_l, B_h, [src],#16
				157	ldp C_l, C_h, [src],#16
				158	stp B_l, B_h, [dst],#16
				159	stp C_l, C_h, [dst],#16
				160	ldp D_l, D_h, [src],#16
				161	stp D_l, D_h, [dst],#16
				162
				163	tst count, #0x3f /* any bytes left beyond the 64 just copied? */
				164	b.ne .Ltail63
				165	ret
				166
				167	/*
				168	* Critical loop. Start at a new cache line boundary. Assuming
				169	* 64 bytes per line this ensures the entire loop is in one line.
				170	*/
				171	.p2align L1_CACHE_SHIFT
				172	.Lcpy_body_large:
				173	/* Preload the first 64 bytes. */
				174	ldp A_l, A_h, [src],#16
				175	ldp B_l, B_h, [src],#16
				176	ldp C_l, C_h, [src],#16
				177	ldp D_l, D_h, [src],#16
				178	1:
				179	/*
				180	* Interleave the loads of the next 64-byte block with the stores of
				181	* the previously loaded 64 bytes.
				182	*/
				183	stp A_l, A_h, [dst],#16
				184	ldp A_l, A_h, [src],#16
				185	stp B_l, B_h, [dst],#16
				186	ldp B_l, B_h, [src],#16
				187	stp C_l, C_h, [dst],#16
				188	ldp C_l, C_h, [src],#16
				189	stp D_l, D_h, [dst],#16
				190	ldp D_l, D_h, [src],#16
				191	subs count, count, #64
				192	b.ge 1b
				193	stp A_l, A_h, [dst],#16 /* drain the last 64 bytes loaded by the loop */
				194	stp B_l, B_h, [dst],#16
				195	stp C_l, C_h, [dst],#16
				196	stp D_l, D_h, [dst],#16
				197
				198	tst count, #0x3f /* low 6 bits of count give the remaining tail length */
				199	b.ne .Ltail63
				200	ret
Catalin Marinas	4a89922	2013-03-21 16:16:43 +0000	[diff] [blame]	201	ENDPROC(memcpy)