Blame - arch/sh/lib64/copy_user_memcpy.S - kernel/msm-4.9

blob: 49aeabeba2c2c3b48d14c89004b3dc0ec04b987d [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	!
				2	! Fast SH memcpy
				3	!
				4	! by Toshiyasu Morita (tm@netcom.com)
				5	! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
				6	! SH5 code Copyright 2002 SuperH Ltd.
				7	!
				8	! Entry: ARG0: destination pointer
				9	! ARG1: source pointer
				10	! ARG2: byte count
				11	!
				12	! Exit: RESULT: destination pointer
				13	! any other registers in the range r0-r7: trashed
				14	!
				15	! Notes: Usually one wants to do small reads and write a longword, but
				16	! unfortunately it is difficult in some cases to concatenate bytes
				17	! into a longword on the SH, so this does a longword read and small
				18	! writes.
				19	!
				20	! This implementation makes two assumptions about how it is called:
				21	!
				22	! 1.: If the byte count is nonzero, the address of the last byte to be
				23	! copied is unsigned greater than the address of the first byte to
				24	! be copied. This could be easily swapped for a signed comparison,
				25	! but the algorithm used needs some comparison.
				26	!
				27	! 2.: When there are two or three bytes in the last word of an 11-or-more
				28	! bytes memory chunk to be copied, the rest of the word can be read
				29	! without side effects.
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	30	! This could be easily changed by increasing the minimum size of
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	31	! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
				32	! however, this would cost a few extra cycles on average.
				33	! For SHmedia, the assumption is that any quadword can be read in its
				34	! entirety if at least one byte is included in the copy.
				35
				36	/* Imported into Linux kernel by Richard Curnow. This is used to implement the
				37	__copy_user function in the general case, so it has to be a distinct
				38	function from intra-kernel memcpy to allow for exception fix-ups in the
				39	event that the user pointer is bad somewhere in the copy (e.g. due to
				40	running off the end of the vma).
				41
				42	Note, this algorithm will be slightly wasteful in the case where the source
				43	and destination pointers are equally aligned, because the stlo/sthi pairs
				44	could then be merged back into single stores. If there are a lot of cache
				45	misses, this is probably offset by the stall lengths on the preloads.
				46
				47	*/
				48
				49	/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
				50	* erratum. The first two prefetches are nop-ed out to avoid upsetting the
				51	* instruction counts used in the jump address calculation.
				52	* */
				53
				54	.section .text..SHmedia32,"ax"
				55	.little
				56	.balign 32
				57	.global copy_user_memcpy
				58	.global copy_user_memcpy_end
				59	copy_user_memcpy:
				60
					/*
					 * Helper macros pairing a "lo" and a "hi" partial access so that
					 * together they cover the bytes P+O .. P+O+7 (quadword forms) or
					 * P+O .. P+O+3 (longword forms). The two halves of a load land in
					 * D0 and D1 and are merged by the caller with an "or".
					 * STUAQ and STUAL are defined here but not referenced in the
					 * visible body of this routine.
					 */
				61	#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
				62	#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
				63	#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
				64	#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
				65
					/*
					 * Size dispatch. Register use in this routine:
					 *   r2 = destination, r3 = source, r4 = byte count
					 *   (ARG0, ARG1 and ARG2 of the header comment).
					 * The nop below keeps the slot of a removed prefetch (see the
					 * TAKum03020 note above) so instruction counts stay unchanged.
					 */
				66	nop ! ld.b r3,0,r63 ! TAKum03020
				67	pta/l Large,tr0
				68	movi 25,r0
					/* Counts of 25 bytes or more are handled by the Large path. */
				69	bgeu/u r4,r0,tr0
					/*
					 * Computed jump for counts 0..24. r0 becomes
					 *   L1 - L0 + (63 - nsb(r4)) * 32 + 1
					 * and ptrel at L0 turns that into a target relative to L0, so the
					 * branch lands on L1 plus a multiple of 32 bytes. The handlers
					 * after L1 are laid out as consecutive 32-byte slots in the order
					 * 0, 1, 2..3, 4..7, 8..15, 16..24 bytes.
					 * NOTE(review): the "+ 1" is presumably the SHmedia mode bit of
					 * the branch target. Confirm against the SH-5 manual.
					 */
				70	nsb r4,r0
				71	shlli r0,5,r0
				72	movi (L1-L0+63*32 + 1) & 0xffff,r1
				73	sub r1, r0, r0
				74	L0: ptrel r0,tr0
					/* r5 = destination + count, r6 = source + count (one past the end). */
				75	add r2,r4,r5
					/*
					 * tr1 is the exit target used by every small-size handler.
					 * NOTE(review): r18 is presumably the caller's return address.
					 */
				76	ptabs r18,tr1
				77	add r3,r4,r6
				78	blink tr0,r63
				79
				80	/* Rearranged to make cut2 safe */
					/*
					 * Small-size handlers (0..24 bytes). On entry to each handler:
					 *   r2 = destination, r3 = source, r5 = destination + count,
					 *   r6 = source + count, tr1 = exit target.
					 * Starting at L1, every handler slot is exactly 8 instructions
					 * (32 bytes) long, because the dispatch code jumps to L1 plus a
					 * multiple of 32. Do not add or remove instructions between L1
					 * and Large without re-checking that layout.
					 * Handlers copy the head and the tail of the buffer separately.
					 * For counts that are not the maximum of the slot the two pieces
					 * overlap, which is harmless since both write the same data.
					 */
				81	.balign 8
				82	L4_7: /* 4..7 byte memcpy cntd. */
					/*
					 * r0 = first 4 source bytes, r6 and r7 = the two halves of the
					 * last 4 source bytes. Finish the head store, merge the tail and
					 * write it to the last 4 destination bytes.
					 */
				83	stlo.l r2, 0, r0
				84	or r6, r7, r6
				85	sthi.l r5, -1, r6
				86	stlo.l r5, -4, r6
				87	blink tr1,r63
				88
				89	.balign 8
				90	L1: /* 0 byte memcpy */
					/*
					 * Slot 0: nothing to copy, return at once. The trailing nops pad
					 * the slot so that L2_3 (2 instructions) completes the 32 bytes.
					 */
				91	nop
				92	blink tr1,r63
				93	nop
				94	nop
				95	nop
				96	nop
				97
				98	L2_3: /* 2 or 3 byte memcpy cntd. */
					/* r6 = last source byte, stored to the last destination byte. */
				99	st.b r5,-1,r6
				100	blink tr1,r63
				101
					/* Slot 1 starts here: 3 instructions, L8_15 fills the rest. */
				102	/* 1 byte memcpy */
				103	ld.b r3,0,r0
				104	st.b r2,0,r0
				105	blink tr1,r63
				106
				107	L8_15: /* 8..15 byte memcpy cntd. */
					/*
					 * r0 = first 8 source bytes, r6 and r7 = the two halves of the
					 * last 8 source bytes. Same scheme as L4_7 with quadwords.
					 */
				108	stlo.q r2, 0, r0
				109	or r6, r7, r6
				110	sthi.q r5, -1, r6
				111	stlo.q r5, -8, r6
				112	blink tr1,r63
				113
					/*
					 * Slot 2: copy byte 0, byte 1 and the last byte. For a count of 2
					 * the last byte is byte 1 and is simply written twice. The nop
					 * stands in for a removed prefetch and keeps the slot size.
					 */
				114	/* 2 or 3 byte memcpy */
				115	ld.b r3,0,r0
				116	nop ! ld.b r2,0,r63 ! TAKum03020
				117	ld.b r3,1,r1
				118	st.b r2,0,r0
				119	pta/l L2_3,tr0
				120	ld.b r6,-1,r6
				121	st.b r2,1,r1
				122	blink tr0, r63
				123
					/*
					 * Slot 3: load the first 4 and the last 4 source bytes, start the
					 * head store, then continue at L4_7.
					 */
				124	/* 4 .. 7 byte memcpy */
				125	LDUAL (r3, 0, r0, r1)
				126	pta L4_7, tr0
				127	ldlo.l r6, -4, r7
				128	or r0, r1, r0
				129	sthi.l r2, 3, r0
				130	ldhi.l r6, -1, r6
				131	blink tr0, r63
				132
					/*
					 * Slot 4: load the first 8 and the last 8 source bytes, start the
					 * head store, then continue at L8_15.
					 */
				133	/* 8 .. 15 byte memcpy */
				134	LDUAQ (r3, 0, r0, r1)
				135	pta L8_15, tr0
				136	ldlo.q r6, -8, r7
				137	or r0, r1, r0
				138	sthi.q r2, 7, r0
				139	ldhi.q r6, -1, r6
				140	blink tr0, r63
				141
					/*
					 * Slot 5: last slot, so it may run past 32 bytes. Copies source
					 * bytes 0..15 to the start of the destination and the last 8
					 * source bytes to the end of the destination, then returns.
					 */
				142	/* 16 .. 24 byte memcpy */
				143	LDUAQ (r3, 0, r0, r1)
				144	LDUAQ (r3, 8, r8, r9)
				145	or r0, r1, r0
				146	sthi.q r2, 7, r0
				147	or r8, r9, r8
				148	sthi.q r2, 15, r8
				149	ldlo.q r6, -8, r7
				150	ldhi.q r6, -1, r6
				151	stlo.q r2, 8, r8
				152	stlo.q r2, 0, r0
				153	or r6, r7, r6
				154	sthi.q r5, -1, r6
				155	stlo.q r5, -8, r6
				156	blink tr1,r63
				157
				158	Large:
					/*
					 * Bulk path for counts of 25 bytes or more.
					 * Register roles set up below:
					 *   r7  = r3 | -8, i.e. (r3 & 7) - 8, a value in -8..-1
					 *   r22 = r2 - r7, the destination address that corresponds to
					 *         the next 8-byte aligned source address
					 *   r6  = source - destination, so "ldx.q r22, r6" reads the
					 *         source quadword that belongs at destination r22
					 *   r5  = destination + count - 16
					 *   r0  = source quadword waiting to be stored
					 * Source reads in the loops are therefore 8-byte aligned and
					 * destination writes use sthi and stlo pairs.
					 */
				159	! ld.b r2, 0, r63 ! TAKum03020
				160	pta/l Loop_ua, tr1
				161	ori r3, -8, r7
				162	sub r2, r7, r22
				163	sub r3, r2, r6
				164	add r2, r4, r5
					/*
					 * Head: only the ldlo half is loaded, so r0 holds the source
					 * bytes up to the next aligned boundary. The store below writes a
					 * full 8 bytes at the destination. Bytes past the valid part are
					 * rewritten by the stores that start at r22.
					 */
				165	ldlo.q r3, 0, r0
				166	addi r5, -16, r5
				167	movi 64+8, r27 ! could subtract r7 from that.
				168	stlo.q r2, 0, r0
				169	sthi.q r2, 7, r0
				170	ldx.q r22, r6, r0
					/* Counts below 72 skip the 32-byte line loop. */
				171	bgtu/l r27, r4, tr1
				172
					/*
					 * Line loop setup: r27 = destination + count - 64 is the loop
					 * bound. r19, r20 and r21 are r6 - 24, r6 - 16 and r6 - 8 so one
					 * updated r22 can index four consecutive source quadwords.
					 * r36 is written here but not read in the visible code, its
					 * only user is the commented-out prefetch in Loop_line.
					 */
				173	addi r5, -48, r27
				174	pta/l Loop_line, tr0
				175	addi r6, 64, r36
				176	addi r6, -24, r19
				177	addi r6, -16, r20
				178	addi r6, -8, r21
				179
				180	Loop_line:
					/*
					 * Copies 32 bytes per iteration and repeats while r27 >= r22.
					 * alloco is followed by synco as described in the TAKum03020
					 * note near the top of the file. r0 carries the quadword loaded
					 * by the previous iteration (or by the setup code) and is
					 * reloaded for the next one.
					 */
				181	! ldx.q r22, r36, r63 ! TAKum03020
				182	alloco r22, 32
				183	synco
				184	addi r22, 32, r22
				185	ldx.q r22, r19, r23
				186	sthi.q r22, -25, r0
				187	ldx.q r22, r20, r24
				188	ldx.q r22, r21, r25
				189	stlo.q r22, -32, r0
				190	ldx.q r22, r6, r0
				191	sthi.q r22, -17, r23
				192	sthi.q r22, -9, r24
				193	sthi.q r22, -1, r25
				194	stlo.q r22, -24, r23
				195	stlo.q r22, -16, r24
				196	stlo.q r22, -8, r25
				197	bgeu r27, r22, tr0
				198
				199	Loop_ua:
					/*
					 * Copies 8 bytes per iteration and repeats while r5 > r22, with
					 * r5 = destination + count - 16. Stores the pending quadword in
					 * r0, then loads the next one.
					 */
				200	addi r22, 8, r22
				201	sthi.q r22, -1, r0
				202	stlo.q r22, -8, r0
				203	ldx.q r22, r6, r0
				204	bgtu/l r5, r22, tr1
				205
					/*
					 * Tail: store the pending quadword at r22, then copy the last 8
					 * source bytes (read through r7 = source + count) to the last 8
					 * destination bytes at r5 + 8 .. r5 + 15. This may overlap the
					 * previous store.
					 * NOTE(review): r18 is presumably the caller's return address.
					 */
				206	add r3, r4, r7
				207	ldlo.q r7, -8, r1
				208	sthi.q r22, 7, r0
				209	ldhi.q r7, -1, r7
				210	ptabs r18,tr1
				211	stlo.q r22, 0, r0
				212	or r1, r7, r1
				213	sthi.q r5, 15, r1
				214	stlo.q r5, 8, r1
				215	blink tr1, r63
					/*
					 * Exported end marker for the routine.
					 * NOTE(review): presumably used by the exception fix-up code to
					 * recognise faults inside copy_user_memcpy. Verify against the
					 * caller.
					 */
				216	copy_user_memcpy_end:
				217	nop