Blame - arch/sh64/lib/page_copy.S - kernel/msm

blob: e159c3cd2582ec2389affe1a3bfeabe1ee5074a7 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
				3
				4	This file is subject to the terms and conditions of the GNU General Public
				5	License. See the file "COPYING" in the main directory of this archive
				6	for more details.
				7
				8	Tight version of memcpy for the case of just copying a page.
				9	Prefetch strategy empirically optimised against RTL simulations
				10	of SH5-101 cut2 eval chip with Cayman board DDR memory.
				11
				12	Parameters:
				13	r2 : source effective address (start of page)
				14	r3 : destination effective address (start of page)
				15
				16	Always copies 4096 bytes.
				17
				18	Points to review.
				19	* Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
				20	It seems like the prefetch needs to be at least 4 lines ahead to get
				21	the data into the cache in time, and the allocos contend with outstanding
				22	prefetches for the same cache set, so it's better to have the numbers
				23	different.
				24	*/
				25
				26	.section .text..SHmedia32,"ax"
				27	.little
				28
				29	.balign 8
				30	.global sh64_page_copy
				31	sh64_page_copy:
				32
				33	/* Copy 4096 bytes worth of data from r2 to r3, 32 bytes per loop iteration.
				34	Prefetches 4 lines ahead are currently compiled out (#if 0, TAKum03020).
				35	Do alloco 2 lines ahead. r3 is advanced in place; r2 is not modified. */
				36
				37	pta 1f, tr1 /* tr1 = loop head (label 1) */
				38	pta 2f, tr2 /* tr2 = label 2; only the disabled prefetch guard branches to it */
				39	pta 3f, tr3 /* tr3 = label 3; taken when the alloco is skipped */
				40	ptabs r18, tr0 /* tr0 = return target from r18, used by the final blink */
				41
				42	#if 0
				43	/* TAKum03020 */
				44	ld.q r2, 0x00, r63
				45	ld.q r2, 0x20, r63
				46	ld.q r2, 0x40, r63
				47	ld.q r2, 0x60, r63
				48	#endif
				49	alloco r3, 0x00 /* pre-allocate destination line 0 ... */
				50	synco ! TAKum03020
				51	alloco r3, 0x20 /* ... and line 1; the loop's alloco starts at line 2 (offset 0x40) */
				52	synco ! TAKum03020
				53
				54	movi 3968, r6
				55	add r3, r6, r6 /* r6 = dst + 3968: start of the last 4 lines (disabled prefetch cut-off) */
				56	addi r6, 64, r7 /* r7 = dst + 4032: start of the last 2 lines (alloco cut-off) */
				57	addi r7, 64, r8 /* r8 = dst + 4096: end of page, loop bound */
				58	sub r2, r3, r60 /* r60 = src - dst, so r3 + r60 addresses the source */
				59	addi r60, 8, r61 /* r61, r62, r23 = r60 + 8, + 16, + 24 for the four loads */
				60	addi r61, 8, r62
				61	addi r62, 8, r23
				62	addi r60, 0x80, r22 /* r22 = r60 + 0x80; only the disabled prefetch reads it */
				63
				64	/* Minimal code size. The extra branches inside the loop don't cost much
				65	because they overlap with the time spent waiting for prefetches to
				66	complete. */
				67	1:
				68	#if 0
				69	/* TAKum03020 */
				70	bge/u r3, r6, tr2 ! skip prefetch for last 4 lines
				71	ldx.q r3, r22, r63 ! prefetch 4 lines hence
				72	#endif
				73	2:
				74	bge/u r3, r7, tr3 ! skip alloco for last 2 lines
				75	alloco r3, 0x40 ! alloc destination line 2 lines ahead
				76	synco ! TAKum03020
				77	3:
				78	ldx.q r3, r60, r36 /* load 4 x 8 bytes from source (r3 + src-dst offsets) */
				79	ldx.q r3, r61, r37
				80	ldx.q r3, r62, r38
				81	ldx.q r3, r23, r39
				82	st.q r3, 0, r36 /* store the same 32 bytes to the destination */
				83	st.q r3, 8, r37
				84	st.q r3, 16, r38
				85	st.q r3, 24, r39
				86	addi r3, 32, r3 /* advance dst pointer by 32 bytes */
				87	bgt/l r8, r3, tr1 /* loop while r8 > r3, i.e. until dst reaches end of page */
				88
				89	blink tr0, r63 ! return
				90
				91