blob: 0ec6fca63b563c70cb2861a039e8e26878a524d1 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
3
4 This file is subject to the terms and conditions of the GNU General Public
5 License. See the file "COPYING" in the main directory of this archive
6 for more details.
7
8 Tight version of mempy for the case of just copying a page.
9 Prefetch strategy empirically optimised against RTL simulations
10 of SH5-101 cut2 eval chip with Cayman board DDR memory.
11
12 Parameters:
Paul Mundt379a95d2007-11-20 16:51:28 +090013 r2 : destination effective address (start of page)
14 r3 : source effective address (start of page)
Linus Torvalds1da177e2005-04-16 15:20:36 -070015
16 Always copies 4096 bytes.
17
18 Points to review.
19 * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
20 It seems like the prefetch needs to be at at least 4 lines ahead to get
21 the data into the cache in time, and the allocos contend with outstanding
22 prefetches for the same cache set, so it's better to have the numbers
23 different.
24 */
25
/*
 * copy_page - copy one 4096-byte page.
 *
 * In:       r2  = destination address (start of page)
 *           r3  = source address (start of page)
 *           r18 = return address (loaded into tr0 for the final blink)
 * Out:      no result register is set; r2 is left pointing one page past
 *           the destination start.
 * Clobbers: r2, r6, r7, r8, r22, r23, r36-r39, r60-r62, tr0-tr3.
 *
 * The destination is walked 32 bytes (one line, four quadwords) per loop
 * iteration.  The source is addressed as r2 + (r3 - r2) through indexed
 * loads, so r2 is the only pointer that has to be advanced.
 *
 * Both "#if 0" regions are the source-side prefetch code; they are
 * compiled out and kept for reference under the TAKum03020 tag.  With
 * them disabled, tr2 and r22 are set up but never consumed.
 */
	.section .text..SHmedia32,"ax"
	.little

	.balign 8
	.global copy_page
copy_page:

	/* Copy 4096 bytes worth of data from r3 to r2.
	   Do prefetches 4 lines ahead.
	   Do alloco 2 lines ahead */

	/* Load the branch-target registers once, outside the loop. */
	pta 1f, tr1		! tr1 = loop head
	pta 2f, tr2		! tr2 = alloco step (only used by disabled prefetch code)
	pta 3f, tr3		! tr3 = copy step (skips the alloco)
	ptabs r18, tr0		! tr0 = return target

#if 0
	/* TAKum03020 */
	ld.q r3, 0x00, r63
	ld.q r3, 0x20, r63
	ld.q r3, 0x40, r63
	ld.q r3, 0x60, r63
#endif
	/* Allocate the first two destination lines up front; the loop then
	   stays two lines ahead by allocating at offset 0x40. */
	alloco r2, 0x00
	synco		! TAKum03020
	alloco r2, 0x20
	synco		! TAKum03020

	/* Loop limits, all relative to the destination start. */
	movi 3968, r6
	add  r2, r6, r6		! r6 = dest + 3968 (4 lines before the end)
	addi r6, 64, r7		! r7 = dest + 4032 (2 lines before the end)
	addi r7, 64, r8		! r8 = dest + 4096 (end of page)

	/* Index registers: adding one of these to r2 yields a source address. */
	sub r3, r2, r60		! r60 = src - dest
	addi r60, 8, r61	! r61 = r60 + 8
	addi r61, 8, r62	! r62 = r60 + 16
	addi r62, 8, r23	! r23 = r60 + 24
	addi r60, 0x80, r22	! r22 = r60 + 128 (4 lines ahead, prefetch index)

/* Minimal code size.  The extra branches inside the loop don't cost much
   because they overlap with the time spent waiting for prefetches to
   complete. */
1:
#if 0
	/* TAKum03020 */
	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
	ldx.q r2, r22, r63 ! prefetch 4 lines hence
#endif
2:
	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
	alloco r2, 0x40    ! alloc destination line 2 lines ahead
	synco		! TAKum03020
3:
	/* Move one 32-byte line: four indexed loads from the source, then
	   four stores to the destination. */
	ldx.q r2, r60, r36
	ldx.q r2, r61, r37
	ldx.q r2, r62, r38
	ldx.q r2, r23, r39
	st.q  r2,   0, r36
	st.q  r2,   8, r37
	st.q  r2,  16, r38
	st.q  r2,  24, r39
	addi r2, 32, r2		! advance destination by one line
	bgt/l r8, r2, tr1	! loop while r8 (end of page) > r2

	blink tr0, r63	   ! return