/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License.  See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2 : source effective address (start of page)
   r3 : destination effective address (start of page)

   Always copies 4096 bytes.

   Points to review.
   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
*/

/*
 * sh64_page_copy: copy one 4096-byte page.
 *
 * In:
 *   r2   source address (start of page)
 *   r3   destination address (start of page)
 *   r18  return address; moved into tr0 by ptabs and consumed by the
 *        final blink
 *
 * Written by this routine:
 *   r3 (left one page past its entry value), r6-r8, r22, r23, r36-r39,
 *   r60-r62 and tr0-tr3.
 *
 * Each loop pass moves 32 bytes as four 8-byte ldx.q/st.q pairs.  The
 * source is addressed as "destination pointer + (source - destination)",
 * so r3 is the only pointer that has to be advanced.
 *
 * The source prefetches are compiled out (#if 0, tagged TAKum03020) and
 * every alloco is followed by a synco carrying the same tag.
 * NOTE(review): TAKum03020 looks like an erratum/workaround id - confirm.
 * With the prefetch disabled, tr2 and r22 are set up but never consumed.
 */

#define SH64_PC_LINE		32	/* bytes copied per loop pass */
#define SH64_PC_PF_LIMIT	3968	/* 4096 - 4 * 32 */
#define SH64_PC_TWO_LINES	64	/* 2 * 32 */
#define SH64_PC_ALLOCO_AHEAD	0x40	/* alloco runs 2 lines ahead */
#define SH64_PC_PF_AHEAD	0x80	/* prefetch runs 4 lines ahead */

	.section .text..SHmedia32,"ax"
	.little

	.balign	8
	.global	sh64_page_copy
sh64_page_copy:

	/* Load the branch targets once, ahead of the loop:
	   tr1 = top of loop, tr2 = past the prefetch, tr3 = past the
	   alloco, tr0 = return address taken from r18. */
	pta	1f, tr1
	pta	2f, tr2
	pta	3f, tr3
	ptabs	r18, tr0

#if 0
	/* TAKum03020 */
	ld.q r2, 0x00, r63
	ld.q r2, 0x20, r63
	ld.q r2, 0x40, r63
	ld.q r2, 0x60, r63
#endif
	/* Allocate the first two destination lines up front; inside the
	   loop alloco always runs two lines ahead of the stores. */
	alloco	r3, 0
	synco				! TAKum03020
	alloco	r3, SH64_PC_LINE
	synco				! TAKum03020

	/* Destination-side limits:
	     r6 = dst + 3968   prefetch cut-off (used by the disabled code)
	     r7 = dst + 4032   alloco cut-off
	     r8 = dst + 4096   end of page */
	movi	SH64_PC_PF_LIMIT, r6
	add	r3, r6, r6
	addi	r6, SH64_PC_TWO_LINES, r7
	addi	r7, SH64_PC_TWO_LINES, r8

	/* Index registers for ldx.q, all relative to the moving r3:
	     r60 = src - dst, r61/r62/r23 = the same +8/+16/+24,
	     r22 = src - dst + 0x80 (prefetch distance). */
	sub	r2, r3, r60
	addi	r60, 8, r61
	addi	r61, 8, r62
	addi	r62, 8, r23
	addi	r60, SH64_PC_PF_AHEAD, r22

	/* Kept small on purpose: per the original note, the branches in
	   the loop overlap with time spent waiting on prefetches. */
1:
#if 0
	/* TAKum03020 */
	bge/u r3, r6, tr2 ! skip prefetch for last 4 lines
	ldx.q r3, r22, r63 ! prefetch 4 lines hence
#endif
2:
	bge/u	r3, r7, tr3		! last 2 lines: nothing left to alloco
	alloco	r3, SH64_PC_ALLOCO_AHEAD ! destination line 2 lines ahead
	synco				! TAKum03020
3:
	/* Read one 32-byte line from the source ... */
	ldx.q	r3, r60, r36
	ldx.q	r3, r61, r37
	ldx.q	r3, r62, r38
	ldx.q	r3, r23, r39
	/* ... and write it to the destination. */
	st.q	r3, 0, r36
	st.q	r3, 8, r37
	st.q	r3, 16, r38
	st.q	r3, 24, r39
	addi	r3, SH64_PC_LINE, r3
	bgt/l	r8, r3, tr1		! loop while r3 is below end of page

	blink	tr0, r63		! return

| 91 | |