| /* |
| Copyright 2003 Richard Curnow, SuperH (UK) Ltd. |
| |
| This file is subject to the terms and conditions of the GNU General Public |
| License. See the file "COPYING" in the main directory of this archive |
| for more details. |
| |
| Tight version of memcpy for the case of just copying a page. |
| Prefetch strategy empirically optimised against RTL simulations |
| of SH5-101 cut2 eval chip with Cayman board DDR memory. |
| |
| Parameters: |
| r2 : destination effective address (start of page) |
| r3 : source effective address (start of page) |
| |
| Always copies 4096 bytes. |
| |
| Points to review. |
| * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. |
| It seems like the prefetch needs to be at least 4 lines ahead to get |
| the data into the cache in time, and the allocos contend with outstanding |
| prefetches for the same cache set, so it's better to have the numbers |
| different. |
| */ |
| |
| .section .text..SHmedia32,"ax" |
| .little |
| |
| .balign 8 |
| .global copy_page |
| copy_page: |
| |
| /* Copy 4096 bytes worth of data from r3 to r2. |
| Do prefetches 4 lines ahead. |
| Do alloco 2 lines ahead. |
| |
| NOTE: both prefetch sequences below are currently compiled out |
| (the blocks tagged TAKum03020 under "if 0"), so only the alloco |
| look-ahead is active in the code that is actually assembled. |
| |
| Inputs: |
| r2 = destination address, r3 = source address |
| r18 = copied into tr0 and used as the return target |
| |
| Registers written (offsets are relative to r2 on entry): |
| r6 = r2 + 3968 cut-off for the (disabled) prefetch |
| r7 = r2 + 4032 cut-off for the in-loop alloco |
| r8 = r2 + 4096 end of destination, loop bound |
| r60 = r3 - r2 source-minus-destination displacement |
| r61, r62, r23 = r60 + 8, r60 + 16, r60 + 24 |
| r22 = r60 + 0x80 only read by the (disabled) prefetch |
| r36..r39 data being copied |
| tr0..tr3 branch targets |
| r2 is advanced by 32 per iteration and equals r8 when the routine |
| returns. |
| |
| The existing comments call 128 bytes "4 lines" and 64 bytes "2 lines", |
| i.e. a line is 32 bytes, which is also the amount copied per |
| loop iteration. */ |
| |
| pta 1f, tr1 /* tr1 = label 1, loop head */ |
| pta 2f, tr2 /* tr2 = label 2; only branched to by the disabled prefetch code */ |
| pta 3f, tr3 /* tr3 = label 3, start of the load/store body */ |
| ptabs r18, tr0 /* tr0 = return target taken from r18 */ |
| |
| #if 0 |
| /* TAKum03020: disabled loads of the first four source lines into r63 |
| (presumably prefetches, like the ldx.q to r63 in the loop - confirm) */ |
| ld.q r3, 0x00, r63 |
| ld.q r3, 0x20, r63 |
| ld.q r3, 0x40, r63 |
| ld.q r3, 0x60, r63 |
| #endif |
| alloco r2, 0x00 /* allocate the first two destination lines up front; */ |
| synco ! TAKum03020 |
| alloco r2, 0x20 /* the loop itself only allocates at offset 0x40 from r2 */ |
| synco ! TAKum03020 |
| |
| movi 3968, r6 /* 3968 = 4096 - 128 */ |
| add r2, r6, r6 /* r6 = dest + 3968 */ |
| addi r6, 64, r7 /* r7 = dest + 4032 */ |
| addi r7, 64, r8 /* r8 = dest + 4096, one past the last byte to write */ |
| sub r3, r2, r60 /* r60 = src - dest, so r2 + r60 addresses the source */ |
| addi r60, 8, r61 /* displacement for the 2nd quadword of a line */ |
| addi r61, 8, r62 /* displacement for the 3rd quadword of a line */ |
| addi r62, 8, r23 /* displacement for the 4th quadword of a line */ |
| addi r60, 0x80, r22 /* source displacement + 128; only read by the disabled prefetch */ |
| |
| /* Minimal code size. The extra branches inside the loop don't cost much |
| because they overlap with the time spent waiting for prefetches to |
| complete. |
| |
| Each iteration copies one 32-byte line from r2 + r60 to r2 and then |
| advances r2 by 32. Label 1 is the loop head, label 2 is where the |
| disabled prefetch-skip branch would land, and label 3 is where the |
| alloco-skip branch lands. The meaning of the TAKum03020 tag on the |
| synco instructions is not explained in this file (presumably a |
| hardware workaround identifier - confirm). */ |
| 1: |
| #if 0 |
| /* TAKum03020 */ |
| bge/u r2, r6, tr2 ! skip prefetch for last 4 lines |
| ldx.q r2, r22, r63 ! prefetch 4 lines hence |
| #endif |
| 2: |
| bge/u r2, r7, tr3 ! skip alloco for last 2 lines |
| alloco r2, 0x40 ! alloc destination line 2 lines ahead |
| synco ! TAKum03020 |
| 3: |
| ldx.q r2, r60, r36 /* load four quadwords from the source line (r2 + src-dest displacement) */ |
| ldx.q r2, r61, r37 |
| ldx.q r2, r62, r38 |
| ldx.q r2, r23, r39 |
| st.q r2, 0, r36 /* store them to the matching destination line */ |
| st.q r2, 8, r37 |
| st.q r2, 16, r38 |
| st.q r2, 24, r39 |
| addi r2, 32, r2 /* advance the destination pointer past the line just copied */ |
| bgt/l r8, r2, tr1 /* loop back to label 1 while r2 is still below r8 */ |
| |
| blink tr0, r63 ! return |