/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of The Linux Foundation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

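/*
 * Tunables for the prefetching copy path, counted in 128-byte lines:
 *   PLDOFFS   - how many PLDSIZE-byte lines ahead of the copy pointer the
 *               streaming prefetches are issued.
 *   PLDTHRESH - copies of at most this many 128-byte chunks skip
 *               prefetching entirely and use the plain copy loop.
 *   BBTHRESH  - copies of at most this many 128-byte chunks skip the
 *               stagger computation below and just prime the prefetcher.
 *   PLDSIZE   - bytes prefetched per step (one 128-byte line).
 */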
#ifdef PLDOFFS
#undef PLDOFFS
#endif
#define PLDOFFS (16)

#ifdef PLDTHRESH
#undef PLDTHRESH
#endif
#define PLDTHRESH (PLDOFFS)

#ifdef BBTHRESH
#undef BBTHRESH
#endif
#define BBTHRESH (2048/128)

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

#ifdef PLDSIZE
#undef PLDSIZE
#endif
#define PLDSIZE (128)

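/*
 * kryo_bb_memcpy: memcpy with the standard AAPCS64 memcpy contract.
 *   x0 = destination, x1 = source, x2 = byte count.
 * Returns the original destination pointer in x0 (saved in x11); the
 * buffers are assumed not to overlap, as with C memcpy.  Assembler
 * directives (.text/.global or an ENTRY wrapper) are assumed to be
 * provided by whatever file includes this body.
 */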
kryo_bb_memcpy:
    mov x11, x0
    cmp x2, #4
    blo kryo_bb_lt4
    cmp x2, #16
    blo kryo_bb_lt16
    cmp x2, #32
    blo kryo_bb_16
    cmp x2, #64
    blo kryo_bb_copy_32_a
    cmp x2, #128
    blo kryo_bb_copy_64_a

    // count is at least 128 here, so we can spend up to 127 bytes
    // bringing the source up to 128-byte alignment
    neg x3, x1 // calculate count to get SOURCE aligned
    ands x3, x3, #0x7F
    b.eq kryo_bb_source_aligned // already aligned
    // alignment fixup, small to large (favorable alignment)
    tbz x3, #0, 1f
    ldrb w5, [x1], #1
    strb w5, [x0], #1
1:  tbz x3, #1, 2f
    ldrh w6, [x1], #2
    strh w6, [x0], #2
2:  tbz x3, #2, 3f
    ldr w8, [x1], #4
    str w8, [x0], #4
3:  tbz x3, #3, 4f
    ldr x9, [x1], #8
    str x9, [x0], #8
4:  tbz x3, #4, 5f
    ldr q7, [x1], #16
    str q7, [x0], #16
5:  tbz x3, #5, 55f
    ldp q0, q1, [x1], #32
    stp q0, q1, [x0], #32
55: tbz x3, #6, 6f
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
6:  subs x2, x2, x3 // fixup count after alignment
    b.eq kryo_bb_exit
    cmp x2, #128
    blo kryo_bb_copy_64_a
kryo_bb_source_aligned:
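    // x12 = number of whole 128-byte chunks remaining; copies of at most
    // PLDTHRESH chunks take the no-prefetch loop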
    lsr x12, x2, #7
    cmp x12, #PLDTHRESH
    bls kryo_bb_copy_128_loop_nopld

    cmp x12, #BBTHRESH
    bls kryo_bb_prime_pump

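    // Stagger computation: pick x14 so that src+x14 is congruent to
    // dst+0x400 modulo 0x800 (the lsl/lsr pair keeps only the low 11 bits,
    // i.e. reduces modulo 2048), then bias it past the initial
    // PLDOFFS*PLDSIZE lookahead window.  This presumably offsets the
    // PLDL1KEEP stream half a 2 KiB period from the store stream to avoid
    // cache-set conflicts; the constants look like Kryo-specific tuning.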
    add x14, x0, #0x400
    add x9, x1, #(PLDOFFS*PLDSIZE)
    sub x14, x14, x9
    lsl x14, x14, #(21+32)
    lsr x14, x14, #(21+32)
    add x14, x14, #(PLDOFFS*PLDSIZE)
    cmp x12, x14, lsr #7
    bls kryo_bb_prime_pump

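    // x9 = (x14 >> 7) - PLDOFFS: iterations of the double-prefetch loop
    // needed before the PLDL1KEEP stream takes over; if the stagger region
    // is no larger than the basic lookahead window, fall back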
    mov x9, #(PLDOFFS)
    lsr x13, x14, #7
    subs x9, x13, x9
    bls kryo_bb_prime_pump

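    // x10 = start of the PLDL1KEEP prefetch stream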
    add x10, x1, x14
    bic x10, x10, #0x7F // round down to a multiple of PLDSIZE

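    // x12 = chunks left after the stagger region; clamp the double-
    // prefetch count to what is available: if x9 <= x12 the outer loop
    // still runs x12 - x9 chunks afterwards, otherwise the double-prefetch
    // loop copies everything that remains (x9 = x12, x12 = 0)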
    sub x12, x12, x14, lsr #7
    cmp x9, x12
    sub x13, x12, x9
    csel x12, x13, x12, LS
    csel x9, x12, x9, HI
    csel x12, xzr, x12, HI

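    // warm up: prefetch the farthest line of the initial lookahead window
    // in two 64-byte halves before entering the copy loop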
    prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
    prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
kryo_bb_copy_128_loop_outer_doublepld:
    prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
    prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE+64)]
    subs x9, x9, #1
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    ldp q4, q5, [x1], #32
    ldp q6, q7, [x1], #32
    prfm PLDL1KEEP, [x10]
    prfm PLDL1KEEP, [x10, #64]
    add x10, x10, #128
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
    stp q4, q5, [x0], #32
    stp q6, q7, [x0], #32
    bne kryo_bb_copy_128_loop_outer_doublepld
    cmp x12, #0
    beq kryo_bb_pop_before_nopld
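    // ~448 KiB (3584-chunk) cutoff: larger copies switch to the loop
    // below, which uses demand loads instead of prfm hints; the threshold
    // is likely tuned to the Kryo cache hierarchy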
    cmp x12, #(448*1024/128)
    bls kryo_bb_copy_128_loop_outer

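    // For very large copies, pull each upcoming line into the cache with a
    // real load (the dead ldr from x10): unlike a prfm hint, a demand load
    // cannot be dropped by the memory system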
kryo_bb_copy_128_loop_ddr:
    subs x12, x12, #1
    ldr x3, [x10], #128
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    ldp q4, q5, [x1], #32
    ldp q6, q7, [x1], #32
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
    stp q4, q5, [x0], #32
    stp q6, q7, [x0], #32
    bne kryo_bb_copy_128_loop_ddr
    b kryo_bb_pop_before_nopld

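    // Simple primed path: skip the stagger computation, prefetch the far
    // edge of the lookahead window, and leave x14 = PLDOFFS*PLDSIZE bytes
    // for the no-prefetch loop to finish once the lookahead runs out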
kryo_bb_prime_pump:
    mov x14, #(PLDOFFS*PLDSIZE)
    add x10, x1, #(PLDOFFS*PLDSIZE)
    bic x10, x10, #0x7F
    sub x12, x12, #PLDOFFS
    prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
    prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
    cmp x12, #(448*1024/128)
    bhi kryo_bb_copy_128_loop_ddr

kryo_bb_copy_128_loop_outer:
    subs x12, x12, #1
    prfm PLDL1KEEP, [x10]
    prfm PLDL1KEEP, [x10, #64]
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    ldp q4, q5, [x1], #32
    ldp q6, q7, [x1], #32
    add x10, x10, #128
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
    stp q4, q5, [x0], #32
    stp q6, q7, [x0], #32
    bne kryo_bb_copy_128_loop_outer

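    // the final x14 >> 7 chunks were already prefetched above; copy them
    // without issuing any further prefetches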
kryo_bb_pop_before_nopld:
    lsr x12, x14, #7
kryo_bb_copy_128_loop_nopld:
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    ldp q4, q5, [x1], #32
    ldp q6, q7, [x1], #32
    subs x12, x12, #1
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
    stp q4, q5, [x0], #32
    stp q6, q7, [x0], #32
    bne kryo_bb_copy_128_loop_nopld
    ands x2, x2, #0x7F
    beq kryo_bb_exit

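    // tail: x2 < 128 here; copy the remainder by testing each size bit
    // from 64 bytes down to 1, falling through the ladder below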
kryo_bb_copy_64_a:
    tbz x2, #6, kryo_bb_copy_32_a
    ldp q0, q1, [x1], #32
    ldp q2, q3, [x1], #32
    stp q0, q1, [x0], #32
    stp q2, q3, [x0], #32
kryo_bb_copy_32_a:
    tbz x2, #5, kryo_bb_16
    ldp q0, q1, [x1], #32
    stp q0, q1, [x0], #32
kryo_bb_16:
    tbz x2, #4, kryo_bb_lt16
    ldr q7, [x1], #16
    str q7, [x0], #16
    ands x2, x2, #0x0F
    beq kryo_bb_exit
kryo_bb_lt16:
    tbz x2, #3, kryo_bb_lt8
    ldr x3, [x1], #8
    str x3, [x0], #8
kryo_bb_lt8:
    tbz x2, #2, kryo_bb_lt4
    ldr w3, [x1], #4
    str w3, [x0], #4
kryo_bb_lt4:
    tbz x2, #1, kryo_bb_lt2
    ldrh w3, [x1], #2
    strh w3, [x0], #2
kryo_bb_lt2:
    tbz x2, #0, kryo_bb_exit
    ldrb w3, [x1], #1
    strb w3, [x0], #1
kryo_bb_exit:
    mov x0, x11
    ret