| /* libs/pixelflinger/t32cb16blend.S |
| ** |
| ** Copyright 2006, The Android Open Source Project |
| ** |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| */ |
| |
| |
| // Everything below is assembled into the code section. |
| .text |
| // NOTE(review): .align is given no operand, so the resulting alignment is |
| // whatever the assembler defaults to for this target; confirm it yields |
| // the 4-byte alignment ARM instructions need. |
| .align |
| |
| // Make the entry point visible to the linker. |
| .global scanline_t32cb16blend_arm |
| |
| // pixel DREG, SRC, FB, OFFSET |
| // |
| // Blends one 32-bit source pixel over one 16-bit 5:6:5 destination pixel. |
| // For each channel: out = src + ((dst * (0x100 - (sA + (sA >> 7)))) >> 8), |
| // clamped to the channel maximum (0x1F for red/blue, 0x3F for green). |
| // Without the clamp an oversized sum would spill into the neighbouring |
| // field (or the neighbouring pixel) when the result is packed. |
| // |
| // DREG: destination pixel(s), read only. The pixel occupies bits |
| // [OFFSET+15:OFFSET]: red in its top 5 bits, green in the middle 6, |
| // blue in the low 5. |
| // SRC: source pixel, read only (layout below). |
| // FB: result register. OFFSET == 0 overwrites it; a non-zero OFFSET ORs |
| // the packed pixel into it, so FB must already hold the low pixel |
| // with its upper 16 bits clear (expand with OFFSET 0 first). |
| // OFFSET: 0 for the low halfword, 16 for the high halfword. No other value |
| // works: the non-zero path relies on smulbt reading the top halfword |
| // and on the red field being the topmost bits of DREG. |
| // |
| // uses r6, r7, lr and the condition flags |
| |
| .macro pixel, DREG, SRC, FB, OFFSET |
| |
| // SRC = AABBGGRR (alpha in bits 31:24, blue 23:16, green 15:8, red 7:0) |
| mov r7, \SRC, lsr #24 // sA |
| add r7, r7, r7, lsr #7 // sA + (sA >> 7): maps 0..255 onto 0..256 |
| rsb r7, r7, #0x100 // r7 = 0x100 - (sA+(sA>>7)), the destination weight |
| |
| // NOTE(review): no reference to this local label is visible in this file. |
| 1: |
| |
| .if \OFFSET |
| |
| // red: the shift leaves only the 5 red bits, so no mask is needed |
| mov lr, \DREG, lsr #(\OFFSET + 6 + 5) |
| smulbb lr, r7, lr |
| mov r6, \SRC, lsr #3 |
| and r6, r6, #0x1F |
| add lr, r6, lr, lsr #8 |
| cmp lr, #0x1F // saturate to 5 bits |
| movhs lr, #0x1F |
| orr \FB, \FB, lr, lsl #(\OFFSET + 11) |
| |
| // green: smulbt multiplies by the top halfword, where green sits at bit 5 |
| and r6, \DREG, #(0x3F<<(\OFFSET + 5)) |
| smulbt r6, r7, r6 |
| mov lr, \SRC, lsr #(8+2) |
| and lr, lr, #0x3F |
| add r6, lr, r6, lsr #(5+8) |
| cmp r6, #0x3F // saturate to 6 bits |
| movhs r6, #0x3F |
| orr \FB, \FB, r6, lsl #(\OFFSET + 5) |
| |
| // blue |
| and lr, \DREG, #(0x1F << \OFFSET) |
| smulbt lr, r7, lr |
| mov r6, \SRC, lsr #(8+8+3) |
| and r6, r6, #0x1F |
| add lr, r6, lr, lsr #8 |
| cmp lr, #0x1F // saturate to 5 bits |
| movhs lr, #0x1F |
| orr \FB, \FB, lr, lsl #\OFFSET |
| |
| .else |
| |
| // red: this path initialises FB, discarding its previous contents |
| mov lr, \DREG, lsr #(6+5) |
| and lr, lr, #0x1F |
| smulbb lr, r7, lr |
| mov r6, \SRC, lsr #3 |
| and r6, r6, #0x1F |
| add lr, r6, lr, lsr #8 |
| cmp lr, #0x1F // saturate to 5 bits |
| movhs lr, #0x1F |
| mov \FB, lr, lsl #11 |
| |
| // green: the field stays at bit 5, so the product is shifted by 5+8 |
| and r6, \DREG, #(0x3F<<5) |
| smulbb r6, r7, r6 |
| mov lr, \SRC, lsr #(8+2) |
| and lr, lr, #0x3F |
| add r6, lr, r6, lsr #(5+8) |
| cmp r6, #0x3F // saturate to 6 bits |
| movhs r6, #0x3F |
| orr \FB, \FB, r6, lsl #5 |
| |
| // blue |
| and lr, \DREG, #0x1F |
| smulbb lr, r7, lr |
| mov r6, \SRC, lsr #(8+8+3) |
| and r6, r6, #0x1F |
| add lr, r6, lr, lsr #8 |
| cmp lr, #0x1F // saturate to 5 bits |
| movhs lr, #0x1F |
| orr \FB, \FB, lr |
| |
| .endif |
| |
| .endm |
| |
| |
| // scanline_t32cb16blend_arm |
| // |
| // Blends a run of 32-bit source pixels onto 16-bit destination pixels with |
| // the pixel macro. A leading pixel is handled on its own when the |
| // destination is not word aligned, the bulk is done two pixels per 32-bit |
| // destination word, and a trailing odd pixel is handled on its own. |
| // |
| // On entry: r0 = destination (advanced 2 bytes per pixel), |
| // r1 = source (advanced 4 bytes per pixel), |
| // r2 = number of pixels (0 is accepted and returns at once). |
| // NOTE(review): r0 is presumably at least halfword aligned, since it is |
| // accessed with ldrh/strh; confirm against the callers. |
| // r4-r7 and lr are saved and restored; r3 and r12 are overwritten. |
| // |
| // r0: dst ptr |
| // r1: src ptr |
| // r2: count |
| // r3: d |
| // r4: s0 |
| // r5: s1 |
| // r6: pixel |
| // r7: pixel |
| // r8: free |
| // r9: free |
| // r10: free |
| // r11: free |
| // r12: scratch |
| // r14: pixel |
| |
| // Mark the symbol as a function so the linker treats it as code |
| // (needed, for instance, for ARM/Thumb interworking). |
| .type scanline_t32cb16blend_arm, %function |
| scanline_t32cb16blend_arm: |
| stmfd sp!, {r4-r7, lr} |
| |
| pld [r0] |
| pld [r1] |
| |
| // align DST to 32 bits |
| tst r0, #0x3 |
| beq aligned |
| // unaligned: one pixel is handled on its own first; leave if count is 0 |
| subs r2, r2, #1 |
| ldmlofd sp!, {r4-r7, lr} // return |
| bxlo lr |
| |
| // single pixel: used for the unaligned head and for the odd tail |
| last: |
| ldr r4, [r1], #4 |
| ldrh r3, [r0] |
| pixel r3, r4, r12, 0 |
| strh r12, [r0], #2 |
| |
| aligned: |
| // from here on r2 = pixels remaining - 2, so >= 0 means a full pair is left |
| subs r2, r2, #2 |
| blo 9f |
| |
| // The main loop is unrolled twice and processes 4 pixels per pass |
| 8: ldmia r1!, {r4, r5} |
| // stream the source |
| pld [r1, #32] |
| add r0, r0, #4 |
| // both source pixels are zero: skip the pair, the destination is not touched |
| orrs r3, r4, r5 |
| beq 7f |
| |
| // load the destination |
| ldr r3, [r0, #-4] |
| // stream the destination |
| pld [r0, #32] |
| pixel r3, r4, r12, 0 |
| pixel r3, r5, r12, 16 |
| // effectively, we're getting write-combining by virtue of the |
| // cpu's write-back cache. |
| str r12, [r0, #-4] |
| |
| // 2nd iteration of the loop, don't stream anything |
| subs r2, r2, #2 |
| blt 9f |
| ldmia r1!, {r4, r5} |
| add r0, r0, #4 |
| orrs r3, r4, r5 |
| beq 7f |
| ldr r3, [r0, #-4] |
| pixel r3, r4, r12, 0 |
| pixel r3, r5, r12, 16 |
| str r12, [r0, #-4] |
| |
| |
| 7: subs r2, r2, #2 |
| bhs 8b |
| |
| // r2 is -2 (nothing left) or -1 (one pixel left). Adding 1 leaves the |
| // carry clear only for -2, so "lo" returns and otherwise the last |
| // pixel is loaded from [r1] and blended at "last". |
| 9: adds r2, r2, #1 |
| ldmlofd sp!, {r4-r7, lr} // return |
| bxlo lr |
| b last |
| .size scanline_t32cb16blend_arm, . - scanline_t32cb16blend_arm |