reed@android.com | a0bd7f4 | 2009-08-03 17:22:46 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright (C) 2009 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "SkBitmapProcState.h" |
| 18 | |
| 19 | #if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN) |
| 20 | void S16_D16_nofilter_DX_arm(const SkBitmapProcState& s, |
| 21 | const uint32_t* SK_RESTRICT xy, |
| 22 | int count, uint16_t* SK_RESTRICT colors) { |
| 23 | SkASSERT(count > 0 && colors != NULL); |
| 24 | SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); |
| 25 | SkASSERT(s.fDoFilter == false); |
| 26 | |
| 27 | const uint16_t* SK_RESTRICT srcAddr = (const uint16_t*)s.fBitmap->getPixels(); |
| 28 | |
| 29 | // buffer is y32, x16, x16, x16, x16, x16 |
| 30 | // bump srcAddr to the proper row, since we're told Y never changes |
| 31 | SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height()); |
| 32 | srcAddr = (const uint16_t*)((const char*)srcAddr + |
| 33 | xy[0] * s.fBitmap->rowBytes()); |
| 34 | |
| 35 | uint16_t src; |
| 36 | |
| 37 | if (1 == s.fBitmap->width()) { |
| 38 | src = srcAddr[0]; |
| 39 | uint16_t dstValue = src; |
| 40 | sk_memset16(colors, dstValue, count); |
| 41 | } else { |
| 42 | int i; |
| 43 | const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1); |
| 44 | |
| 45 | if((count >> 2) > 0) { |
| 46 | asm volatile ( |
| 47 | "mov r8, %[count], lsr #2 \n\t" // shift down count so we iterate in fours |
| 48 | "1: \n\t" |
| 49 | "subs r8, r8, #1 \n\t" // decrement loop counter |
| 50 | "ldrh r4, [%[xx]], #2 \n\t" // load xx value, update ptr |
| 51 | "ldrh r5, [%[xx]], #2 \n\t" // load xx value, update ptr |
| 52 | "ldrh r6, [%[xx]], #2 \n\t" // load xx value, update ptr |
| 53 | "add r4, r4, r4 \n\t" // double offset for half word addressing |
| 54 | "ldrh r7, [%[xx]], #2 \n\t" // load xx value, update ptr |
| 55 | "add r5, r5, r5 \n\t" // double offset for half word addressing |
| 56 | "ldrh r4, [%[srcAddr], r4] \n\t" // load value from srcAddr[*xx] |
| 57 | "add r6, r6, r6 \n\t" // double offset for half word addressing |
| 58 | "ldrh r5, [%[srcAddr], r5] \n\t" // load value from srcAddr[*xx] |
| 59 | "add r7, r7, r7 \n\t" // double offset for half word addressing |
| 60 | "ldrh r6, [%[srcAddr], r6] \n\t" // load value from srcAddr[*xx] |
| 61 | "ldrh r7, [%[srcAddr], r7] \n\t" // load value from srcAddr[*xx] |
| 62 | "strh r4, [%[colors]], #2 \n\t" // store value to colors, update ptr |
| 63 | "strh r5, [%[colors]], #2 \n\t" // store value to colors, update ptr |
| 64 | "strh r6, [%[colors]], #2 \n\t" // store value to colors, update ptr |
| 65 | "strh r7, [%[colors]], #2 \n\t" // store value to colors, update ptr |
| 66 | "bgt 1b \n\t" // branch if loop counter > 0 |
| 67 | : [count] "+r" (count), [xx] "+r" (xx), [srcAddr] "+r" (srcAddr), [colors] "+r" (colors) |
| 68 | : |
| 69 | : "cc", "memory", "r4", "r5", "r6", "r7", "r8" |
| 70 | ); |
| 71 | } |
| 72 | for (i = (count & 3); i > 0; --i) { |
| 73 | SkASSERT(*xx < (unsigned)s.fBitmap->width()); |
| 74 | src = srcAddr[*xx++]; *colors++ = src; |
| 75 | } |
| 76 | } |
| 77 | } |
| 78 | #endif //__ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN) |
| 79 | |
| 80 | #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 81 | void S16_D16_filter_DX_arm(const SkBitmapProcState& s, |
| 82 | const uint32_t* SK_RESTRICT xy, |
| 83 | int count, uint16_t* SK_RESTRICT colors) |
| 84 | { |
| 85 | SkASSERT(count > 0 && colors != NULL); |
| 86 | SkASSERT(s.fDoFilter); |
| 87 | |
| 88 | const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); |
| 89 | unsigned rb = s.fBitmap->rowBytes(); |
| 90 | unsigned subY; |
| 91 | const uint16_t* SK_RESTRICT row0; |
| 92 | const uint16_t* SK_RESTRICT row1; |
| 93 | unsigned int rowgap; |
| 94 | const uint32_t c7ffe = 0x7ffe; |
| 95 | |
| 96 | // setup row ptrs and update proc_table |
| 97 | { |
| 98 | uint32_t XY = *xy++; |
| 99 | unsigned y0 = XY >> 14; |
| 100 | row0 = (const uint16_t*)(srcAddr + (y0 >> 4) * rb); |
| 101 | row1 = (const uint16_t*)(srcAddr + (XY & 0x3FFF) * rb); |
| 102 | rowgap = (unsigned int)row1 - (unsigned int)row0; |
| 103 | subY = y0 & 0xF; |
| 104 | } |
| 105 | |
| 106 | unsigned int count4 = ((count >> 2) << 4) | subY; |
| 107 | count &= 3; |
| 108 | |
| 109 | asm volatile ( |
| 110 | "and r4, %[count4], #0xF \n\t" // mask off subY |
| 111 | "vmov.u16 d2[0], r4 \n\t" // move subY to Neon |
| 112 | "rsb r4, r4, #16 \n\t" // r4 = 16-subY |
| 113 | "vmov.u16 d2[1], r4 \n\t" // move 16-subY to Neon |
| 114 | "movs %[count4], %[count4], lsr #4 \n\t" // shift count down, lose subY |
| 115 | "vmov.u16 d3, #16 \n\t" // create constant |
| 116 | "vmov.u16 q2, #31 \n\t" // set up blue mask |
| 117 | "beq 2f \n\t" // if count4 == 0, exit |
| 118 | |
| 119 | "1: \n\t" |
| 120 | "ldmia %[xy]!, {r4, r5, r6, r7} \n\t" // load four xy values |
| 121 | // xy = [ x0:14 | subX:4 | x1:14 ] |
| 122 | // extract subX for iter 0-3 |
| 123 | "vmov d0, r4, r5 \n\t" // move xy to Neon, iter 0-1 |
| 124 | "vmov d1, r6, r7 \n\t" // move xy to Neon, iter 2-3 |
| 125 | |
| 126 | // Load 16 pixels for four filter iterations from memory. |
| 127 | // Because the source pixels are potentially scattered, each lane |
| 128 | // of each vector is loaded separately. Also, the X sub pixel |
| 129 | // offset is extracted. |
| 130 | |
| 131 | // iter 0 |
| 132 | "mov r8, r4, lsr #18 \n\t" // extract x0 |
| 133 | "and r4, %[c7ffe], r4, lsl #1 \n\t" // extract x1 and make byte offset |
| 134 | "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0] |
| 135 | "add r4, %[row0], r4 \n\t" // calculate address of row0[x1] |
| 136 | "vld1.u16 {d16[0]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1 |
| 137 | "vld1.u16 {d17[0]}, [r4], %[rowgap] \n\t" // load row0[x1] and move ptr to row1 |
| 138 | "vld1.u16 {d18[0]}, [r8] \n\t" // load row1[x0] |
| 139 | "vld1.u16 {d19[0]}, [r4] \n\t" // load row1[x1] |
| 140 | |
| 141 | // iter 1 |
| 142 | "mov r8, r5, lsr #18 \n\t" // extract x0 |
| 143 | "and r5, %[c7ffe], r5, lsl #1 \n\t" // extract x1 and make byte offset |
| 144 | "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0] |
| 145 | "add r5, %[row0], r5 \n\t" // calculate address of row0[x1] |
| 146 | "vld1.u16 {d16[1]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1 |
| 147 | "vld1.u16 {d17[1]}, [r5], %[rowgap] \n\t" // load row0[x1] and move ptr to row1 |
| 148 | "vld1.u16 {d18[1]}, [r8] \n\t" // load row1[x0] |
| 149 | "vld1.u16 {d19[1]}, [r5] \n\t" // load row1[x1] |
| 150 | |
| 151 | "vshrn.u32 d0, q0, #2 \n\t" // shift right subX by 2 and narrow |
| 152 | // iter 2 |
| 153 | "mov r8, r6, lsr #18 \n\t" // extract x0 |
| 154 | "and r6, %[c7ffe], r6, lsl #1 \n\t" // extract x1 and make byte offset |
| 155 | "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0] |
| 156 | "add r6, %[row0], r6 \n\t" // calculate address of row0[x1] |
| 157 | "vld1.u16 {d16[2]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1 |
| 158 | "vld1.u16 {d17[2]}, [r6], %[rowgap] \n\t" // load row0[x1] and move ptr to row1 |
| 159 | "vld1.u16 {d18[2]}, [r8] \n\t" // load row1[x0] |
| 160 | "vld1.u16 {d19[2]}, [r6] \n\t" // load row1[x1] |
| 161 | |
| 162 | "vshr.u16 d0, d0, #12 \n\t" // shift right subX to bottom 4 bits |
| 163 | // iter 3 |
| 164 | "mov r8, r7, lsr #18 \n\t" // extract x0 |
| 165 | "and r7, %[c7ffe], r7, lsl #1 \n\t" // extract x1 and make byte offset |
| 166 | "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0] |
| 167 | "add r7, %[row0], r7 \n\t" // calculate address of row0[x1] |
| 168 | "vld1.u16 {d16[3]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1 |
| 169 | "vld1.u16 {d17[3]}, [r7], %[rowgap] \n\t" // load row0[x1] and move ptr to row1 |
| 170 | "vld1.u16 {d18[3]}, [r8] \n\t" // load row1[x0] |
| 171 | "vld1.u16 {d19[3]}, [r7] \n\t" // load row1[x1] |
| 172 | |
| 173 | // Registers d16-d19 now contain pixels a00-a11 for 4 iterations: |
| 174 | // d16 = [ a00_3 | a00_2 | a00_1 | a00_0 ] |
| 175 | // d17 = [ a01_3 | a01_2 | a01_1 | a01_0 ] |
| 176 | // d18 = [ a10_3 | a10_2 | a10_1 | a10_0 ] |
| 177 | // d19 = [ a11_3 | a11_2 | a11_1 | a11_0 ] |
| 178 | // |
| 179 | // Extract RGB channels from each 565 pixel. |
| 180 | |
| 181 | "vshl.i16 q11, q8, #5 \n\t" // shift greens to top of each lane |
| 182 | "vand q12, q8, q2 \n\t" // mask blues |
| 183 | "vshr.u16 q10, q8, #11 \n\t" // shift reds to bottom of each lane |
| 184 | "vshr.u16 q11, q11, #10 \n\t" // shift greens to bottom of each lane |
| 185 | "vshl.i16 q14, q9, #5 \n\t" // shift greens to top of each lane |
| 186 | "vand q15, q9, q2 \n\t" // mask blues |
| 187 | "vshr.u16 q13, q9, #11 \n\t" // shift reds to bottom of each lane |
| 188 | "vshr.u16 q14, q14, #10 \n\t" // shift greens to bottom of each lane |
| 189 | |
| 190 | // There are now six Q regs, containing |
| 191 | // q10 = [ a01r3 | a01r2 | a01r1 | a01r0 | a00r3 | a00r2 | a00r1 | a00r0 ] |
| 192 | // q11 = [ a01g3 | a01g2 | a01g1 | a01g0 | a00g3 | a00g2 | a00g1 | a00g0 ] |
| 193 | // q12 = [ a01b3 | a01b2 | a01b1 | a01b0 | a00b3 | a00b2 | a00b1 | a00b0 ] |
| 194 | // q13 = [ a11r3 | a11r2 | a11r1 | a11r0 | a01r3 | a01r2 | a01r1 | a01r0 ] |
| 195 | // q14 = [ a11g3 | a11g2 | a11g1 | a11g0 | a01g3 | a01g2 | a01g1 | a01g0 ] |
| 196 | // q15 = [ a11b3 | a11b2 | a11b1 | a11b0 | a01b3 | a01b2 | a01b1 | a01b0 ] |
| 197 | // where aXXyZ: XX = pixel position, y = colour channel, Z = iteration |
| 198 | // d0 = subX, d1 = 16-subX |
| 199 | // d2[0] = subY, d2[1] = 16-subY |
| 200 | // d3 = 16, q2(d4d5) = 31 |
| 201 | |
| 202 | // The filter: |
| 203 | // |
| 204 | // | | |
| 205 | // ---- a00 ---- a01 ----> * (16-y) |
| 206 | // | | |
| 207 | // -----a10 ---- a11 ----> * y |
| 208 | // | | |
| 209 | // V V |
| 210 | // * (16-x) * x |
| 211 | // |
| 212 | // result = (a00.(16-y).(16-x) + a01.(16-y).x + a10.(16-x).y + a11.x.y) >> 8 |
| 213 | // |
| 214 | |
| 215 | "vsub.u16 d1, d3, d0 \n\t" // calculate 16-subX |
| 216 | // multiply top pixel pair by (16-y) |
| 217 | "vmul.i16 q10, q10, d2[1] \n\t" // top reds multiplied by (16-y) |
| 218 | "vmul.i16 q11, q11, d2[1] \n\t" // top greens multiplied by (16-y) |
| 219 | "vmul.i16 q12, q12, d2[1] \n\t" // top blues multiplied by (16-y) |
| 220 | // multiply bottom pixel pair by y |
| 221 | "vmul.i16 q13, q13, d2[0] \n\t" // bottom reds multiplied by y |
| 222 | "vmul.i16 q14, q14, d2[0] \n\t" // bottom greens multiplied by y |
| 223 | "vmul.i16 q15, q15, d2[0] \n\t" // bottom blues multiplied by y |
| 224 | // mul/acc left pixels by (16-x) |
| 225 | "vmul.i16 d16, d20, d1 \n\t" // resultr = a00r * (16-x) |
| 226 | "vmul.i16 d17, d22, d1 \n\t" // resultg = a00g * (16-x) |
| 227 | "vmul.i16 d18, d24, d1 \n\t" // resultb = a00b * (16-x) |
| 228 | "vmla.i16 d16, d26, d1 \n\t" // resultr += a00r * (16-x) |
| 229 | "vmla.i16 d17, d28, d1 \n\t" // resultg += a00g * (16-x) |
| 230 | "vmla.i16 d18, d30, d1 \n\t" // resultb += a00b * (16-x) |
| 231 | // mul/acc right pixels by x |
| 232 | "vmla.i16 d16, d21, d0 \n\t" // resultr += a01r * x |
| 233 | "vmla.i16 d17, d23, d0 \n\t" // resultg += a01g * x |
| 234 | "vmla.i16 d18, d25, d0 \n\t" // resultb += a01b * x |
| 235 | "vmla.i16 d16, d27, d0 \n\t" // resultr += a11r * x |
| 236 | "vmla.i16 d17, d29, d0 \n\t" // resultg += a11g * x |
| 237 | "vmla.i16 d18, d31, d0 \n\t" // resultb += a11b * x |
| 238 | "subs %[count4], %[count4], #1 \n\t" // decrement counter |
| 239 | // shift results down 8 bits |
| 240 | "vshr.u16 q8, q8, #8 \n\t" // resultr >>= 8, resultg >>=8 |
| 241 | "vshr.u16 d18, d18, #8 \n\t" // resultb >>= 8 |
| 242 | // put rgb into 565 |
| 243 | "vsli.i16 d18, d17, #5 \n\t" // shift greens into blues |
| 244 | "vsli.i16 d18, d16, #11 \n\t" // shift reds into greens and blues |
| 245 | "vst1.i16 {d18}, [%[colors]]! \n\t" // store result |
| 246 | "bgt 1b \n\t" // if counter > 0, loop |
| 247 | "2: \n\t" // exit |
| 248 | : [xy] "+r" (xy), [count4] "+r" (count4), [colors] "+r" (colors) |
| 249 | : [row0] "r" (row0), [rowgap] "r" (rowgap), [c7ffe] "r" (c7ffe) |
| 250 | : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" |
| 251 | ); |
| 252 | |
| 253 | while(count != 0) |
| 254 | { |
| 255 | uint32_t XX = *xy++; // x0:14 | subX:4 | x1:14 |
| 256 | unsigned x0 = XX >> 14; |
| 257 | unsigned x1 = XX & 0x3FFF; |
| 258 | unsigned subX = x0 & 0xF; |
| 259 | x0 >>= 4; |
| 260 | |
| 261 | uint32_t a00 = SkExpand_rgb_16(row0[x0]); |
| 262 | uint32_t a01 = SkExpand_rgb_16(row0[x1]); |
| 263 | uint32_t a10 = SkExpand_rgb_16(row1[x0]); |
| 264 | uint32_t a11 = SkExpand_rgb_16(row1[x1]); |
| 265 | |
| 266 | int xy = subX * subY >> 3; |
| 267 | uint32_t c = a00 * (32 - 2*subY - 2*subX + xy) + |
| 268 | a01 * (2*subX - xy) + |
| 269 | a10 * (2*subY - xy) + |
| 270 | a11 * xy; |
| 271 | |
| 272 | *colors++ = SkCompact_rgb_16(c>>5); |
| 273 | count--; |
| 274 | } |
| 275 | } |
| 276 | #endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 277 | |
| 278 | #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) |
| 279 | void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s, |
| 280 | const uint32_t* SK_RESTRICT xy, |
| 281 | int count, uint16_t* SK_RESTRICT colors) { |
| 282 | SkASSERT(count > 0 && colors != NULL); |
| 283 | SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); |
| 284 | SkASSERT(s.fDoFilter == false); |
| 285 | |
| 286 | const uint16_t* SK_RESTRICT table = s.fBitmap->getColorTable()->lock16BitCache(); |
| 287 | const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels(); |
| 288 | |
| 289 | // buffer is y32, x16, x16, x16, x16, x16 |
| 290 | // bump srcAddr to the proper row, since we're told Y never changes |
| 291 | SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height()); |
| 292 | srcAddr = (const uint8_t*)((const char*)srcAddr + |
| 293 | xy[0] * s.fBitmap->rowBytes()); |
| 294 | |
| 295 | uint8_t src; |
| 296 | |
| 297 | if (1 == s.fBitmap->width()) { |
| 298 | src = srcAddr[0]; |
| 299 | uint16_t dstValue = table[src]; |
| 300 | sk_memset16(colors, dstValue, count); |
| 301 | } else { |
| 302 | int i; |
| 303 | int count8 = count >> 3; |
| 304 | const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1); |
| 305 | |
| 306 | asm volatile ( |
| 307 | "cmp %[count8], #0 \n\t" // compare loop counter with 0 |
| 308 | "beq 2f \n\t" // if loop counter == 0, exit |
| 309 | "1: \n\t" |
| 310 | "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 |
| 311 | "subs %[count8], %[count8], #1 \n\t" // decrement loop counter |
| 312 | "uxth r4, r5 \n\t" // extract ptr 0 |
| 313 | "mov r5, r5, lsr #16 \n\t" // extract ptr 1 |
| 314 | "uxth r6, r7 \n\t" // extract ptr 2 |
| 315 | "mov r7, r7, lsr #16 \n\t" // extract ptr 3 |
| 316 | "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image |
| 317 | "uxth r8, r9 \n\t" // extract ptr 4 |
| 318 | "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image |
| 319 | "mov r9, r9, lsr #16 \n\t" // extract ptr 5 |
| 320 | "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image |
| 321 | "uxth r10, r11 \n\t" // extract ptr 6 |
| 322 | "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image |
| 323 | "mov r11, r11, lsr #16 \n\t" // extract ptr 7 |
| 324 | "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image |
| 325 | "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup |
| 326 | "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image |
| 327 | "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup |
| 328 | "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image |
| 329 | "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup |
| 330 | "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image |
| 331 | "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup |
| 332 | "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap |
| 333 | "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup |
| 334 | "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap |
| 335 | "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup |
| 336 | "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap |
| 337 | "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup |
| 338 | "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap |
| 339 | "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup |
| 340 | "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap |
| 341 | "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap |
| 342 | "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap |
| 343 | "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap |
| 344 | "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1 |
| 345 | "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3 |
| 346 | "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5 |
| 347 | "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7 |
| 348 | "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels |
| 349 | "bgt 1b \n\t" // loop if counter > 0 |
| 350 | "2: \n\t" |
| 351 | : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors) |
| 352 | : [table] "r" (table), [srcAddr] "r" (srcAddr) |
| 353 | : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" |
| 354 | ); |
| 355 | |
| 356 | for (i = (count & 7); i > 0; --i) { |
| 357 | src = srcAddr[*xx++]; *colors++ = table[src]; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | s.fBitmap->getColorTable()->unlock16BitCache(); |
| 362 | } |
| 363 | |
| 364 | void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s, |
| 365 | const uint32_t* SK_RESTRICT xy, |
| 366 | int count, SkPMColor* SK_RESTRICT colors) { |
| 367 | SkASSERT(count > 0 && colors != NULL); |
| 368 | SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); |
| 369 | SkASSERT(s.fDoFilter == false); |
| 370 | |
| 371 | const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); |
| 372 | const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels(); |
| 373 | |
| 374 | // buffer is y32, x16, x16, x16, x16, x16 |
| 375 | // bump srcAddr to the proper row, since we're told Y never changes |
| 376 | SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height()); |
| 377 | srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes()); |
| 378 | |
| 379 | if (1 == s.fBitmap->width()) { |
| 380 | uint8_t src = srcAddr[0]; |
| 381 | SkPMColor dstValue = table[src]; |
| 382 | sk_memset32(colors, dstValue, count); |
| 383 | } else { |
| 384 | const uint16_t* xx = (const uint16_t*)(xy + 1); |
| 385 | |
| 386 | asm volatile ( |
| 387 | "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags |
| 388 | "blt 2f \n\t" // if count < 0, branch to singles |
| 389 | "1: \n\t" // eights loop |
| 390 | "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7 |
| 391 | "uxth r4, r5 \n\t" // extract ptr 0 |
| 392 | "mov r5, r5, lsr #16 \n\t" // extract ptr 1 |
| 393 | "uxth r6, r7 \n\t" // extract ptr 2 |
| 394 | "mov r7, r7, lsr #16 \n\t" // extract ptr 3 |
| 395 | "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image |
| 396 | "uxth r8, r9 \n\t" // extract ptr 4 |
| 397 | "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image |
| 398 | "mov r9, r9, lsr #16 \n\t" // extract ptr 5 |
| 399 | "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image |
| 400 | "uxth r10, r11 \n\t" // extract ptr 6 |
| 401 | "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image |
| 402 | "mov r11, r11, lsr #16 \n\t" // extract ptr 7 |
| 403 | "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image |
| 404 | "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image |
| 405 | "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image |
| 406 | "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image |
| 407 | "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap |
| 408 | "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap |
| 409 | "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap |
| 410 | "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap |
| 411 | "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap |
| 412 | "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap |
| 413 | "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap |
| 414 | "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap |
| 415 | "subs %[count], %[count], #8 \n\t" // decrement loop counter |
| 416 | "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels |
| 417 | "bge 1b \n\t" // loop if counter >= 0 |
| 418 | "2: \n\t" |
| 419 | "adds %[count], %[count], #8 \n\t" // fix up counter, set flags |
| 420 | "beq 4f \n\t" // if count == 0, branch to exit |
| 421 | "3: \n\t" // singles loop |
| 422 | "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr |
| 423 | "subs %[count], %[count], #1 \n\t" // decrement loop counter |
| 424 | "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image |
| 425 | "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap |
| 426 | "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr |
| 427 | "bne 3b \n\t" // loop if counter != 0 |
| 428 | "4: \n\t" // exit |
| 429 | : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors) |
| 430 | : [table] "r" (table), [srcAddr] "r" (srcAddr) |
| 431 | : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" |
| 432 | ); |
| 433 | } |
| 434 | |
| 435 | s.fBitmap->getColorTable()->unlockColors(false); |
| 436 | } |
| 437 | #endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) |
| 438 | |
| 439 | #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 440 | static inline void Filter_32_direct(unsigned x, unsigned y, |
| 441 | SkPMColor a00, SkPMColor a01, |
| 442 | SkPMColor a10, SkPMColor a11, |
| 443 | SkPMColor *dst) { |
| 444 | asm volatile( |
| 445 | "vdup.8 d0, %[y] \n\t" // duplicate y into d0 |
| 446 | "vmov.u8 d16, #16 \n\t" // set up constant in d16 |
| 447 | "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y |
| 448 | |
| 449 | "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4 |
| 450 | "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5 |
| 451 | "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01 |
| 452 | "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11 |
| 453 | |
| 454 | "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y) |
| 455 | "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y |
| 456 | |
| 457 | "vdup.16 d5, %[x] \n\t" // duplicate x into d5 |
| 458 | "vmov.u16 d16, #16 \n\t" // set up constant in d16 |
| 459 | "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x |
| 460 | |
| 461 | "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x |
| 462 | "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x |
| 463 | "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x) |
| 464 | "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x) |
| 465 | "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 |
| 466 | "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result |
| 467 | : |
| 468 | : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst) |
| 469 | : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16" |
| 470 | ); |
| 471 | } |
| 472 | |
| 473 | static inline void Filter_32_direct_alpha(unsigned x, unsigned y, |
| 474 | SkPMColor a00, SkPMColor a01, |
| 475 | SkPMColor a10, SkPMColor a11, |
| 476 | SkPMColor *dst, uint16_t scale) { |
| 477 | asm volatile( |
| 478 | "vdup.8 d0, %[y] \n\t" // duplicate y into d0 |
| 479 | "vmov.u8 d16, #16 \n\t" // set up constant in d16 |
| 480 | "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y |
| 481 | |
| 482 | "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4 |
| 483 | "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5 |
| 484 | "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01 |
| 485 | "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11 |
| 486 | |
| 487 | "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y) |
| 488 | "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y |
| 489 | |
| 490 | "vdup.16 d5, %[x] \n\t" // duplicate x into d5 |
| 491 | "vmov.u16 d16, #16 \n\t" // set up constant in d16 |
| 492 | "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x |
| 493 | |
| 494 | "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x |
| 495 | "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x |
| 496 | "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x) |
| 497 | "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x) |
| 498 | "vdup.16 d3, %[scale] \n\t" // duplicate scale into d3 |
| 499 | "vshr.u16 d4, d4, #8 \n\t" // shift down result by 8 |
| 500 | "vmul.i16 d4, d4, d3 \n\t" // multiply result by scale |
| 501 | "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 |
| 502 | "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result |
| 503 | : |
| 504 | : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale) |
| 505 | : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16" |
| 506 | ); |
| 507 | } |
| 508 | |
| 509 | void SI8_opaque_D32_filter_DX_arm(const SkBitmapProcState& s, |
| 510 | const uint32_t* SK_RESTRICT xy, |
| 511 | int count, SkPMColor* SK_RESTRICT colors) { |
| 512 | SkASSERT(count > 0 && colors != NULL); |
| 513 | SkASSERT(s.fDoFilter); |
| 514 | |
| 515 | const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); |
| 516 | const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); |
| 517 | unsigned rb = s.fBitmap->rowBytes(); |
| 518 | unsigned subY; |
| 519 | const uint8_t* SK_RESTRICT row0; |
| 520 | const uint8_t* SK_RESTRICT row1; |
| 521 | |
| 522 | // setup row ptrs and update proc_table |
| 523 | { |
| 524 | uint32_t XY = *xy++; |
| 525 | unsigned y0 = XY >> 14; |
| 526 | row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb); |
| 527 | row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb); |
| 528 | subY = y0 & 0xF; |
| 529 | } |
| 530 | |
| 531 | do { |
| 532 | uint32_t XX = *xy++; // x0:14 | 4 | x1:14 |
| 533 | unsigned x0 = XX >> 14; |
| 534 | unsigned x1 = XX & 0x3FFF; |
| 535 | unsigned subX = x0 & 0xF; |
| 536 | x0 >>= 4; |
| 537 | |
| 538 | Filter_32_direct(subX, subY, table[row0[x0]], |
| 539 | table[row0[x1]], |
| 540 | table[row1[x0]], |
| 541 | table[row1[x1]], colors); |
| 542 | colors++; |
| 543 | } while (--count != 0); |
| 544 | |
| 545 | s.fBitmap->getColorTable()->unlockColors(false); |
| 546 | } |
| 547 | |
| 548 | void SI8_opaque_D32_filter_DXDY_arm(const SkBitmapProcState& s, |
| 549 | const uint32_t* SK_RESTRICT xy, |
| 550 | int count, SkPMColor* SK_RESTRICT colors) { |
| 551 | SkASSERT(count > 0 && colors != NULL); |
| 552 | SkASSERT(s.fDoFilter); |
| 553 | |
| 554 | const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); |
| 555 | const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); |
| 556 | int rb = s.fBitmap->rowBytes(); |
| 557 | |
| 558 | do { |
| 559 | uint32_t data = *xy++; |
| 560 | unsigned y0 = data >> 14; |
| 561 | unsigned y1 = data & 0x3FFF; |
| 562 | unsigned subY = y0 & 0xF; |
| 563 | y0 >>= 4; |
| 564 | |
| 565 | data = *xy++; |
| 566 | unsigned x0 = data >> 14; |
| 567 | unsigned x1 = data & 0x3FFF; |
| 568 | unsigned subX = x0 & 0xF; |
| 569 | x0 >>= 4; |
| 570 | |
| 571 | const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb); |
| 572 | const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb); |
| 573 | |
| 574 | Filter_32_direct(subX, subY, table[row0[x0]], |
| 575 | table[row0[x1]], |
| 576 | table[row1[x0]], |
| 577 | table[row1[x1]], colors); |
| 578 | colors++; |
| 579 | } while (--count != 0); |
| 580 | |
| 581 | s.fBitmap->getColorTable()->unlockColors(false); |
| 582 | } |
| 583 | |
| 584 | void SI8_alpha_D32_filter_DX_arm(const SkBitmapProcState& s, |
| 585 | const uint32_t* SK_RESTRICT xy, |
| 586 | int count, SkPMColor* SK_RESTRICT colors) { |
| 587 | SkASSERT(count > 0 && colors != NULL); |
| 588 | SkASSERT(s.fDoFilter); |
| 589 | |
| 590 | unsigned scale = s.fAlphaScale; |
| 591 | const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); |
| 592 | const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); |
| 593 | unsigned rb = s.fBitmap->rowBytes(); |
| 594 | unsigned subY; |
| 595 | const uint8_t* SK_RESTRICT row0; |
| 596 | const uint8_t* SK_RESTRICT row1; |
| 597 | |
| 598 | // setup row ptrs and update proc_table |
| 599 | { |
| 600 | uint32_t XY = *xy++; |
| 601 | unsigned y0 = XY >> 14; |
| 602 | row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb); |
| 603 | row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb); |
| 604 | subY = y0 & 0xF; |
| 605 | } |
| 606 | |
| 607 | do { |
| 608 | uint32_t XX = *xy++; // x0:14 | 4 | x1:14 |
| 609 | unsigned x0 = XX >> 14; |
| 610 | unsigned x1 = XX & 0x3FFF; |
| 611 | unsigned subX = x0 & 0xF; |
| 612 | x0 >>= 4; |
| 613 | |
| 614 | Filter_32_direct_alpha(subX, subY, table[row0[x0]], |
| 615 | table[row0[x1]], |
| 616 | table[row1[x0]], |
| 617 | table[row1[x1]], colors, scale); |
| 618 | colors++; |
| 619 | } while (--count != 0); |
| 620 | |
| 621 | s.fBitmap->getColorTable()->unlockColors(false); |
| 622 | } |
| 623 | |
| 624 | void SI8_alpha_D32_filter_DXDY_arm(const SkBitmapProcState& s, |
| 625 | const uint32_t* SK_RESTRICT xy, |
| 626 | int count, SkPMColor* SK_RESTRICT colors) { |
| 627 | SkASSERT(count > 0 && colors != NULL); |
| 628 | SkASSERT(s.fDoFilter); |
| 629 | |
| 630 | unsigned scale = s.fAlphaScale; |
| 631 | const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors(); |
| 632 | const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels(); |
| 633 | int rb = s.fBitmap->rowBytes(); |
| 634 | |
| 635 | do { |
| 636 | uint32_t data = *xy++; |
| 637 | unsigned y0 = data >> 14; |
| 638 | unsigned y1 = data & 0x3FFF; |
| 639 | unsigned subY = y0 & 0xF; |
| 640 | y0 >>= 4; |
| 641 | |
| 642 | data = *xy++; |
| 643 | unsigned x0 = data >> 14; |
| 644 | unsigned x1 = data & 0x3FFF; |
| 645 | unsigned subX = x0 & 0xF; |
| 646 | x0 >>= 4; |
| 647 | |
| 648 | const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb); |
| 649 | const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb); |
| 650 | |
| 651 | Filter_32_direct_alpha(subX, subY, table[row0[x0]], |
| 652 | table[row0[x1]], |
| 653 | table[row1[x0]], |
| 654 | table[row1[x1]], colors, scale); |
| 655 | colors++; |
| 656 | } while (--count != 0); |
| 657 | |
| 658 | s.fBitmap->getColorTable()->unlockColors(false); |
| 659 | } |
| 660 | #endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 661 | |
| 662 | /////////////////////////////////////////////////////////////////////////////// |
| 663 | |
| 664 | void SkBitmapProcState::platformProcs() { |
| 665 | bool doFilter = fDoFilter; |
| 666 | bool isOpaque = 256 == fAlphaScale; |
| 667 | bool justDx = false; |
| 668 | |
| 669 | if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) { |
| 670 | justDx = true; |
| 671 | } |
| 672 | |
| 673 | switch (fBitmap->config()) { |
| 674 | case SkBitmap::kRGB_565_Config: |
| 675 | #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 676 | if (justDx && doFilter) { |
| 677 | fSampleProc16 = S16_D16_filter_DX_arm; |
| 678 | } |
| 679 | #endif |
| 680 | #if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN) |
| 681 | if (justDx && !doFilter) { |
| 682 | fSampleProc16 = S16_D16_nofilter_DX_arm; |
| 683 | } |
| 684 | #endif |
| 685 | break; // k565 |
| 686 | case SkBitmap::kIndex8_Config: |
| 687 | #if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) |
| 688 | if (justDx && !doFilter) { |
| 689 | fSampleProc16 = SI8_D16_nofilter_DX_arm; |
| 690 | if (isOpaque) { |
| 691 | fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm; |
| 692 | } |
| 693 | } |
| 694 | #endif |
| 695 | #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) |
| 696 | if (doFilter) { |
| 697 | if (isOpaque) { |
| 698 | if (justDx) { |
| 699 | fSampleProc32 = SI8_opaque_D32_filter_DX_arm; |
| 700 | } else { |
| 701 | fSampleProc32 = SI8_opaque_D32_filter_DXDY_arm; |
| 702 | } |
| 703 | } else { // !isOpaque |
| 704 | if (justDx) { |
| 705 | fSampleProc32 = SI8_alpha_D32_filter_DX_arm; |
| 706 | } else { |
| 707 | fSampleProc32 = SI8_alpha_D32_filter_DXDY_arm; |
| 708 | } |
| 709 | } |
| 710 | } |
| 711 | #endif |
| 712 | break; // kIndex8 |
| 713 | default: |
| 714 | break; |
| 715 | } |
| 716 | } |
| 717 | |