blob: 1bd4d91949df7d574cea9d39b55c686d087a9e80 [file] [log] [blame]
reed@android.coma0bd7f42009-08-03 17:22:46 +00001/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <machine/cpu-features.h>
#include <stdint.h>
#include "SkBitmapProcState.h"
#include "SkColorPriv.h"
#include "SkUtils.h"
reed@android.coma0bd7f42009-08-03 17:22:46 +000021
22#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
23void S16_D16_nofilter_DX_arm(const SkBitmapProcState& s,
24 const uint32_t* SK_RESTRICT xy,
25 int count, uint16_t* SK_RESTRICT colors) {
26 SkASSERT(count > 0 && colors != NULL);
27 SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
28 SkASSERT(s.fDoFilter == false);
29
30 const uint16_t* SK_RESTRICT srcAddr = (const uint16_t*)s.fBitmap->getPixels();
31
32 // buffer is y32, x16, x16, x16, x16, x16
33 // bump srcAddr to the proper row, since we're told Y never changes
34 SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
35 srcAddr = (const uint16_t*)((const char*)srcAddr +
36 xy[0] * s.fBitmap->rowBytes());
37
38 uint16_t src;
39
40 if (1 == s.fBitmap->width()) {
41 src = srcAddr[0];
42 uint16_t dstValue = src;
43 sk_memset16(colors, dstValue, count);
44 } else {
45 int i;
46 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
47
48 if((count >> 2) > 0) {
49 asm volatile (
50 "mov r8, %[count], lsr #2 \n\t" // shift down count so we iterate in fours
51 "1: \n\t"
52 "subs r8, r8, #1 \n\t" // decrement loop counter
53 "ldrh r4, [%[xx]], #2 \n\t" // load xx value, update ptr
54 "ldrh r5, [%[xx]], #2 \n\t" // load xx value, update ptr
55 "ldrh r6, [%[xx]], #2 \n\t" // load xx value, update ptr
56 "add r4, r4, r4 \n\t" // double offset for half word addressing
57 "ldrh r7, [%[xx]], #2 \n\t" // load xx value, update ptr
58 "add r5, r5, r5 \n\t" // double offset for half word addressing
59 "ldrh r4, [%[srcAddr], r4] \n\t" // load value from srcAddr[*xx]
60 "add r6, r6, r6 \n\t" // double offset for half word addressing
61 "ldrh r5, [%[srcAddr], r5] \n\t" // load value from srcAddr[*xx]
62 "add r7, r7, r7 \n\t" // double offset for half word addressing
63 "ldrh r6, [%[srcAddr], r6] \n\t" // load value from srcAddr[*xx]
64 "ldrh r7, [%[srcAddr], r7] \n\t" // load value from srcAddr[*xx]
65 "strh r4, [%[colors]], #2 \n\t" // store value to colors, update ptr
66 "strh r5, [%[colors]], #2 \n\t" // store value to colors, update ptr
67 "strh r6, [%[colors]], #2 \n\t" // store value to colors, update ptr
68 "strh r7, [%[colors]], #2 \n\t" // store value to colors, update ptr
69 "bgt 1b \n\t" // branch if loop counter > 0
70 : [count] "+r" (count), [xx] "+r" (xx), [srcAddr] "+r" (srcAddr), [colors] "+r" (colors)
71 :
72 : "cc", "memory", "r4", "r5", "r6", "r7", "r8"
73 );
74 }
75 for (i = (count & 3); i > 0; --i) {
76 SkASSERT(*xx < (unsigned)s.fBitmap->width());
77 src = srcAddr[*xx++]; *colors++ = src;
78 }
79 }
80}
81#endif //__ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
82
83#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
84void S16_D16_filter_DX_arm(const SkBitmapProcState& s,
85 const uint32_t* SK_RESTRICT xy,
86 int count, uint16_t* SK_RESTRICT colors)
87{
88 SkASSERT(count > 0 && colors != NULL);
89 SkASSERT(s.fDoFilter);
reed@android.com6123e472009-08-04 01:52:27 +000090
reed@android.coma0bd7f42009-08-03 17:22:46 +000091 const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
92 unsigned rb = s.fBitmap->rowBytes();
93 unsigned subY;
94 const uint16_t* SK_RESTRICT row0;
95 const uint16_t* SK_RESTRICT row1;
96 unsigned int rowgap;
97 const uint32_t c7ffe = 0x7ffe;
reed@android.com6123e472009-08-04 01:52:27 +000098
reed@android.coma0bd7f42009-08-03 17:22:46 +000099 // setup row ptrs and update proc_table
100 {
101 uint32_t XY = *xy++;
102 unsigned y0 = XY >> 14;
103 row0 = (const uint16_t*)(srcAddr + (y0 >> 4) * rb);
104 row1 = (const uint16_t*)(srcAddr + (XY & 0x3FFF) * rb);
105 rowgap = (unsigned int)row1 - (unsigned int)row0;
106 subY = y0 & 0xF;
107 }
reed@android.com6123e472009-08-04 01:52:27 +0000108
reed@android.coma0bd7f42009-08-03 17:22:46 +0000109 unsigned int count4 = ((count >> 2) << 4) | subY;
110 count &= 3;
reed@android.com6123e472009-08-04 01:52:27 +0000111
reed@android.coma0bd7f42009-08-03 17:22:46 +0000112 asm volatile (
113 "and r4, %[count4], #0xF \n\t" // mask off subY
114 "vmov.u16 d2[0], r4 \n\t" // move subY to Neon
115 "rsb r4, r4, #16 \n\t" // r4 = 16-subY
116 "vmov.u16 d2[1], r4 \n\t" // move 16-subY to Neon
117 "movs %[count4], %[count4], lsr #4 \n\t" // shift count down, lose subY
118 "vmov.u16 d3, #16 \n\t" // create constant
119 "vmov.u16 q2, #31 \n\t" // set up blue mask
120 "beq 2f \n\t" // if count4 == 0, exit
121
122 "1: \n\t"
123 "ldmia %[xy]!, {r4, r5, r6, r7} \n\t" // load four xy values
124 // xy = [ x0:14 | subX:4 | x1:14 ]
125 // extract subX for iter 0-3
126 "vmov d0, r4, r5 \n\t" // move xy to Neon, iter 0-1
127 "vmov d1, r6, r7 \n\t" // move xy to Neon, iter 2-3
128
129 // Load 16 pixels for four filter iterations from memory.
130 // Because the source pixels are potentially scattered, each lane
131 // of each vector is loaded separately. Also, the X sub pixel
132 // offset is extracted.
133
134 // iter 0
135 "mov r8, r4, lsr #18 \n\t" // extract x0
136 "and r4, %[c7ffe], r4, lsl #1 \n\t" // extract x1 and make byte offset
137 "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
138 "add r4, %[row0], r4 \n\t" // calculate address of row0[x1]
139 "vld1.u16 {d16[0]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
140 "vld1.u16 {d17[0]}, [r4], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
141 "vld1.u16 {d18[0]}, [r8] \n\t" // load row1[x0]
142 "vld1.u16 {d19[0]}, [r4] \n\t" // load row1[x1]
143
144 // iter 1
145 "mov r8, r5, lsr #18 \n\t" // extract x0
146 "and r5, %[c7ffe], r5, lsl #1 \n\t" // extract x1 and make byte offset
147 "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
148 "add r5, %[row0], r5 \n\t" // calculate address of row0[x1]
149 "vld1.u16 {d16[1]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
150 "vld1.u16 {d17[1]}, [r5], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
151 "vld1.u16 {d18[1]}, [r8] \n\t" // load row1[x0]
152 "vld1.u16 {d19[1]}, [r5] \n\t" // load row1[x1]
153
154 "vshrn.u32 d0, q0, #2 \n\t" // shift right subX by 2 and narrow
155 // iter 2
156 "mov r8, r6, lsr #18 \n\t" // extract x0
157 "and r6, %[c7ffe], r6, lsl #1 \n\t" // extract x1 and make byte offset
158 "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
159 "add r6, %[row0], r6 \n\t" // calculate address of row0[x1]
160 "vld1.u16 {d16[2]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
161 "vld1.u16 {d17[2]}, [r6], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
162 "vld1.u16 {d18[2]}, [r8] \n\t" // load row1[x0]
163 "vld1.u16 {d19[2]}, [r6] \n\t" // load row1[x1]
164
165 "vshr.u16 d0, d0, #12 \n\t" // shift right subX to bottom 4 bits
166 // iter 3
167 "mov r8, r7, lsr #18 \n\t" // extract x0
168 "and r7, %[c7ffe], r7, lsl #1 \n\t" // extract x1 and make byte offset
169 "add r8, %[row0], r8, lsl #1 \n\t" // calculate address of row0[x0]
170 "add r7, %[row0], r7 \n\t" // calculate address of row0[x1]
171 "vld1.u16 {d16[3]}, [r8], %[rowgap] \n\t" // load row0[x0] and move ptr to row1
172 "vld1.u16 {d17[3]}, [r7], %[rowgap] \n\t" // load row0[x1] and move ptr to row1
173 "vld1.u16 {d18[3]}, [r8] \n\t" // load row1[x0]
174 "vld1.u16 {d19[3]}, [r7] \n\t" // load row1[x1]
175
176 // Registers d16-d19 now contain pixels a00-a11 for 4 iterations:
177 // d16 = [ a00_3 | a00_2 | a00_1 | a00_0 ]
178 // d17 = [ a01_3 | a01_2 | a01_1 | a01_0 ]
179 // d18 = [ a10_3 | a10_2 | a10_1 | a10_0 ]
180 // d19 = [ a11_3 | a11_2 | a11_1 | a11_0 ]
181 //
182 // Extract RGB channels from each 565 pixel.
183
184 "vshl.i16 q11, q8, #5 \n\t" // shift greens to top of each lane
185 "vand q12, q8, q2 \n\t" // mask blues
186 "vshr.u16 q10, q8, #11 \n\t" // shift reds to bottom of each lane
187 "vshr.u16 q11, q11, #10 \n\t" // shift greens to bottom of each lane
188 "vshl.i16 q14, q9, #5 \n\t" // shift greens to top of each lane
189 "vand q15, q9, q2 \n\t" // mask blues
190 "vshr.u16 q13, q9, #11 \n\t" // shift reds to bottom of each lane
191 "vshr.u16 q14, q14, #10 \n\t" // shift greens to bottom of each lane
192
193 // There are now six Q regs, containing
194 // q10 = [ a01r3 | a01r2 | a01r1 | a01r0 | a00r3 | a00r2 | a00r1 | a00r0 ]
195 // q11 = [ a01g3 | a01g2 | a01g1 | a01g0 | a00g3 | a00g2 | a00g1 | a00g0 ]
196 // q12 = [ a01b3 | a01b2 | a01b1 | a01b0 | a00b3 | a00b2 | a00b1 | a00b0 ]
197 // q13 = [ a11r3 | a11r2 | a11r1 | a11r0 | a01r3 | a01r2 | a01r1 | a01r0 ]
198 // q14 = [ a11g3 | a11g2 | a11g1 | a11g0 | a01g3 | a01g2 | a01g1 | a01g0 ]
199 // q15 = [ a11b3 | a11b2 | a11b1 | a11b0 | a01b3 | a01b2 | a01b1 | a01b0 ]
200 // where aXXyZ: XX = pixel position, y = colour channel, Z = iteration
201 // d0 = subX, d1 = 16-subX
202 // d2[0] = subY, d2[1] = 16-subY
203 // d3 = 16, q2(d4d5) = 31
204
205 // The filter:
206 //
207 // | |
208 // ---- a00 ---- a01 ----> * (16-y)
209 // | |
210 // -----a10 ---- a11 ----> * y
211 // | |
212 // V V
213 // * (16-x) * x
214 //
215 // result = (a00.(16-y).(16-x) + a01.(16-y).x + a10.(16-x).y + a11.x.y) >> 8
216 //
217
218 "vsub.u16 d1, d3, d0 \n\t" // calculate 16-subX
219 // multiply top pixel pair by (16-y)
220 "vmul.i16 q10, q10, d2[1] \n\t" // top reds multiplied by (16-y)
221 "vmul.i16 q11, q11, d2[1] \n\t" // top greens multiplied by (16-y)
222 "vmul.i16 q12, q12, d2[1] \n\t" // top blues multiplied by (16-y)
223 // multiply bottom pixel pair by y
224 "vmul.i16 q13, q13, d2[0] \n\t" // bottom reds multiplied by y
225 "vmul.i16 q14, q14, d2[0] \n\t" // bottom greens multiplied by y
226 "vmul.i16 q15, q15, d2[0] \n\t" // bottom blues multiplied by y
227 // mul/acc left pixels by (16-x)
228 "vmul.i16 d16, d20, d1 \n\t" // resultr = a00r * (16-x)
229 "vmul.i16 d17, d22, d1 \n\t" // resultg = a00g * (16-x)
230 "vmul.i16 d18, d24, d1 \n\t" // resultb = a00b * (16-x)
231 "vmla.i16 d16, d26, d1 \n\t" // resultr += a00r * (16-x)
232 "vmla.i16 d17, d28, d1 \n\t" // resultg += a00g * (16-x)
233 "vmla.i16 d18, d30, d1 \n\t" // resultb += a00b * (16-x)
234 // mul/acc right pixels by x
235 "vmla.i16 d16, d21, d0 \n\t" // resultr += a01r * x
236 "vmla.i16 d17, d23, d0 \n\t" // resultg += a01g * x
237 "vmla.i16 d18, d25, d0 \n\t" // resultb += a01b * x
238 "vmla.i16 d16, d27, d0 \n\t" // resultr += a11r * x
239 "vmla.i16 d17, d29, d0 \n\t" // resultg += a11g * x
240 "vmla.i16 d18, d31, d0 \n\t" // resultb += a11b * x
241 "subs %[count4], %[count4], #1 \n\t" // decrement counter
242 // shift results down 8 bits
243 "vshr.u16 q8, q8, #8 \n\t" // resultr >>= 8, resultg >>=8
244 "vshr.u16 d18, d18, #8 \n\t" // resultb >>= 8
245 // put rgb into 565
246 "vsli.i16 d18, d17, #5 \n\t" // shift greens into blues
247 "vsli.i16 d18, d16, #11 \n\t" // shift reds into greens and blues
248 "vst1.i16 {d18}, [%[colors]]! \n\t" // store result
249 "bgt 1b \n\t" // if counter > 0, loop
250 "2: \n\t" // exit
251 : [xy] "+r" (xy), [count4] "+r" (count4), [colors] "+r" (colors)
252 : [row0] "r" (row0), [rowgap] "r" (rowgap), [c7ffe] "r" (c7ffe)
253 : "cc", "memory", "r4", "r5", "r6", "r7", "r8", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
254 );
255
reed@android.com6123e472009-08-04 01:52:27 +0000256 while (count != 0) {
reed@android.coma0bd7f42009-08-03 17:22:46 +0000257 uint32_t XX = *xy++; // x0:14 | subX:4 | x1:14
258 unsigned x0 = XX >> 14;
259 unsigned x1 = XX & 0x3FFF;
260 unsigned subX = x0 & 0xF;
261 x0 >>= 4;
reed@android.com6123e472009-08-04 01:52:27 +0000262
reed@android.coma0bd7f42009-08-03 17:22:46 +0000263 uint32_t a00 = SkExpand_rgb_16(row0[x0]);
264 uint32_t a01 = SkExpand_rgb_16(row0[x1]);
265 uint32_t a10 = SkExpand_rgb_16(row1[x0]);
266 uint32_t a11 = SkExpand_rgb_16(row1[x1]);
reed@android.com6123e472009-08-04 01:52:27 +0000267
reed@android.coma0bd7f42009-08-03 17:22:46 +0000268 int xy = subX * subY >> 3;
269 uint32_t c = a00 * (32 - 2*subY - 2*subX + xy) +
270 a01 * (2*subX - xy) +
271 a10 * (2*subY - xy) +
272 a11 * xy;
reed@android.com6123e472009-08-04 01:52:27 +0000273
reed@android.coma0bd7f42009-08-03 17:22:46 +0000274 *colors++ = SkCompact_rgb_16(c>>5);
275 count--;
276 }
277}
278#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
279
280#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
281void SI8_D16_nofilter_DX_arm(const SkBitmapProcState& s,
282 const uint32_t* SK_RESTRICT xy,
283 int count, uint16_t* SK_RESTRICT colors) {
284 SkASSERT(count > 0 && colors != NULL);
285 SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
286 SkASSERT(s.fDoFilter == false);
287
288 const uint16_t* SK_RESTRICT table = s.fBitmap->getColorTable()->lock16BitCache();
289 const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels();
290
291 // buffer is y32, x16, x16, x16, x16, x16
292 // bump srcAddr to the proper row, since we're told Y never changes
293 SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
294 srcAddr = (const uint8_t*)((const char*)srcAddr +
295 xy[0] * s.fBitmap->rowBytes());
296
297 uint8_t src;
298
299 if (1 == s.fBitmap->width()) {
300 src = srcAddr[0];
301 uint16_t dstValue = table[src];
302 sk_memset16(colors, dstValue, count);
303 } else {
304 int i;
305 int count8 = count >> 3;
306 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy + 1);
307
308 asm volatile (
309 "cmp %[count8], #0 \n\t" // compare loop counter with 0
310 "beq 2f \n\t" // if loop counter == 0, exit
311 "1: \n\t"
312 "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
313 "subs %[count8], %[count8], #1 \n\t" // decrement loop counter
314 "uxth r4, r5 \n\t" // extract ptr 0
315 "mov r5, r5, lsr #16 \n\t" // extract ptr 1
316 "uxth r6, r7 \n\t" // extract ptr 2
317 "mov r7, r7, lsr #16 \n\t" // extract ptr 3
318 "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
319 "uxth r8, r9 \n\t" // extract ptr 4
320 "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
321 "mov r9, r9, lsr #16 \n\t" // extract ptr 5
322 "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
323 "uxth r10, r11 \n\t" // extract ptr 6
324 "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
325 "mov r11, r11, lsr #16 \n\t" // extract ptr 7
326 "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
327 "add r4, r4, r4 \n\t" // double pixel 0 for RGB565 lookup
328 "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
329 "add r5, r5, r5 \n\t" // double pixel 1 for RGB565 lookup
330 "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
331 "add r6, r6, r6 \n\t" // double pixel 2 for RGB565 lookup
332 "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
333 "add r7, r7, r7 \n\t" // double pixel 3 for RGB565 lookup
334 "ldrh r4, [%[table], r4] \n\t" // load pixel 0 RGB565 from colmap
335 "add r8, r8, r8 \n\t" // double pixel 4 for RGB565 lookup
336 "ldrh r5, [%[table], r5] \n\t" // load pixel 1 RGB565 from colmap
337 "add r9, r9, r9 \n\t" // double pixel 5 for RGB565 lookup
338 "ldrh r6, [%[table], r6] \n\t" // load pixel 2 RGB565 from colmap
339 "add r10, r10, r10 \n\t" // double pixel 6 for RGB565 lookup
340 "ldrh r7, [%[table], r7] \n\t" // load pixel 3 RGB565 from colmap
341 "add r11, r11, r11 \n\t" // double pixel 7 for RGB565 lookup
342 "ldrh r8, [%[table], r8] \n\t" // load pixel 4 RGB565 from colmap
343 "ldrh r9, [%[table], r9] \n\t" // load pixel 5 RGB565 from colmap
344 "ldrh r10, [%[table], r10] \n\t" // load pixel 6 RGB565 from colmap
345 "ldrh r11, [%[table], r11] \n\t" // load pixel 7 RGB565 from colmap
346 "pkhbt r5, r4, r5, lsl #16 \n\t" // pack pixels 0 and 1
347 "pkhbt r6, r6, r7, lsl #16 \n\t" // pack pixels 2 and 3
348 "pkhbt r8, r8, r9, lsl #16 \n\t" // pack pixels 4 and 5
349 "pkhbt r10, r10, r11, lsl #16 \n\t" // pack pixels 6 and 7
350 "stmia %[colors]!, {r5, r6, r8, r10} \n\t" // store last 8 pixels
351 "bgt 1b \n\t" // loop if counter > 0
352 "2: \n\t"
353 : [xx] "+r" (xx), [count8] "+r" (count8), [colors] "+r" (colors)
354 : [table] "r" (table), [srcAddr] "r" (srcAddr)
355 : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
356 );
357
358 for (i = (count & 7); i > 0; --i) {
359 src = srcAddr[*xx++]; *colors++ = table[src];
360 }
361 }
reed@android.com6123e472009-08-04 01:52:27 +0000362
reed@android.coma0bd7f42009-08-03 17:22:46 +0000363 s.fBitmap->getColorTable()->unlock16BitCache();
364}
365
366void SI8_opaque_D32_nofilter_DX_arm(const SkBitmapProcState& s,
367 const uint32_t* SK_RESTRICT xy,
368 int count, SkPMColor* SK_RESTRICT colors) {
369 SkASSERT(count > 0 && colors != NULL);
370 SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask));
371 SkASSERT(s.fDoFilter == false);
reed@android.com6123e472009-08-04 01:52:27 +0000372
reed@android.coma0bd7f42009-08-03 17:22:46 +0000373 const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
374 const uint8_t* SK_RESTRICT srcAddr = (const uint8_t*)s.fBitmap->getPixels();
reed@android.com6123e472009-08-04 01:52:27 +0000375
reed@android.coma0bd7f42009-08-03 17:22:46 +0000376 // buffer is y32, x16, x16, x16, x16, x16
377 // bump srcAddr to the proper row, since we're told Y never changes
378 SkASSERT((unsigned)xy[0] < (unsigned)s.fBitmap->height());
379 srcAddr = (const uint8_t*)((const char*)srcAddr + xy[0] * s.fBitmap->rowBytes());
reed@android.com6123e472009-08-04 01:52:27 +0000380
reed@android.coma0bd7f42009-08-03 17:22:46 +0000381 if (1 == s.fBitmap->width()) {
382 uint8_t src = srcAddr[0];
383 SkPMColor dstValue = table[src];
384 sk_memset32(colors, dstValue, count);
385 } else {
386 const uint16_t* xx = (const uint16_t*)(xy + 1);
reed@android.com6123e472009-08-04 01:52:27 +0000387
reed@android.coma0bd7f42009-08-03 17:22:46 +0000388 asm volatile (
389 "subs %[count], %[count], #8 \n\t" // decrement count by 8, set flags
390 "blt 2f \n\t" // if count < 0, branch to singles
391 "1: \n\t" // eights loop
392 "ldmia %[xx]!, {r5, r7, r9, r11} \n\t" // load ptrs to pixels 0-7
393 "uxth r4, r5 \n\t" // extract ptr 0
394 "mov r5, r5, lsr #16 \n\t" // extract ptr 1
395 "uxth r6, r7 \n\t" // extract ptr 2
396 "mov r7, r7, lsr #16 \n\t" // extract ptr 3
397 "ldrb r4, [%[srcAddr], r4] \n\t" // load pixel 0 from image
398 "uxth r8, r9 \n\t" // extract ptr 4
399 "ldrb r5, [%[srcAddr], r5] \n\t" // load pixel 1 from image
400 "mov r9, r9, lsr #16 \n\t" // extract ptr 5
401 "ldrb r6, [%[srcAddr], r6] \n\t" // load pixel 2 from image
402 "uxth r10, r11 \n\t" // extract ptr 6
403 "ldrb r7, [%[srcAddr], r7] \n\t" // load pixel 3 from image
404 "mov r11, r11, lsr #16 \n\t" // extract ptr 7
405 "ldrb r8, [%[srcAddr], r8] \n\t" // load pixel 4 from image
406 "ldrb r9, [%[srcAddr], r9] \n\t" // load pixel 5 from image
407 "ldrb r10, [%[srcAddr], r10] \n\t" // load pixel 6 from image
408 "ldrb r11, [%[srcAddr], r11] \n\t" // load pixel 7 from image
409 "ldr r4, [%[table], r4, lsl #2] \n\t" // load pixel 0 SkPMColor from colmap
410 "ldr r5, [%[table], r5, lsl #2] \n\t" // load pixel 1 SkPMColor from colmap
411 "ldr r6, [%[table], r6, lsl #2] \n\t" // load pixel 2 SkPMColor from colmap
412 "ldr r7, [%[table], r7, lsl #2] \n\t" // load pixel 3 SkPMColor from colmap
413 "ldr r8, [%[table], r8, lsl #2] \n\t" // load pixel 4 SkPMColor from colmap
414 "ldr r9, [%[table], r9, lsl #2] \n\t" // load pixel 5 SkPMColor from colmap
415 "ldr r10, [%[table], r10, lsl #2] \n\t" // load pixel 6 SkPMColor from colmap
416 "ldr r11, [%[table], r11, lsl #2] \n\t" // load pixel 7 SkPMColor from colmap
417 "subs %[count], %[count], #8 \n\t" // decrement loop counter
418 "stmia %[colors]!, {r4-r11} \n\t" // store 8 pixels
419 "bge 1b \n\t" // loop if counter >= 0
420 "2: \n\t"
421 "adds %[count], %[count], #8 \n\t" // fix up counter, set flags
422 "beq 4f \n\t" // if count == 0, branch to exit
423 "3: \n\t" // singles loop
424 "ldrh r4, [%[xx]], #2 \n\t" // load pixel ptr
425 "subs %[count], %[count], #1 \n\t" // decrement loop counter
426 "ldrb r5, [%[srcAddr], r4] \n\t" // load pixel from image
427 "ldr r6, [%[table], r5, lsl #2] \n\t" // load SkPMColor from colmap
428 "str r6, [%[colors]], #4 \n\t" // store pixel, update ptr
429 "bne 3b \n\t" // loop if counter != 0
430 "4: \n\t" // exit
431 : [xx] "+r" (xx), [count] "+r" (count), [colors] "+r" (colors)
432 : [table] "r" (table), [srcAddr] "r" (srcAddr)
433 : "memory", "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
434 );
435 }
reed@android.com6123e472009-08-04 01:52:27 +0000436
reed@android.coma0bd7f42009-08-03 17:22:46 +0000437 s.fBitmap->getColorTable()->unlockColors(false);
438}
439#endif //__ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
440
441#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
442static inline void Filter_32_direct(unsigned x, unsigned y,
443 SkPMColor a00, SkPMColor a01,
444 SkPMColor a10, SkPMColor a11,
445 SkPMColor *dst) {
446 asm volatile(
447 "vdup.8 d0, %[y] \n\t" // duplicate y into d0
448 "vmov.u8 d16, #16 \n\t" // set up constant in d16
449 "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
450
451 "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
452 "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
453 "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
454 "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
455
456 "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
457 "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
458
459 "vdup.16 d5, %[x] \n\t" // duplicate x into d5
460 "vmov.u16 d16, #16 \n\t" // set up constant in d16
461 "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
462
463 "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
464 "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
465 "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
466 "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
467 "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
468 "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result
469 :
470 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst)
471 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
472 );
473}
474
475static inline void Filter_32_direct_alpha(unsigned x, unsigned y,
476 SkPMColor a00, SkPMColor a01,
477 SkPMColor a10, SkPMColor a11,
478 SkPMColor *dst, uint16_t scale) {
479 asm volatile(
480 "vdup.8 d0, %[y] \n\t" // duplicate y into d0
481 "vmov.u8 d16, #16 \n\t" // set up constant in d16
482 "vsub.u8 d1, d16, d0 \n\t" // d1 = 16-y
483
484 "vdup.32 d4, %[a00] \n\t" // duplicate a00 into d4
485 "vdup.32 d5, %[a10] \n\t" // duplicate a10 into d5
486 "vmov.32 d4[1], %[a01] \n\t" // set top of d4 to a01
487 "vmov.32 d5[1], %[a11] \n\t" // set top of d5 to a11
488
489 "vmull.u8 q3, d4, d1 \n\t" // q3 = [a01|a00] * (16-y)
490 "vmull.u8 q0, d5, d0 \n\t" // q0 = [a11|a10] * y
491
492 "vdup.16 d5, %[x] \n\t" // duplicate x into d5
493 "vmov.u16 d16, #16 \n\t" // set up constant in d16
494 "vsub.u16 d3, d16, d5 \n\t" // d3 = 16-x
495
496 "vmul.i16 d4, d7, d5 \n\t" // d4 = a01 * x
497 "vmla.i16 d4, d1, d5 \n\t" // d4 += a11 * x
498 "vmla.i16 d4, d6, d3 \n\t" // d4 += a00 * (16-x)
499 "vmla.i16 d4, d0, d3 \n\t" // d4 += a10 * (16-x)
500 "vdup.16 d3, %[scale] \n\t" // duplicate scale into d3
501 "vshr.u16 d4, d4, #8 \n\t" // shift down result by 8
502 "vmul.i16 d4, d4, d3 \n\t" // multiply result by scale
503 "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
504 "vst1.32 {d0[0]}, [%[dst]] \n\t" // store result
505 :
506 : [x] "r" (x), [y] "r" (y), [a00] "r" (a00), [a01] "r" (a01), [a10] "r" (a10), [a11] "r" (a11), [dst] "r" (dst), [scale] "r" (scale)
507 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
508 );
509}
510
511void SI8_opaque_D32_filter_DX_arm(const SkBitmapProcState& s,
512 const uint32_t* SK_RESTRICT xy,
513 int count, SkPMColor* SK_RESTRICT colors) {
514 SkASSERT(count > 0 && colors != NULL);
515 SkASSERT(s.fDoFilter);
516
517 const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
518 const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
519 unsigned rb = s.fBitmap->rowBytes();
520 unsigned subY;
521 const uint8_t* SK_RESTRICT row0;
522 const uint8_t* SK_RESTRICT row1;
523
524 // setup row ptrs and update proc_table
525 {
526 uint32_t XY = *xy++;
527 unsigned y0 = XY >> 14;
528 row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb);
529 row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb);
530 subY = y0 & 0xF;
531 }
532
533 do {
534 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
535 unsigned x0 = XX >> 14;
536 unsigned x1 = XX & 0x3FFF;
537 unsigned subX = x0 & 0xF;
538 x0 >>= 4;
539
540 Filter_32_direct(subX, subY, table[row0[x0]],
541 table[row0[x1]],
542 table[row1[x0]],
543 table[row1[x1]], colors);
544 colors++;
545 } while (--count != 0);
546
547 s.fBitmap->getColorTable()->unlockColors(false);
548}
549
550void SI8_opaque_D32_filter_DXDY_arm(const SkBitmapProcState& s,
551 const uint32_t* SK_RESTRICT xy,
552 int count, SkPMColor* SK_RESTRICT colors) {
553 SkASSERT(count > 0 && colors != NULL);
554 SkASSERT(s.fDoFilter);
555
556 const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
557 const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
558 int rb = s.fBitmap->rowBytes();
559
560 do {
561 uint32_t data = *xy++;
562 unsigned y0 = data >> 14;
563 unsigned y1 = data & 0x3FFF;
564 unsigned subY = y0 & 0xF;
565 y0 >>= 4;
566
567 data = *xy++;
568 unsigned x0 = data >> 14;
569 unsigned x1 = data & 0x3FFF;
570 unsigned subX = x0 & 0xF;
571 x0 >>= 4;
572
573 const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb);
574 const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb);
575
576 Filter_32_direct(subX, subY, table[row0[x0]],
577 table[row0[x1]],
578 table[row1[x0]],
579 table[row1[x1]], colors);
580 colors++;
581 } while (--count != 0);
582
583 s.fBitmap->getColorTable()->unlockColors(false);
584}
585
586void SI8_alpha_D32_filter_DX_arm(const SkBitmapProcState& s,
587 const uint32_t* SK_RESTRICT xy,
588 int count, SkPMColor* SK_RESTRICT colors) {
589 SkASSERT(count > 0 && colors != NULL);
590 SkASSERT(s.fDoFilter);
591
592 unsigned scale = s.fAlphaScale;
593 const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
594 const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
595 unsigned rb = s.fBitmap->rowBytes();
596 unsigned subY;
597 const uint8_t* SK_RESTRICT row0;
598 const uint8_t* SK_RESTRICT row1;
599
600 // setup row ptrs and update proc_table
601 {
602 uint32_t XY = *xy++;
603 unsigned y0 = XY >> 14;
604 row0 = (const uint8_t*)(srcAddr + (y0 >> 4) * rb);
605 row1 = (const uint8_t*)(srcAddr + (XY & 0x3FFF) * rb);
606 subY = y0 & 0xF;
607 }
608
609 do {
610 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
611 unsigned x0 = XX >> 14;
612 unsigned x1 = XX & 0x3FFF;
613 unsigned subX = x0 & 0xF;
614 x0 >>= 4;
615
616 Filter_32_direct_alpha(subX, subY, table[row0[x0]],
617 table[row0[x1]],
618 table[row1[x0]],
619 table[row1[x1]], colors, scale);
620 colors++;
621 } while (--count != 0);
622
623 s.fBitmap->getColorTable()->unlockColors(false);
624}
625
626void SI8_alpha_D32_filter_DXDY_arm(const SkBitmapProcState& s,
627 const uint32_t* SK_RESTRICT xy,
628 int count, SkPMColor* SK_RESTRICT colors) {
629 SkASSERT(count > 0 && colors != NULL);
630 SkASSERT(s.fDoFilter);
631
632 unsigned scale = s.fAlphaScale;
633 const SkPMColor* SK_RESTRICT table = s.fBitmap->getColorTable()->lockColors();
634 const char* SK_RESTRICT srcAddr = (const char*)s.fBitmap->getPixels();
635 int rb = s.fBitmap->rowBytes();
636
637 do {
638 uint32_t data = *xy++;
639 unsigned y0 = data >> 14;
640 unsigned y1 = data & 0x3FFF;
641 unsigned subY = y0 & 0xF;
642 y0 >>= 4;
643
644 data = *xy++;
645 unsigned x0 = data >> 14;
646 unsigned x1 = data & 0x3FFF;
647 unsigned subX = x0 & 0xF;
648 x0 >>= 4;
649
650 const uint8_t* SK_RESTRICT row0 = (const uint8_t*)(srcAddr + y0 * rb);
651 const uint8_t* SK_RESTRICT row1 = (const uint8_t*)(srcAddr + y1 * rb);
652
653 Filter_32_direct_alpha(subX, subY, table[row0[x0]],
654 table[row0[x1]],
655 table[row1[x0]],
656 table[row1[x1]], colors, scale);
657 colors++;
658 } while (--count != 0);
659
660 s.fBitmap->getColorTable()->unlockColors(false);
661}
662#endif //defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
663
664///////////////////////////////////////////////////////////////////////////////
665
/*  If we replace a sampleproc, then we null-out the associated shaderproc,
    otherwise the shader won't even look at the matrix/sampler
 */
reed@android.coma0bd7f42009-08-03 17:22:46 +0000669void SkBitmapProcState::platformProcs() {
670 bool doFilter = fDoFilter;
671 bool isOpaque = 256 == fAlphaScale;
672 bool justDx = false;
673
674 if (fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) {
675 justDx = true;
676 }
677
678 switch (fBitmap->config()) {
679 case SkBitmap::kRGB_565_Config:
680#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
681 if (justDx && doFilter) {
682 fSampleProc16 = S16_D16_filter_DX_arm;
reed@android.com6123e472009-08-04 01:52:27 +0000683 fShaderProc16 = NULL;
reed@android.coma0bd7f42009-08-03 17:22:46 +0000684 }
685#endif
686#if __ARM_ARCH__ >= 5 && !defined(SK_CPU_BENDIAN)
687 if (justDx && !doFilter) {
688 fSampleProc16 = S16_D16_nofilter_DX_arm;
reed@android.com6123e472009-08-04 01:52:27 +0000689 fShaderProc16 = NULL;
reed@android.coma0bd7f42009-08-03 17:22:46 +0000690 }
691#endif
692 break; // k565
693 case SkBitmap::kIndex8_Config:
694#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
695 if (justDx && !doFilter) {
reed@android.com6123e472009-08-04 01:52:27 +0000696#if 0 /* crashing on android device */
reed@android.coma0bd7f42009-08-03 17:22:46 +0000697 fSampleProc16 = SI8_D16_nofilter_DX_arm;
reed@android.com6123e472009-08-04 01:52:27 +0000698 fShaderProc16 = NULL;
699#endif
reed@android.coma0bd7f42009-08-03 17:22:46 +0000700 if (isOpaque) {
701 fSampleProc32 = SI8_opaque_D32_nofilter_DX_arm;
reed@android.com6123e472009-08-04 01:52:27 +0000702 fShaderProc32 = NULL;
reed@android.coma0bd7f42009-08-03 17:22:46 +0000703 }
704 }
705#endif
706#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
707 if (doFilter) {
708 if (isOpaque) {
709 if (justDx) {
710 fSampleProc32 = SI8_opaque_D32_filter_DX_arm;
711 } else {
712 fSampleProc32 = SI8_opaque_D32_filter_DXDY_arm;
713 }
714 } else { // !isOpaque
715 if (justDx) {
716 fSampleProc32 = SI8_alpha_D32_filter_DX_arm;
717 } else {
718 fSampleProc32 = SI8_alpha_D32_filter_DXDY_arm;
719 }
720 }
reed@android.com6123e472009-08-04 01:52:27 +0000721 fShaderProc32 = NULL;
reed@android.coma0bd7f42009-08-03 17:22:46 +0000722 }
723#endif
724 break; // kIndex8
725 default:
726 break;
727 }
728}
729