blob: 30d26ed3369e237ee7d59de2910c155f261c5ee8 [file] [log] [blame]
reed@android.com7d2e3222009-07-30 02:22:31 +00001/*
2 **
3 ** Copyright 2009, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 ** http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17
agl@chromium.org28bee952009-11-19 03:21:57 +000018#ifdef ANDROID
19 #include <machine/cpu-features.h>
20#endif
21
reed@android.com7d2e3222009-07-30 02:22:31 +000022#include "SkBlitRow.h"
reed@android.com6123e472009-08-04 01:52:27 +000023#include "SkColorPriv.h"
24#include "SkDither.h"
reed@android.com7d2e3222009-07-30 02:22:31 +000025
reed@android.com4e635f92009-10-19 17:39:46 +000026#if defined(__ARM_HAVE_NEON)
27#include <arm_neon.h>
28#endif
29
reed@android.coma98a21e2009-10-19 18:13:18 +000030#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com7d2e3222009-07-30 02:22:31 +000031static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
32 const SkPMColor* SK_RESTRICT src, int count,
33 U8CPU alpha, int /*x*/, int /*y*/) {
34 SkASSERT(255 == alpha);
reed@android.com229d9b32010-04-09 18:44:46 +000035
reed@android.com7d2e3222009-07-30 02:22:31 +000036 if (count >= 8) {
37 uint16_t* SK_RESTRICT keep_dst;
38
39 asm volatile (
40 "ands ip, %[count], #7 \n\t"
41 "vmov.u8 d31, #1<<7 \n\t"
42 "vld1.16 {q12}, [%[dst]] \n\t"
43 "vld4.8 {d0-d3}, [%[src]] \n\t"
44 "moveq ip, #8 \n\t"
45 "mov %[keep_dst], %[dst] \n\t"
46
47 "add %[src], %[src], ip, LSL#2 \n\t"
48 "add %[dst], %[dst], ip, LSL#1 \n\t"
49 "subs %[count], %[count], ip \n\t"
50 "b 9f \n\t"
51 // LOOP
52 "2: \n\t"
53
54 "vld1.16 {q12}, [%[dst]]! \n\t"
55 "vld4.8 {d0-d3}, [%[src]]! \n\t"
56 "vst1.16 {q10}, [%[keep_dst]] \n\t"
57 "sub %[keep_dst], %[dst], #8*2 \n\t"
58 "subs %[count], %[count], #8 \n\t"
59 "9: \n\t"
60 "pld [%[dst],#32] \n\t"
61 // expand 0565 q12 to 8888 {d4-d7}
62 "vmovn.u16 d4, q12 \n\t"
63 "vshr.u16 q11, q12, #5 \n\t"
64 "vshr.u16 q10, q12, #6+5 \n\t"
65 "vmovn.u16 d5, q11 \n\t"
66 "vmovn.u16 d6, q10 \n\t"
67 "vshl.u8 d4, d4, #3 \n\t"
68 "vshl.u8 d5, d5, #2 \n\t"
69 "vshl.u8 d6, d6, #3 \n\t"
70
71 "vmovl.u8 q14, d31 \n\t"
72 "vmovl.u8 q13, d31 \n\t"
73 "vmovl.u8 q12, d31 \n\t"
74
75 // duplicate in 4/2/1 & 8pix vsns
76 "vmvn.8 d30, d3 \n\t"
77 "vmlal.u8 q14, d30, d6 \n\t"
78 "vmlal.u8 q13, d30, d5 \n\t"
79 "vmlal.u8 q12, d30, d4 \n\t"
80 "vshr.u16 q8, q14, #5 \n\t"
81 "vshr.u16 q9, q13, #6 \n\t"
82 "vaddhn.u16 d6, q14, q8 \n\t"
83 "vshr.u16 q8, q12, #5 \n\t"
84 "vaddhn.u16 d5, q13, q9 \n\t"
85 "vqadd.u8 d6, d6, d0 \n\t" // moved up
86 "vaddhn.u16 d4, q12, q8 \n\t"
87 // intentionally don't calculate alpha
88 // result in d4-d6
89
90 "vqadd.u8 d5, d5, d1 \n\t"
91 "vqadd.u8 d4, d4, d2 \n\t"
92
93 // pack 8888 {d4-d6} to 0565 q10
94 "vshll.u8 q10, d6, #8 \n\t"
95 "vshll.u8 q3, d5, #8 \n\t"
96 "vshll.u8 q2, d4, #8 \n\t"
97 "vsri.u16 q10, q3, #5 \n\t"
98 "vsri.u16 q10, q2, #11 \n\t"
99
100 "bne 2b \n\t"
101
102 "1: \n\t"
103 "vst1.16 {q10}, [%[keep_dst]] \n\t"
104 : [count] "+r" (count)
105 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
106 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
107 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
108 "d30","d31"
109 );
reed@android.com522aa8d2009-10-22 20:26:53 +0000110 }
111 else
112 { // handle count < 8
reed@android.com7d2e3222009-07-30 02:22:31 +0000113 uint16_t* SK_RESTRICT keep_dst;
114
115 asm volatile (
116 "vmov.u8 d31, #1<<7 \n\t"
117 "mov %[keep_dst], %[dst] \n\t"
118
119 "tst %[count], #4 \n\t"
120 "beq 14f \n\t"
121 "vld1.16 {d25}, [%[dst]]! \n\t"
122 "vld1.32 {q1}, [%[src]]! \n\t"
123
124 "14: \n\t"
125 "tst %[count], #2 \n\t"
126 "beq 12f \n\t"
127 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
128 "vld1.32 {d1}, [%[src]]! \n\t"
129
130 "12: \n\t"
131 "tst %[count], #1 \n\t"
132 "beq 11f \n\t"
133 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
134 "vld1.32 {d0[1]}, [%[src]]! \n\t"
135
136 "11: \n\t"
137 // unzips achieve the same as a vld4 operation
138 "vuzpq.u16 q0, q1 \n\t"
139 "vuzp.u8 d0, d1 \n\t"
140 "vuzp.u8 d2, d3 \n\t"
141 // expand 0565 q12 to 8888 {d4-d7}
142 "vmovn.u16 d4, q12 \n\t"
143 "vshr.u16 q11, q12, #5 \n\t"
144 "vshr.u16 q10, q12, #6+5 \n\t"
145 "vmovn.u16 d5, q11 \n\t"
146 "vmovn.u16 d6, q10 \n\t"
147 "vshl.u8 d4, d4, #3 \n\t"
148 "vshl.u8 d5, d5, #2 \n\t"
149 "vshl.u8 d6, d6, #3 \n\t"
150
151 "vmovl.u8 q14, d31 \n\t"
152 "vmovl.u8 q13, d31 \n\t"
153 "vmovl.u8 q12, d31 \n\t"
154
155 // duplicate in 4/2/1 & 8pix vsns
156 "vmvn.8 d30, d3 \n\t"
157 "vmlal.u8 q14, d30, d6 \n\t"
158 "vmlal.u8 q13, d30, d5 \n\t"
159 "vmlal.u8 q12, d30, d4 \n\t"
160 "vshr.u16 q8, q14, #5 \n\t"
161 "vshr.u16 q9, q13, #6 \n\t"
162 "vaddhn.u16 d6, q14, q8 \n\t"
163 "vshr.u16 q8, q12, #5 \n\t"
164 "vaddhn.u16 d5, q13, q9 \n\t"
165 "vqadd.u8 d6, d6, d0 \n\t" // moved up
166 "vaddhn.u16 d4, q12, q8 \n\t"
167 // intentionally don't calculate alpha
168 // result in d4-d6
169
170 "vqadd.u8 d5, d5, d1 \n\t"
171 "vqadd.u8 d4, d4, d2 \n\t"
172
173 // pack 8888 {d4-d6} to 0565 q10
174 "vshll.u8 q10, d6, #8 \n\t"
175 "vshll.u8 q3, d5, #8 \n\t"
176 "vshll.u8 q2, d4, #8 \n\t"
177 "vsri.u16 q10, q3, #5 \n\t"
178 "vsri.u16 q10, q2, #11 \n\t"
179
180 // store
181 "tst %[count], #4 \n\t"
182 "beq 24f \n\t"
183 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
184
185 "24: \n\t"
186 "tst %[count], #2 \n\t"
187 "beq 22f \n\t"
188 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
189
190 "22: \n\t"
191 "tst %[count], #1 \n\t"
192 "beq 21f \n\t"
193 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
194
195 "21: \n\t"
196 : [count] "+r" (count)
197 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
198 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
199 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
200 "d30","d31"
201 );
202 }
203}
reed@android.com4bda7a52009-07-30 20:40:47 +0000204
/* NEON blit of premultiplied 32-bit source pixels onto an RGB565 destination,
 * with the source additionally scaled by a global 'alpha' (0..255).
 * The asm loop handles eight pixels per iteration (count >> 3 iterations);
 * the scalar C loop below handles the remaining count % 8 pixels.
 */
static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    // The asm bumps its alpha operand in place ("add %[alpha], ..., #1"),
    // so work on a copy and keep the caller's value for the scalar tail.
    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     * 1. The results have a few mismatches compared to the original code. These mismatches
     *    never exceed 1. It's possible to improve accuracy vs. a floating point
     *    implementation by introducing rounding right shifts (vrshr) for the final stage.
     *    Rounding is not present in the code below, because although results would be closer
     *    to a floating point implementation, the number of mismatches compared to the
     *    original code would be far greater.
     * 2. On certain inputs, the original code can overflow, causing colour channels to
     *    mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *    to affect another.
     */

#if 1
    /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7   \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3 (r4 = whole groups of 8)
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    // Scalar tail: the asm left count % 8 pixels unprocessed (it decrements
    // count by 8 per iteration); finish them with the portable math.
    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {   // skip fully-transparent source pixels
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
314
/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    // 4x4 base pattern, each row written three times over so that an
    // 8-byte d-register load starting at column (x & 3) stays in-row.
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};
327
328static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
329 int count, U8CPU alpha, int x, int y)
330{
331 /* select row and offset for dither array */
332 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
333
334 /* rescale alpha to range 0 - 256 */
335 int scale = SkAlpha255To256(alpha);
336
337 asm volatile (
338 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values
339 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values
340 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg
341 "vmov.i8 d29, #0x3f \n\t" // set up green mask
342 "vmov.i8 d28, #0x1f \n\t" // set up blue mask
343 "1: \n\t"
344 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb
345 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5
346 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6
347 "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5
348 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen
349 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen
350 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen
351 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result
352 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result
353 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result
354 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits
355 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits
356 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits
357 // load 8 pixels from dst, extract rgb
358 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels
359 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits
360 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes
361 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red
362 "vand d17, d17, d29 \n\t" // and green with green mask
363 "vand d18, d18, d28 \n\t" // and blue with blue mask
364 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes
365 // src = {d22 (r), d23 (g), d24 (b)}
366 // dst = {d16 (r), d17 (g), d18 (b)}
367 // subtract dst from src and widen
368 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst
369 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst
370 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst
371 // multiply diffs by scale and shift
372 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale
373 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale
374 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale
375 "subs %[count], %[count], #8 \n\t" // decrement loop counter
376 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow
377 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow
378 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow
379 // add dst to result
380 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red
381 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green
382 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue
383 // put result into 565 format
384 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue
385 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue
386 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result
387 "bgt 1b \n\t" // loop if count > 0
388 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
389 : [dstart] "r" (dstart), [scale] "r" (scale)
390 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
391 );
392
393 DITHER_565_SCAN(y);
394
395 while((count & 7) > 0)
396 {
397 SkPMColor c = *src++;
398
399 int dither = DITHER_VALUE(x);
400 int sr = SkGetPackedR32(c);
401 int sg = SkGetPackedG32(c);
402 int sb = SkGetPackedB32(c);
403 sr = SkDITHER_R32To565(sr, dither);
404 sg = SkDITHER_G32To565(sg, dither);
405 sb = SkDITHER_B32To565(sb, dither);
406
407 uint16_t d = *dst;
408 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
409 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
410 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
411 DITHER_INC_X(x);
412 count--;
413 }
414}
415
416#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
417#define S32A_D565_Blend_PROC S32A_D565_Blend_neon
418#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
reed@android.com7d2e3222009-07-30 02:22:31 +0000419#else
reed@android.com4bda7a52009-07-30 20:40:47 +0000420#define S32A_D565_Opaque_PROC NULL
421#define S32A_D565_Blend_PROC NULL
422#define S32_D565_Blend_Dither_PROC NULL
reed@android.com7d2e3222009-07-30 02:22:31 +0000423#endif
424
425/* Don't have a special version that assumes each src is opaque, but our S32A
426 is still faster than the default, so use it here
427 */
reed@android.com4bda7a52009-07-30 20:40:47 +0000428#define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC
429#define S32_D565_Blend_PROC S32A_D565_Blend_PROC
reed@android.com7d2e3222009-07-30 02:22:31 +0000430
431///////////////////////////////////////////////////////////////////////////////
432
reed@android.coma98a21e2009-10-19 18:13:18 +0000433#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com4e635f92009-10-19 17:39:46 +0000434
/* NEON SrcOver blit of a row of premultiplied 32-bit src pixels over
 * 32-bit dst.  'alpha' must be 255 (asserted).  Works four pixels per
 * iteration with intrinsics (two 2-pixel halves), then finishes the
 * remainder with the portable SkPMSrcOver loop.
 */
static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        // vtbl1 table: replicate byte 3 of each 32-bit pixel into its four
        // lanes (bytes 3,3,3,3 and 7,7,7,7) -- spreads each pixel's alpha
        // across all four of its channels.
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            // optional fast path: skip fully-transparent pixels, and skip
            // the blend entirely when the source pixel is fully opaque
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}
556
agl@chromium.orga40390c2010-06-18 15:30:12 +0000557#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
agl@chromium.org94d14642010-08-17 16:24:15 +0000558
reed@android.com4e635f92009-10-19 17:39:46 +0000559#else
agl@chromium.org94d14642010-08-17 16:24:15 +0000560
561#ifdef TEST_SRC_ALPHA
562#error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
563#endif
564
/* Plain ARM (non-NEON) SrcOver blit of premultiplied 32-bit src over
 * 32-bit dst; 'alpha' must be 255 (asserted).  Pixels are processed two
 * at a time in the double loop, with a single-pixel loop for the odd
 * remainder.  The rb and ag byte pairs of each pixel are scaled
 * separately through the 0x00ff00ff mask held in ip.
 */
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    /* Does not support the TEST_SRC_ALPHA case */
    asm volatile (
                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
                  "beq    3f                         \n\t" /* if zero exit */

                  "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
                  "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */

                  "cmp    %[count], #2               \n\t" /* compare count with 2 */
                  "blt    2f                         \n\t" /* if less than 2 -> single loop */

                  /* Double Loop */
                  "1:                                \n\t" /* <double loop> */
                  "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
                  "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */

                  /* ----------- */
                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */

                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */

                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
                  "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
                  "orr    r7, r9, r10                \n\t" /* br | ag*/

                  "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 255 -> r4=scale */

                  /* ----------- */
                  "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */

                  "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "sub    %[count], %[count], #2     \n\t"
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */

                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
                  "cmp    %[count], #1               \n\t" /* comparing count with 1 */
                  "orr    r8, r9, r10                \n\t" /* br | ag */

                  "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */

                  /* ----------------- */
                  "stm    %[dst]!, {r7,r8}           \n\t" /* *dst = r7, increment dst by two (each times 4) */
                  /* ----------------- */

                  "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
                  "blt    3f                         \n\t" /* if less than 1 -> exit */

                  /* Single Loop */
                  "2:                                \n\t" /* <single loop> */
                  "ldr    r5, [%[src]], #4           \n\t" /* load the src pointer into r5 r5=src */
                  "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */

                  /* ----------- */
                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */

                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */

                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
                  "orr    r7, r9, r10                \n\t" /* br | ag */

                  "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */

                  /* ----------------- */
                  "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
                  /* ----------------- */

                  "3:                                \n\t" /* <exit> */
                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
                  :
                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
                  );
}
656#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm
reed@android.com4e635f92009-10-19 17:39:46 +0000657#endif
658
agl@chromium.org8b17ac32010-09-10 15:09:42 +0000659/*
660 * ARM asm version of S32A_Blend_BlitRow32
661 */
662static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
663 const SkPMColor* SK_RESTRICT src,
664 int count, U8CPU alpha) {
665 SkASSERT(255 == alpha);
666
667 asm volatile (
668 "cmp %[count], #0 \n\t" /* comparing count with 0 */
669 "beq 3f \n\t" /* if zero exit */
670
671 "mov r12, #0xff \n\t" /* load the 0xff mask in r12 */
672 "orr r12, r12, r12, lsl #16 \n\t" /* convert it to 0xff00ff in r12 */
673
674 /* src1,2_scale */
675 "add %[alpha], %[alpha], #1 \n\t" /* loading %[alpha]=src_scale=alpha+1 */
676
677 "cmp %[count], #2 \n\t" /* comparing count with 2 */
678 "blt 2f \n\t" /* if less than 2 -> single loop */
679
680 /* Double Loop */
681 "1: \n\t" /* <double loop> */
682 "ldm %[src]!, {r5, r6} \n\t" /* loading src pointers into r5 and r6 */
683 "ldm %[dst], {r7, r8} \n\t" /* loading dst pointers into r7 and r8 */
684
685 /* dst1_scale and dst2_scale*/
686 "lsr r9, r5, #24 \n\t" /* src >> 24 */
687 "lsr r10, r6, #24 \n\t" /* src >> 24 */
688 "smulbb r9, r9, %[alpha] \n\t" /* r9 = SkMulS16 r9 with src_scale */
689 "smulbb r10, r10, %[alpha] \n\t" /* r10 = SkMulS16 r10 with src_scale */
690 "lsr r9, r9, #8 \n\t" /* r9 >> 8 */
691 "lsr r10, r10, #8 \n\t" /* r10 >> 8 */
692 "rsb r9, r9, #256 \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
693 "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
694
695 /* ---------------------- */
696
697 /* src1, src1_scale */
698 "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
699 "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */
700 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
701 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
702 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
703 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
704 "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */
705
706 /* dst1, dst1_scale */
707 "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
708 "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */
709 "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */
710 "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */
711 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
712 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
713 "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */
714
715 /* ---------------------- */
716 "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */
717 /* ---------------------- */
718
719 /* ====================== */
720
721 /* src2, src2_scale */
722 "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
723 "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */
724 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */
725 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */
726 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
727 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
728 "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */
729
730 /* dst2, dst2_scale */
731 "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
732 "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */
733 "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */
734 "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */
735 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
736 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */
737 "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */
738
739 "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */
740 /* ---------------------- */
741 "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */
742 /* ---------------------- */
743 "cmp %[count], #1 \n\t" /* compare count with 1 */
744 /* ----------------- */
745 "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */
746 /* ----------------- */
747
748 "bgt 1b \n\t" /* if %[count] greater than 1 reloop */
749 "blt 3f \n\t" /* if %[count] less than 1 exit */
750 /* else get into the single loop */
751 /* Single Loop */
752 "2: \n\t" /* <single loop> */
753 "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */
754 "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */
755
756 "lsr r6, r5, #24 \n\t" /* src >> 24 */
757 "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
758 "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */
759 "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */
760 "lsr r6, r6, #8 \n\t" /* r6 >> 8 */
761 "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */
762 "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */
763
764 /* src, src_scale */
765 "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */
766 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
767 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
768 "orr r10, r8, r9 \n\t" /* r10 = (scr, src_scale) */
769
770 /* dst, dst_scale */
771 "and r8, r12, r7, lsr #8 \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
772 "and r9, r12, r7 \n\t" /* rb = r9 = r7 masked by r12 */
773 "mul r8, r8, r6 \n\t" /* ag = r8 times scale (r6) */
774 "mul r9, r9, r6 \n\t" /* rb = r9 times scale (r6) */
775 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */
776 "and r9, r12, r9, lsr #8 \n\t" /* rb masked by mask (r12) */
777 "orr r7, r8, r9 \n\t" /* r7 = (dst, dst_scale) */
778
779 "add r10, r7, r10 \n\t" /* *dst = src plus dst both scaled */
780
781 /* ----------------- */
782 "str r10, [%[dst]], #4 \n\t" /* *dst = r10, postincrement dst by one (times 4) */
783 /* ----------------- */
784
785 "3: \n\t" /* <exit> */
786 : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
787 :
788 : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
789 );
790
791}
792#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
793
reed@android.com4e635f92009-10-19 17:39:46 +0000794/* Neon version of S32_Blend_BlitRow32()
reed@android.com522aa8d2009-10-22 20:26:53 +0000795 * portable version is in src/core/SkBlitRow_D32.cpp
reed@android.com4e635f92009-10-19 17:39:46 +0000796 */
reed@android.coma98a21e2009-10-19 18:13:18 +0000797#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com4e635f92009-10-19 17:39:46 +0000798static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
799 const SkPMColor* SK_RESTRICT src,
800 int count, U8CPU alpha) {
801 SkASSERT(alpha <= 255);
802 if (count > 0) {
803 uint16_t src_scale = SkAlpha255To256(alpha);
804 uint16_t dst_scale = 256 - src_scale;
805
806 /* run them N at a time through the NEON unit */
807 /* note that each 1 is 4 bytes, each treated exactly the same,
808 * so we can work under that guise. We *do* know that the src&dst
809 * will be 32-bit aligned quantities, so we can specify that on
810 * the load/store ops and do a neon 'reinterpret' to get us to
811 * byte-sized (pun intended) pieces that we widen/multiply/shift
812 * we're limited at 128 bits in the wide ops, which is 8x16bits
813 * or a pair of 32 bit src/dsts.
814 */
815 /* we *could* manually unroll this loop so that we load 128 bits
816 * (as a pair of 64s) from each of src and dst, processing them
817 * in pieces. This might give us a little better management of
818 * the memory latency, but my initial attempts here did not
819 * produce an instruction stream that looked all that nice.
820 */
821#define UNROLL 2
822 while (count >= UNROLL) {
823 uint8x8_t src_raw, dst_raw, dst_final;
824 uint16x8_t src_wide, dst_wide;
825
826 /* get 64 bits of src, widen it, multiply by src_scale */
827 src_raw = vreinterpret_u8_u32(vld1_u32(src));
828 src_wide = vmovl_u8(src_raw);
reed@android.com522aa8d2009-10-22 20:26:53 +0000829 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
reed@android.com4e635f92009-10-19 17:39:46 +0000830 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
831
832 /* ditto with dst */
833 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
834 dst_wide = vmovl_u8(dst_raw);
reed@android.com4e635f92009-10-19 17:39:46 +0000835
reed@android.com522aa8d2009-10-22 20:26:53 +0000836 /* combine add with dst multiply into mul-accumulate */
837 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
838
reed@android.com4e635f92009-10-19 17:39:46 +0000839 dst_final = vshrn_n_u16(dst_wide, 8);
reed@android.com4e635f92009-10-19 17:39:46 +0000840 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
841
842 src += UNROLL;
843 dst += UNROLL;
844 count -= UNROLL;
845 }
846 /* RBE: well, i don't like how gcc manages src/dst across the above
847 * loop it's constantly calculating src+bias, dst+bias and it only
848 * adjusts the real ones when we leave the loop. Not sure why
849 * it's "hoisting down" (hoisting implies above in my lexicon ;))
850 * the adjustments to src/dst/count, but it does...
851 * (might be SSA-style internal logic...
852 */
853
854#if UNROLL == 2
855 if (count == 1) {
856 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
857 }
858#else
859 if (count > 0) {
860 do {
861 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
862 src += 1;
863 dst += 1;
864 } while (--count > 0);
865 }
866#endif
867
868#undef UNROLL
869 }
870}
871
872#define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon
873#else
874#define S32_Blend_BlitRow32_PROC NULL
875#endif
876
877///////////////////////////////////////////////////////////////////////////////
878
reed@android.coma98a21e2009-10-19 18:13:18 +0000879#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com522aa8d2009-10-22 20:26:53 +0000880
881#undef DEBUG_OPAQUE_DITHER
882
883#if defined(DEBUG_OPAQUE_DITHER)
884static void showme8(char *str, void *p, int len)
885{
886 static char buf[256];
887 char tbuf[32];
888 int i;
889 char *pc = (char*) p;
890 sprintf(buf,"%8s:", str);
891 for(i=0;i<len;i++) {
892 sprintf(tbuf, " %02x", pc[i]);
893 strcat(buf, tbuf);
894 }
895 SkDebugf("%s\n", buf);
896}
897static void showme16(char *str, void *p, int len)
898{
899 static char buf[256];
900 char tbuf[32];
901 int i;
902 uint16_t *pc = (uint16_t*) p;
903 sprintf(buf,"%8s:", str);
904 len = (len / sizeof(uint16_t)); /* passed as bytes */
905 for(i=0;i<len;i++) {
906 sprintf(tbuf, " %04x", pc[i]);
907 strcat(buf, tbuf);
908 }
909 SkDebugf("%s\n", buf);
910}
911#endif
912
/* Blit a row of 32-bit premultiplied src pixels (with per-pixel alpha)
 * onto a 565 dst, applying the 4x4 ordered-dither matrix selected by
 * (x,y).  Requires the blitter-level alpha to be 255 (asserted).
 * Works 8 pixels per NEON iteration, then falls back to the portable
 * per-pixel code for any remainder.
 * Enabling DEBUG_OPAQUE_DITHER (above) computes each group of 8 with the
 * scalar algorithm too and cross-checks the NEON results.
 */
static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;   /* NOTE(review): appears unused below -- confirm */
#endif

        /* 8 dither values for this scanline, starting at column x&3 */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i=0;i<UNROLL;i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;

                    } else {
                        /* fully transparent source: dst unchanged */
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            /* source is in ABGR */
            {
                /* pin the vld4.8 destinations to d0-d3 so the asm
                 * constraint list matches the hard-coded registers */
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }

            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
            /* NOTE(review): the file's other guards use "#ifdef ANDROID"
             * (see top of file); "#if ANDROID" evaluates to 0 when the
             * macro is undefined -- confirm which branch is intended */
#if ANDROID
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* combine */
            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad=0;
                static int invocation;

                for (i=0;i<UNROLL;i++)
                    if (tmpbuf[i] != dst[i]) bad=1;
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                        invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i=0;i<UNROLL;i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* cop out */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals: portable per-pixel path for the last (count%8) pixels */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1149
1150#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1151#else
1152#define S32A_D565_Opaque_Dither_PROC NULL
1153#endif
1154
1155///////////////////////////////////////////////////////////////////////////////
1156
reed@android.comb577b412009-10-27 17:49:32 +00001157#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1158/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1159 * speedup untested, but ARM version is 26 insns/iteration and
1160 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)
1161 * which is 10x the native version; that's pure instruction counts,
1162 * not accounting for any instruction or memory latencies.
1163 */
1164
1165#undef DEBUG_S32_OPAQUE_DITHER
1166
/* Blit a row of fully-opaque 32-bit src pixels onto a 565 dst, applying
 * the 4x4 ordered-dither matrix selected by (x,y).  The blitter-level
 * alpha must be 255 (asserted); per-pixel alpha is asserted to be 255
 * in the scalar tail.  Works 8 pixels per NEON iteration, then falls
 * back to SkDitherRGB32To565() per pixel for any remainder.
 */
static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        /* 8 dither values for this scanline, starting at column x&3 */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb, sa;
            /* NOTE(review): 'da' (and 'sa') are loaded but not consumed
             * below -- alpha is ignored on this opaque path */
            uint16x8_t dr, dg, db, da;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                /* pin the vld4.8 destinations to d0-d3 so the asm
                 * constraint list matches the hard-coded registers */
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                            c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;        /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals: portable per-pixel path for the last (count%8) pixels */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1263
1264#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1265#else
1266#define S32_D565_Opaque_Dither_PROC NULL
1267#endif
1268
1269///////////////////////////////////////////////////////////////////////////////
1270
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001271static const SkBlitRow::Proc platform_565_procs[] = {
reed@android.com7d2e3222009-07-30 02:22:31 +00001272 // no dither
reed@android.com4bda7a52009-07-30 20:40:47 +00001273 S32_D565_Opaque_PROC,
1274 S32_D565_Blend_PROC,
1275 S32A_D565_Opaque_PROC,
reed@android.com229d9b32010-04-09 18:44:46 +00001276 S32A_D565_Blend_PROC,
reed@android.com7d2e3222009-07-30 02:22:31 +00001277
1278 // dither
reed@android.comb577b412009-10-27 17:49:32 +00001279 S32_D565_Opaque_Dither_PROC,
reed@android.com4bda7a52009-07-30 20:40:47 +00001280 S32_D565_Blend_Dither_PROC,
reed@android.com229d9b32010-04-09 18:44:46 +00001281 S32A_D565_Opaque_Dither_PROC,
reed@android.com7d2e3222009-07-30 02:22:31 +00001282 NULL, // S32A_D565_Blend_Dither
1283};
1284
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001285static const SkBlitRow::Proc platform_4444_procs[] = {
reed@android.com7d2e3222009-07-30 02:22:31 +00001286 // no dither
1287 NULL, // S32_D4444_Opaque,
1288 NULL, // S32_D4444_Blend,
1289 NULL, // S32A_D4444_Opaque,
1290 NULL, // S32A_D4444_Blend,
1291
1292 // dither
1293 NULL, // S32_D4444_Opaque_Dither,
1294 NULL, // S32_D4444_Blend_Dither,
1295 NULL, // S32A_D4444_Opaque_Dither,
1296 NULL, // S32A_D4444_Blend_Dither
1297};
1298
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001299static const SkBlitRow::Proc32 platform_32_procs[] = {
reed@android.com3bbac132009-09-23 18:48:10 +00001300 NULL, // S32_Opaque,
reed@android.com4e635f92009-10-19 17:39:46 +00001301 S32_Blend_BlitRow32_PROC, // S32_Blend,
1302 S32A_Opaque_BlitRow32_PROC, // S32A_Opaque,
agl@chromium.org8b17ac32010-09-10 15:09:42 +00001303 S32A_Blend_BlitRow32_PROC // S32A_Blend
reed@android.com3bbac132009-09-23 18:48:10 +00001304};
1305
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001306SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001307 return platform_4444_procs[flags];
1308}
1309
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001310SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001311 return platform_565_procs[flags];
1312}
1313
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001314SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001315 return platform_32_procs[flags];
1316}
reed@android.com229d9b32010-04-09 18:44:46 +00001317
senorblanco@chromium.org31e62302010-12-15 16:20:59 +00001318SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1319 return NULL;
1320}
reed@google.com981d4792011-03-09 12:55:47 +00001321
1322
/* No ARM-optimized mask-blit proc for any dst config/color; returning
 * NULL selects the portable implementation. */
SkBlitMask::Proc SkBlitMask::PlatformProcs(SkBitmap::Config dstConfig,
                                           SkColor color)
{
    return NULL;
}