blob: 556627b9eccaf923d41d6e9ecf9ee14788078829 [file] [log] [blame]
epoger@google.comec3ed6a2011-07-28 14:26:00 +00001
reed@android.com7d2e3222009-07-30 02:22:31 +00002/*
epoger@google.comec3ed6a2011-07-28 14:26:00 +00003 * Copyright 2009 The Android Open Source Project
4 *
5 * Use of this source code is governed by a BSD-style license that can be
6 * found in the LICENSE file.
reed@android.com7d2e3222009-07-30 02:22:31 +00007 */
8
epoger@google.comec3ed6a2011-07-28 14:26:00 +00009
agl@chromium.org28bee952009-11-19 03:21:57 +000010#ifdef ANDROID
11 #include <machine/cpu-features.h>
12#endif
13
reed@android.com7d2e3222009-07-30 02:22:31 +000014#include "SkBlitRow.h"
reed@android.com6123e472009-08-04 01:52:27 +000015#include "SkColorPriv.h"
16#include "SkDither.h"
reed@android.com7d2e3222009-07-30 02:22:31 +000017
reed@android.com4e635f92009-10-19 17:39:46 +000018#if defined(__ARM_HAVE_NEON)
19#include <arm_neon.h>
20#endif
21
reed@android.coma98a21e2009-10-19 18:13:18 +000022#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com7d2e3222009-07-30 02:22:31 +000023static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
24 const SkPMColor* SK_RESTRICT src, int count,
25 U8CPU alpha, int /*x*/, int /*y*/) {
26 SkASSERT(255 == alpha);
reed@android.com229d9b32010-04-09 18:44:46 +000027
reed@android.com7d2e3222009-07-30 02:22:31 +000028 if (count >= 8) {
29 uint16_t* SK_RESTRICT keep_dst;
30
31 asm volatile (
32 "ands ip, %[count], #7 \n\t"
33 "vmov.u8 d31, #1<<7 \n\t"
34 "vld1.16 {q12}, [%[dst]] \n\t"
35 "vld4.8 {d0-d3}, [%[src]] \n\t"
36 "moveq ip, #8 \n\t"
37 "mov %[keep_dst], %[dst] \n\t"
38
39 "add %[src], %[src], ip, LSL#2 \n\t"
40 "add %[dst], %[dst], ip, LSL#1 \n\t"
41 "subs %[count], %[count], ip \n\t"
42 "b 9f \n\t"
43 // LOOP
44 "2: \n\t"
45
46 "vld1.16 {q12}, [%[dst]]! \n\t"
47 "vld4.8 {d0-d3}, [%[src]]! \n\t"
48 "vst1.16 {q10}, [%[keep_dst]] \n\t"
49 "sub %[keep_dst], %[dst], #8*2 \n\t"
50 "subs %[count], %[count], #8 \n\t"
51 "9: \n\t"
52 "pld [%[dst],#32] \n\t"
53 // expand 0565 q12 to 8888 {d4-d7}
54 "vmovn.u16 d4, q12 \n\t"
55 "vshr.u16 q11, q12, #5 \n\t"
56 "vshr.u16 q10, q12, #6+5 \n\t"
57 "vmovn.u16 d5, q11 \n\t"
58 "vmovn.u16 d6, q10 \n\t"
59 "vshl.u8 d4, d4, #3 \n\t"
60 "vshl.u8 d5, d5, #2 \n\t"
61 "vshl.u8 d6, d6, #3 \n\t"
62
63 "vmovl.u8 q14, d31 \n\t"
64 "vmovl.u8 q13, d31 \n\t"
65 "vmovl.u8 q12, d31 \n\t"
66
67 // duplicate in 4/2/1 & 8pix vsns
68 "vmvn.8 d30, d3 \n\t"
69 "vmlal.u8 q14, d30, d6 \n\t"
70 "vmlal.u8 q13, d30, d5 \n\t"
71 "vmlal.u8 q12, d30, d4 \n\t"
72 "vshr.u16 q8, q14, #5 \n\t"
73 "vshr.u16 q9, q13, #6 \n\t"
74 "vaddhn.u16 d6, q14, q8 \n\t"
75 "vshr.u16 q8, q12, #5 \n\t"
76 "vaddhn.u16 d5, q13, q9 \n\t"
77 "vqadd.u8 d6, d6, d0 \n\t" // moved up
78 "vaddhn.u16 d4, q12, q8 \n\t"
79 // intentionally don't calculate alpha
80 // result in d4-d6
81
82 "vqadd.u8 d5, d5, d1 \n\t"
83 "vqadd.u8 d4, d4, d2 \n\t"
84
85 // pack 8888 {d4-d6} to 0565 q10
86 "vshll.u8 q10, d6, #8 \n\t"
87 "vshll.u8 q3, d5, #8 \n\t"
88 "vshll.u8 q2, d4, #8 \n\t"
89 "vsri.u16 q10, q3, #5 \n\t"
90 "vsri.u16 q10, q2, #11 \n\t"
91
92 "bne 2b \n\t"
93
94 "1: \n\t"
95 "vst1.16 {q10}, [%[keep_dst]] \n\t"
96 : [count] "+r" (count)
97 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
98 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
99 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
100 "d30","d31"
101 );
reed@android.com522aa8d2009-10-22 20:26:53 +0000102 }
103 else
104 { // handle count < 8
reed@android.com7d2e3222009-07-30 02:22:31 +0000105 uint16_t* SK_RESTRICT keep_dst;
106
107 asm volatile (
108 "vmov.u8 d31, #1<<7 \n\t"
109 "mov %[keep_dst], %[dst] \n\t"
110
111 "tst %[count], #4 \n\t"
112 "beq 14f \n\t"
113 "vld1.16 {d25}, [%[dst]]! \n\t"
114 "vld1.32 {q1}, [%[src]]! \n\t"
115
116 "14: \n\t"
117 "tst %[count], #2 \n\t"
118 "beq 12f \n\t"
119 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
120 "vld1.32 {d1}, [%[src]]! \n\t"
121
122 "12: \n\t"
123 "tst %[count], #1 \n\t"
124 "beq 11f \n\t"
125 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
126 "vld1.32 {d0[1]}, [%[src]]! \n\t"
127
128 "11: \n\t"
129 // unzips achieve the same as a vld4 operation
130 "vuzpq.u16 q0, q1 \n\t"
131 "vuzp.u8 d0, d1 \n\t"
132 "vuzp.u8 d2, d3 \n\t"
133 // expand 0565 q12 to 8888 {d4-d7}
134 "vmovn.u16 d4, q12 \n\t"
135 "vshr.u16 q11, q12, #5 \n\t"
136 "vshr.u16 q10, q12, #6+5 \n\t"
137 "vmovn.u16 d5, q11 \n\t"
138 "vmovn.u16 d6, q10 \n\t"
139 "vshl.u8 d4, d4, #3 \n\t"
140 "vshl.u8 d5, d5, #2 \n\t"
141 "vshl.u8 d6, d6, #3 \n\t"
142
143 "vmovl.u8 q14, d31 \n\t"
144 "vmovl.u8 q13, d31 \n\t"
145 "vmovl.u8 q12, d31 \n\t"
146
147 // duplicate in 4/2/1 & 8pix vsns
148 "vmvn.8 d30, d3 \n\t"
149 "vmlal.u8 q14, d30, d6 \n\t"
150 "vmlal.u8 q13, d30, d5 \n\t"
151 "vmlal.u8 q12, d30, d4 \n\t"
152 "vshr.u16 q8, q14, #5 \n\t"
153 "vshr.u16 q9, q13, #6 \n\t"
154 "vaddhn.u16 d6, q14, q8 \n\t"
155 "vshr.u16 q8, q12, #5 \n\t"
156 "vaddhn.u16 d5, q13, q9 \n\t"
157 "vqadd.u8 d6, d6, d0 \n\t" // moved up
158 "vaddhn.u16 d4, q12, q8 \n\t"
159 // intentionally don't calculate alpha
160 // result in d4-d6
161
162 "vqadd.u8 d5, d5, d1 \n\t"
163 "vqadd.u8 d4, d4, d2 \n\t"
164
165 // pack 8888 {d4-d6} to 0565 q10
166 "vshll.u8 q10, d6, #8 \n\t"
167 "vshll.u8 q3, d5, #8 \n\t"
168 "vshll.u8 q2, d4, #8 \n\t"
169 "vsri.u16 q10, q3, #5 \n\t"
170 "vsri.u16 q10, q2, #11 \n\t"
171
172 // store
173 "tst %[count], #4 \n\t"
174 "beq 24f \n\t"
175 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
176
177 "24: \n\t"
178 "tst %[count], #2 \n\t"
179 "beq 22f \n\t"
180 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
181
182 "22: \n\t"
183 "tst %[count], #1 \n\t"
184 "beq 21f \n\t"
185 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
186
187 "21: \n\t"
188 : [count] "+r" (count)
189 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
190 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
191 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
192 "d30","d31"
193 );
194 }
195}
reed@android.com4bda7a52009-07-30 20:40:47 +0000196
/* Blend [count] premultiplied 32-bit src pixels over RGB565 dst with an
 * additional global alpha in 0..255. The NEON body handles floor(count/8)
 * groups of 8 pixels; the C tail finishes the remaining count & 7 pixels
 * with the rounding SkDiv255Round/SkMulDiv255Round math. x/y are unused.
 */
static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    // the asm scales alpha in place ("add %[alpha], ..."), so blend a copy
    // and keep the original value for the scalar tail below.
    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
    /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    // scalar tail: the asm advanced src/dst past the 8-pixel groups and left
    // count unchanged modulo 8, so finish the remaining pixels here.
    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {   // skip fully-transparent src pixels
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
306
/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
// 4 rows x 12 bytes: each row is the 4-entry dither pattern repeated 3x,
// so &row[x & 3] always has 8 valid bytes for a d-register load.
static const uint8_t gDitherMatrix_Neon[48] = {
    0,  4,  1,  5,  0,  4,  1,  5,  0,  4,  1,  5,
    6,  2,  7,  3,  6,  2,  7,  3,  6,  2,  7,  3,
    1,  5,  0,  4,  1,  5,  0,  4,  1,  5,  0,  4,
    7,  3,  6,  2,  7,  3,  6,  2,  7,  3,  6,  2,

};
319
/* Dithered blend of [count] 32-bit src pixels onto RGB565 dst with a global
 * alpha (scaled to 0..256 via SkAlpha255To256). Dither values are selected
 * from gDitherMatrix_Neon by (x, y).
 * NOTE(review): the NEON loop body runs at least once and consumes 8 pixels
 * per iteration ("subs #8" / "bgt"), before the scalar tail handles
 * count & 7 -- this looks like it over-reads/over-writes when count is not
 * a multiple of 8; confirm what counts callers actually pass.
 */
static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                     int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // subtract red src from dst
                  "vsubl.s8       q1, d23, d17                    \n\t"   // subtract green src from dst
                  "vsubl.s8       q2, d24, d18                    \n\t"   // subtract blue src from dst
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale (q1 holds the green diffs)
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale (q2 holds the blue diffs)
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    // scalar tail for the pixels left over after the 8-at-a-time asm loop
    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
407
408#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
409#define S32A_D565_Blend_PROC S32A_D565_Blend_neon
410#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
reed@android.com7d2e3222009-07-30 02:22:31 +0000411#else
reed@android.com4bda7a52009-07-30 20:40:47 +0000412#define S32A_D565_Opaque_PROC NULL
413#define S32A_D565_Blend_PROC NULL
414#define S32_D565_Blend_Dither_PROC NULL
reed@android.com7d2e3222009-07-30 02:22:31 +0000415#endif
416
417/* Don't have a special version that assumes each src is opaque, but our S32A
418 is still faster than the default, so use it here
419 */
reed@android.com4bda7a52009-07-30 20:40:47 +0000420#define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC
421#define S32_D565_Blend_PROC S32A_D565_Blend_PROC
reed@android.com7d2e3222009-07-30 02:22:31 +0000422
423///////////////////////////////////////////////////////////////////////////////
424
reed@android.coma98a21e2009-10-19 18:13:18 +0000425#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com4e635f92009-10-19 17:39:46 +0000426
/* SrcOver [count] premultiplied 32-bit src pixels onto 32-bit dst using
 * NEON intrinsics: 4 pixels per iteration (two 2-pixel halves), with a
 * scalar SkPMSrcOver tail for the remainder. alpha must be 255 (asserted);
 * per-pixel src alpha is applied via a vtbl lane-splat.
 */
static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        // vtbl indices: replicate byte 3 (alpha of pixel 0) into lanes 0-3
        // and byte 7 (alpha of pixel 1) into lanes 4-7.
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef  UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            // optional fast paths: skip fully-transparent, copy fully-opaque
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}
548
agl@chromium.orga40390c2010-06-18 15:30:12 +0000549#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_neon
agl@chromium.org94d14642010-08-17 16:24:15 +0000550
reed@android.com4e635f92009-10-19 17:39:46 +0000551#else
agl@chromium.org94d14642010-08-17 16:24:15 +0000552
553#ifdef TEST_SRC_ALPHA
554#error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
555#endif
556
/* Non-NEON ARM fallback for SrcOver of [count] premultiplied 32-bit src
 * pixels onto 32-bit dst: a two-pixel unrolled inline-asm loop plus a
 * single-pixel loop for the odd remainder. alpha must be 255 (asserted).
 */
static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    /* Does not support the TEST_SRC_ALPHA case */
    asm volatile (
                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
                  "beq    3f                         \n\t" /* if zero exit */

                  "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
                  "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */

                  "cmp    %[count], #2               \n\t" /* compare count with 2 */
                  "blt    2f                         \n\t" /* if less than 2 -> single loop */

                  /* Double Loop */
                  "1:                                \n\t" /* <double loop> */
                  "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
                  "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */

                  /* ----------- */
                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */

                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */

                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
                  "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
                  "orr    r7, r9, r10                \n\t" /* br | ag*/

                  "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */

                  /* ----------- */
                  "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */

                  "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "sub    %[count], %[count], #2     \n\t"
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */

                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
                  "cmp    %[count], #1               \n\t" /* comparing count with 1 */
                  "orr    r8, r9, r10                \n\t" /* br | ag */

                  "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */

                  /* ----------------- */
                  "stm    %[dst]!, {r7,r8}           \n\t" /* *dst = r7, increment dst by two (each times 4) */
                  /* ----------------- */

                  "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
                  "blt    3f                         \n\t" /* if less than 1 -> exit */

                  /* Single Loop */
                  "2:                                \n\t" /* <single loop> */
                  "ldr    r5, [%[src]], #4           \n\t" /* load the src pointer into r5 r5=src */
                  "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
                  "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */

                  /* ----------- */
                  "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
                  "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */

                  "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
                  "mul    r9, r9, r4                 \n\t" /* br = br * scale */
                  "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
                  "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */

                  "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
                  "orr    r7, r9, r10                \n\t" /* br | ag */

                  "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */

                  /* ----------------- */
                  "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
                  /* ----------------- */

                  "3:                                \n\t" /* <exit> */
                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
                  :
                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
                  );
}
648#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm
reed@android.com4e635f92009-10-19 17:39:46 +0000649#endif
650
agl@chromium.org8b17ac32010-09-10 15:09:42 +0000651/*
652 * ARM asm version of S32A_Blend_BlitRow32
653 */
/*
 * ARM asm version of S32A_Blend_BlitRow32: blend [count] premultiplied
 * 32-bit src pixels over 32-bit dst with an extra global alpha. src_scale
 * is alpha+1 (0..256); per pixel, dst_scale = 256 - ((srcA*src_scale)>>8).
 * Two-pixel unrolled loop plus a single-pixel loop for the odd remainder.
 * NOTE(review): the SkASSERT(255 == alpha) contradicts the function's use of
 * a variable alpha -- the assert appears copied from the Opaque variant.
 */
static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    asm volatile (
                  "cmp    %[count], #0               \n\t" /* comparing count with 0 */
                  "beq    3f                         \n\t" /* if zero exit */

                  "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
                  "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */

                  /* src1,2_scale */
                  "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */

                  "cmp    %[count], #2               \n\t" /* comparing count with 2 */
                  "blt    2f                         \n\t" /* if less than 2 -> single loop */

                  /* Double Loop */
                  "1:                                \n\t" /* <double loop> */
                  "ldm    %[src]!, {r5, r6}          \n\t" /* loading src pointers into r5 and r6 */
                  "ldm    %[dst], {r7, r8}           \n\t" /* loading dst pointers into r7 and r8 */

                  /* dst1_scale and dst2_scale*/
                  "lsr    r9, r5, #24                \n\t" /* src >> 24 */
                  "lsr    r10, r6, #24               \n\t" /* src >> 24 */
                  "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
                  "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
                  "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
                  "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
                  "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
                  "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */

                  /* ---------------------- */

                  /* src1, src1_scale */
                  "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
                  "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */

                  /* dst1, dst1_scale */
                  "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
                  "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
                  "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
                  "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */

                  /* ---------------------- */
                  "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
                  /* ---------------------- */

                  /* ====================== */

                  /* src2, src2_scale */
                  "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
                  "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
                  "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
                  "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */

                  /* dst2, dst2_scale */
                  "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
                  "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
                  "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
                  "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
                  "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
                  "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */

                  "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
                  /* ---------------------- */
                  "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
                  /* ---------------------- */
                  "cmp    %[count], #1               \n\t" /* compare count with 1 */
                  /* ----------------- */
                  "stm    %[dst]!, {r9, r10}         \n\t" /* copy r9 and r10 to r7 and r8 respectively */
                  /* ----------------- */

                  "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
                  "blt    3f                         \n\t" /* if %[count] less than 1 exit */
                                                           /* else get into the single loop */
                  /* Single Loop */
                  "2:                                \n\t" /* <single loop> */
                  "ldr    r5, [%[src]], #4           \n\t" /* loading src pointer into r5: r5=src */
                  "ldr    r7, [%[dst]]               \n\t" /* loading dst pointer into r7: r7=dst */

                  "lsr    r6, r5, #24                \n\t" /* src >> 24 */
                  "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
                  "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
                  "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
                  "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
                  "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
                  "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */

                  /* src, src_scale */
                  "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
                  "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r10, r8, r9                \n\t" /* r10 = (scr, src_scale) */

                  /* dst, dst_scale */
                  "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
                  "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
                  "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
                  "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
                  "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
                  "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
                  "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */

                  "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */

                  /* ----------------- */
                  "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
                  /* ----------------- */

                  "3:                                \n\t" /* <exit> */
                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
                  :
                  : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
                  );

}
784#define S32A_Blend_BlitRow32_PROC S32A_Blend_BlitRow32_arm
785
reed@android.com4e635f92009-10-19 17:39:46 +0000786/* Neon version of S32_Blend_BlitRow32()
reed@android.com522aa8d2009-10-22 20:26:53 +0000787 * portable version is in src/core/SkBlitRow_D32.cpp
reed@android.com4e635f92009-10-19 17:39:46 +0000788 */
reed@android.coma98a21e2009-10-19 18:13:18 +0000789#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* Blend 'count' 32-bit premultiplied src pixels over dst with a global
 * alpha: per channel, *dst = (src*src_scale + dst*dst_scale) >> 8, with
 * src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale
 * (see the vshrn_n_u16(..., 8) below for the >>8).
 */
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);    /* scale in [0,256] */
        uint16_t dst_scale = 256 - src_scale;           /* complementary weight */

        /* run them N at a time through the NEON unit */
        /* note that each 1 is 4 bytes, each treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift
         * we're limited at 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL 2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            /* >>8 each 16-bit lane and narrow back to one byte per channel */
            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop it's constantly calculating src+bias, dst+bias and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...
         */

#if UNROLL == 2
        /* with UNROLL == 2 at most one pixel remains; blend it in scalar code */
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        /* generic scalar tail for other unroll factors */
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}
863
864#define S32_Blend_BlitRow32_PROC S32_Blend_BlitRow32_neon
865#else
866#define S32_Blend_BlitRow32_PROC NULL
867#endif
868
869///////////////////////////////////////////////////////////////////////////////
870
reed@android.coma98a21e2009-10-19 18:13:18 +0000871#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
reed@android.com522aa8d2009-10-22 20:26:53 +0000872
873#undef DEBUG_OPAQUE_DITHER
874
875#if defined(DEBUG_OPAQUE_DITHER)
876static void showme8(char *str, void *p, int len)
877{
878 static char buf[256];
879 char tbuf[32];
880 int i;
881 char *pc = (char*) p;
882 sprintf(buf,"%8s:", str);
883 for(i=0;i<len;i++) {
884 sprintf(tbuf, " %02x", pc[i]);
885 strcat(buf, tbuf);
886 }
887 SkDebugf("%s\n", buf);
888}
889static void showme16(char *str, void *p, int len)
890{
891 static char buf[256];
892 char tbuf[32];
893 int i;
894 uint16_t *pc = (uint16_t*) p;
895 sprintf(buf,"%8s:", str);
896 len = (len / sizeof(uint16_t)); /* passed as bytes */
897 for(i=0;i<len;i++) {
898 sprintf(tbuf, " %04x", pc[i]);
899 strcat(buf, tbuf);
900 }
901 SkDebugf("%s\n", buf);
902}
903#endif
904
/* Blit 32-bit premultiplied src over a 565 dst with dithering; global
 * alpha must be 255.  (x, y) select the starting cell of the 4x4 dither
 * matrix.  Processes UNROLL (8) pixels per NEON pass, then finishes any
 * remainder with the portable per-pixel path at the bottom.
 *
 * Fix vs. previous revision: the Android-specific branch used "#if ANDROID",
 * inconsistent with the file's "#ifdef ANDROID" guard; "#if" on a possibly
 * undefined/empty macro warns under -Wundef and errors if ANDROID is
 * defined to nothing, so it is now "#if defined(ANDROID)".  Also dropped
 * the unused debug local 'noisy'.
 */
static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
    uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
#endif

    /* select the dither row for this y; x picks the starting column */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

    do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
    /* calculate 8 elements worth into a temp buffer using the portable
     * algorithm, for comparison against the NEON results below */
    {
        int my_y = y;
        int my_x = x;
        SkPMColor* my_src = (SkPMColor*)src;
        uint16_t* my_dst = dst;
        int i;

        DITHER_565_SCAN(my_y);
        for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                tdv[i] = DITHER_VALUE(my_x);
                ta[i] = a;
                tap[i] = SkAlpha255To256(a);
                td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                td[i] = d;
            } else {
                tmpbuf[i] = *my_dst;
                ta[i] = tdv[i] = td[i] = 0xbeef;
            }
            in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
        }
    }
#endif

        /* source is in ABGR */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
            );
        sr = d0; sg = d1; sb = d2; sa = d3;
        }

        /* calculate 'd', which will be 0..7 */
        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(ANDROID)
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

        /* sr = sr - (sr>>5) + d */
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d) */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

        /* blend */
#if 1
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        /* originally 255-sa + 1 */
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
        /* combine the addq and mul, save 3 insns */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
        /* known correct, but +3 insns over above */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmulq_u16(dst_b, scale8);
        dst_g = vmulq_u16(dst_g, scale8);
        dst_r = vmulq_u16(dst_r, scale8);

        /* combine */
        /* NB: vshll widens, need to preserve those bits */
        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

        /* repack to store */
        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        /* verify my 8 elements match the temp buffer */
        {
        int i, bad=0;
        static int invocation;

        for (i=0;i<UNROLL;i++)
            if (tmpbuf[i] != dst[i]) bad=1;
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                invocation, offset);
            SkDebugf(" alpha 0x%x\n", alpha);
            for (i=0;i<UNROLL;i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                    i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                    dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            /* cop out */
            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif

        dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        /* skip x += UNROLL, since it's unchanged mod-4 */
    } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals: portable per-pixel path for the final count%8 pixels */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1141
1142#define S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
1143#else
1144#define S32A_D565_Opaque_Dither_PROC NULL
1145#endif
1146
1147///////////////////////////////////////////////////////////////////////////////
1148
reed@android.comb577b412009-10-27 17:49:32 +00001149#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
1150/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
1151 * speedup untested, but ARM version is 26 insns/iteration and
1152 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element)
1153 * which is 10x the native version; that's pure instruction counts,
1154 * not accounting for any instruction or memory latencies.
1155 */
1156
1157#undef DEBUG_S32_OPAQUE_DITHER
1158
/* Dither 32-bit opaque source pixels down to a 565 destination.  The 4x4
 * dither matrix row is chosen by y, the starting column by x.  Global
 * alpha must be 255 (per-pixel alpha is asserted to be 255 in the
 * residual loop).  Processes 8 pixels per NEON pass, then finishes the
 * remainder with the portable SkDitherRGB32To565() path.
 *
 * Fix vs. previous revision: removed the unused local 'da' from the
 * "uint16x8_t dr, dg, db, da;" declaration -- it was never referenced.
 */
static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb, sa;
        uint16x8_t dr, dg, db;    /* widened, dithered channels */
        uint16x8_t dst8;

        /* source is in ABGR ordering (R == lsb) */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
            );
        /* sa is loaded by the vld4 but intentionally unused: alpha==255 */
        sr = d0; sg = d1; sb = d2; sa = d3;
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        /* sr = sr - (sr>>5) + d */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d,1));
        /* XXX: check that the "d>>1" here is hoisted */

        /* pack high bits of each into 565 format  (rgb, b is lsb) */
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

        /* store it */
        vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
        /* always good to know if we generated good results */
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            SkPMColor c = src[i];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                    c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        x += UNROLL;        /* probably superfluous */
    }
    }
#undef UNROLL

    /* residuals: portable path for the final count%8 pixels */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1255
1256#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
1257#else
1258#define S32_D565_Opaque_Dither_PROC NULL
1259#endif
1260
1261///////////////////////////////////////////////////////////////////////////////
1262
// 565-destination blitters, indexed by the SkBlitRow flag bits passed to
// PlatformProcs565() below.  Each entry is a *_PROC macro that resolves to
// an ARM/NEON implementation above, or to NULL when none is compiled in.
// NOTE(review): a NULL entry presumably tells the caller to fall back to
// the portable blitter -- confirm against SkBlitRow.h.
static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};
1276
// 4444-destination blitters, indexed by the SkBlitRow flag bits passed to
// PlatformProcs4444() below.  No ARM-optimized 4444 blitters exist in this
// file, so every entry is NULL.
static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};
1290
// 32-bit destination blitters, indexed by the SkBlitRow flag bits passed to
// PlatformProcs32() below.  Each *_PROC macro resolves to an ARM/NEON
// implementation above or to NULL when none is compiled in.
static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_PROC,       // S32_Blend,
    S32A_Opaque_BlitRow32_PROC,     // S32A_Opaque,
    S32A_Blend_BlitRow32_PROC       // S32A_Blend
};
1297
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001298SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001299 return platform_4444_procs[flags];
1300}
1301
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001302SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001303 return platform_565_procs[flags];
1304}
1305
reed@android.comf0f4e9a2009-11-13 19:00:49 +00001306SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001307 return platform_32_procs[flags];
1308}
reed@android.com229d9b32010-04-09 18:44:46 +00001309
senorblanco@chromium.org31e62302010-12-15 16:20:59 +00001310SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
1311 return NULL;
1312}
reed@google.com981d4792011-03-09 12:55:47 +00001313
1314
1315SkBlitMask::Proc SkBlitMask::PlatformProcs(SkBitmap::Config dstConfig,
1316 SkColor color)
1317{
1318 return NULL;
1319}